In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Directory (relative to the notebook) that holds the parquet inputs.
DATA = Path("data")

# One file per split: 2024-01 for training, 2024-02 for validation.
train_path = DATA.joinpath("green_tripdata_2024-01.parquet")
validation_path = DATA.joinpath("green_tripdata_2024-02.parquet")

In [17]:
def load_dataset(path: Path) -> tuple[pd.DataFrame, pd.Series]:
    """Read one parquet file of trips and split it into features and target.

    Args:
        path: Location of the parquet file to read.

    Returns:
        A ``(features, target)`` pair. ``features`` holds the columns
        ``pickup_location`` and ``dropoff_location`` (``PULocationID`` and
        ``DOLocationID`` cast to the pandas ``string`` dtype) followed by
        ``trip_distance``. ``target`` is the ``duration`` column: drop-off
        time minus pick-up time, expressed in minutes.
    """
    feature_columns = ["pickup_location", "dropoff_location", "trip_distance"]

    trips = pd.read_parquet(path)
    elapsed = trips["lpep_dropoff_datetime"] - trips["lpep_pickup_datetime"]
    trips = trips.assign(
        duration=elapsed.dt.total_seconds() / 60,  # seconds -> minutes
        # Location IDs are cast to strings so they are treated as categories.
        pickup_location=trips["PULocationID"].astype("string"),
        dropoff_location=trips["DOLocationID"].astype("string"),
    )

    return trips[feature_columns], trips["duration"]

In [18]:
# Build the (features, target) pair for each split from its parquet file.
train_x, train_y = load_dataset(path=train_path)
val_x, val_y = load_dataset(path=validation_path)

In [19]:
from sklearn.feature_extraction import DictVectorizer

In [20]:
# Preview the first ten training rows as a list of per-row dicts,
# i.e. the input format DictVectorizer consumes.
train_x.head(10).to_dict(orient="records")

[{'pickup_location': '236', 'dropoff_location': '239', 'trip_distance': 1.98},
 {'pickup_location': '65', 'dropoff_location': '170', 'trip_distance': 6.54},
 {'pickup_location': '74', 'dropoff_location': '262', 'trip_distance': 3.08},
 {'pickup_location': '74', 'dropoff_location': '116', 'trip_distance': 2.4},
 {'pickup_location': '74', 'dropoff_location': '243', 'trip_distance': 5.14},
 {'pickup_location': '33', 'dropoff_location': '209', 'trip_distance': 2.0},
 {'pickup_location': '74', 'dropoff_location': '238', 'trip_distance': 3.2},
 {'pickup_location': '166', 'dropoff_location': '239', 'trip_distance': 2.01},
 {'pickup_location': '226', 'dropoff_location': '226', 'trip_distance': 0.31},
 {'pickup_location': '7', 'dropoff_location': '129', 'trip_distance': 2.32}]

In [21]:
# Fit the vectorizer on the first ten training rows only and encode them.
# String-valued features become "name=value" indicator columns; numeric
# features keep a single column.
dv = DictVectorizer()
x = dv.fit_transform(train_x.head(10).to_dict(orient="records"))

In [23]:
# Mapping from each generated feature name to its column index in the encoded matrix.
dv.vocabulary_

{'pickup_location=236': 11,
 'dropoff_location=239': 6,
 'trip_distance': 16,
 'pickup_location=65': 13,
 'dropoff_location=170': 2,
 'pickup_location=74': 15,
 'dropoff_location=262': 8,
 'dropoff_location=116': 0,
 'dropoff_location=243': 7,
 'pickup_location=33': 12,
 'dropoff_location=209': 3,
 'dropoff_location=238': 5,
 'pickup_location=166': 9,
 'pickup_location=226': 10,
 'dropoff_location=226': 4,
 'pickup_location=7': 14,
 'dropoff_location=129': 1}

In [24]:
# Display the encoded sparse matrix; its repr reports dtype, stored elements and shape.
x

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 30 stored elements and shape (10, 17)>