In [1]:
# Module imports
from typing import Any

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error

In [2]:
# Load the 2023-01 yellow-taxi trip file and show its dimensions and column names.
df = pd.read_parquet("./data/yellow_tripdata_2023-01.parquet")
print(df.shape, df.columns, sep="\n")

(3066766, 19)
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')


In [3]:
# Trip duration in minutes. The vectorized ``.dt`` accessor replaces a per-row
# Python lambda, which is needlessly slow on a frame with millions of rows.
df["duration"] = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
# np.std defaults to ddof=0, i.e. this is the population standard deviation.
print(np.std(df.duration))

42.59434429744776


In [4]:
# Fraction of trips whose duration lies between 1 and 60 minutes (both ends inclusive).
print(df.duration.between(1, 60).mean())

0.9812202822125979


In [5]:
def read_data(
    path: "str | Path",
    *,
    min_duration: float = 1,
    max_duration: float = 60,
) -> tuple[list, np.ndarray]:
    """Load one trip file and turn it into model-ready features and targets.

    Only the pickup/dropoff timestamps and the pickup/dropoff location IDs are
    read from the parquet file. The target is the trip duration in minutes;
    trips whose duration falls outside ``[min_duration, max_duration]``
    (inclusive on both ends) are dropped.

    Args:
        path: Location of the parquet file, as a string or ``Path``.
        min_duration: Shortest trip duration, in minutes, that is kept.
        max_duration: Longest trip duration, in minutes, that is kept.

    Returns:
        A tuple ``(X_dict, y)`` where ``X_dict`` is a list with one
        ``{"PULocationID": str, "DOLocationID": str}`` dict per kept trip and
        ``y`` is a NumPy array with the matching durations in minutes.
    """
    df = pd.read_parquet(
        path,
        columns=["tpep_pickup_datetime", "tpep_dropoff_datetime", "PULocationID", "DOLocationID"],
    )

    # process target
    duration = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
    keep = (duration >= min_duration) & (duration <= max_duration)

    # process feature
    # Build a new frame from the selection instead of assigning columns on a
    # filtered slice, which triggers SettingWithCopyWarning on older pandas.
    # The IDs are cast to strings before being exported as feature dicts.
    features = df.loc[keep, ["PULocationID", "DOLocationID"]].astype(str)

    y = duration[keep].to_numpy()
    X_dict = features.to_dict(orient="records")

    return X_dict, y

def forward(
        model: Any,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray,
        y_val: np.ndarray,
) -> None:
    """Fit ``model`` on the training split and print RMSE for both splits.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        X_train: Training feature matrix.
        y_train: Training targets.
        X_val: Validation feature matrix.
        y_val: Validation targets.

    Returns:
        None. The two RMSE values are written to stdout only.
    """
    model.fit(X_train, y_train)

    # Predict on both splits (train first, then validation) before reporting.
    train_pred, val_pred = (model.predict(split) for split in (X_train, X_val))

    train_rmse = root_mean_squared_error(y_train, train_pred)
    print("Prediction RMSE on train:", train_rmse)

    val_rmse = root_mean_squared_error(y_val, val_pred)
    print("Prediction RMSE on val:", val_rmse)

In [6]:
# Read the training (2023-01) and validation (2023-02) files as feature dicts plus targets.
X_train, y_train = read_data("./data/yellow_tripdata_2023-01.parquet")
X_val, y_val = read_data("./data/yellow_tripdata_2023-02.parquet")

# Learn the feature mapping from the training records only, then apply the
# same fitted vectorizer to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(X_train)
X_val = dv.transform(X_val)

In [7]:
# Fit a linear regression on the vectorized features and print RMSE for both splits.
model = LinearRegression()
forward(model, X_train, y_train, X_val, y_val)

Prediction RMSE on train: 7.649262060509152
Prediction RMSE on val: 7.811819098973374
