In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

In [2]:
def inspect_data(filename):
    """Summarise a raw trip-record parquet file without filtering it.

    Parameters
    ----------
    filename : path of the parquet file, passed to ``pd.read_parquet``.
        The file must contain the columns ``pickup_datetime``,
        ``dropOff_datetime`` and ``PUlocationID``.

    Returns
    -------
    dict with three entries:
        ``num_records``              number of rows in the file,
        ``average_duration``         mean of (drop-off - pickup) in minutes,
        ``missing_pickup_location``  fraction of rows whose ``PUlocationID`` is null.
    """
    trips = pd.read_parquet(filename)

    trip_lengths = trips["dropOff_datetime"] - trips["pickup_datetime"]
    # The mean of a timedelta column is a Timedelta; convert seconds to minutes.
    mean_minutes = trip_lengths.mean().total_seconds() / 60
    missing_share = trips["PUlocationID"].isna().mean()

    return {
        "num_records": len(trips),
        "average_duration": mean_minutes,
        "missing_pickup_location": missing_share,
    }

In [3]:
def read_dataframe(filename, min_minutes=1, max_minutes=60):
    """Load a trip-record parquet file and keep trips of plausible length.

    A ``duration`` column (drop-off minus pickup, in minutes) is added, and
    only rows with ``min_minutes <= duration <= max_minutes`` are kept.

    Parameters
    ----------
    filename : path of the parquet file, passed to ``pd.read_parquet``.
        The file must contain ``pickup_datetime`` and ``dropOff_datetime``.
    min_minutes : inclusive lower bound on the trip duration (default 1).
    max_minutes : inclusive upper bound on the trip duration (default 60).

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows, with the original index
        labels preserved.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion. Unlike a row-wise
    # ``.apply``, it also yields a float column when the frame is empty.
    df["duration"] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60

    in_range = (df.duration >= min_minutes) & (df.duration <= max_minutes)

    # Return a real copy so callers can assign columns on the result without
    # pandas flagging a write to a slice of the unfiltered frame.
    return df[in_range].copy()

In [4]:
# Summary statistics of the unfiltered January 2021 file.
inspect_data(filename="./data/fhv_tripdata_2021-01.parquet")

{'num_records': 1154112,
 'average_duration': 19.167224083333334,
 'missing_pickup_location': 0.8303067639882438}

In [5]:
# January 2021 is the training month, February 2021 the validation month.
df_train, df_val = map(
    read_dataframe,
    (
        "./data/fhv_tripdata_2021-01.parquet",
        "./data/fhv_tripdata_2021-02.parquet",
    ),
)

In [6]:
categorical = ["PUlocationID", "DOlocationID"]
numerical = []
feature_columns = categorical + numerical

# Cast the location IDs to strings in both frames so DictVectorizer treats
# them as categories rather than numbers (missing IDs become the string "nan").
for frame in (df_train, df_val):
    frame[categorical] = frame[categorical].astype(str)

train_dicts = df_train[feature_columns].to_dict("records")
val_dicts = df_val[feature_columns].to_dict("records")

# The vocabulary is fitted on the training month only and reused for validation.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [7]:
# Number of columns in the encoded training matrix.
n_features = X_train.shape[1]
print("Dimensionality: ", n_features)

Dimensionality:  525


In [8]:
target = "duration"
# Pull the target column out of each split as a plain array.
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [9]:
# Fit a plain linear regression on the one-hot encoded location features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
# Take the square root of the MSE explicitly: the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on
# both old and new versions.
rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
print("RMSE of the model on the training data: ", rmse_train)

RMSE of the model on the training data:  10.528519107206014


In [10]:
# Score the already-fitted model on the held-out validation month.
y_pred_val = lr.predict(X_val)
# Take the square root of the MSE explicitly: the `squared=False` shortcut was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this form runs on
# both old and new versions.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print("RMSE of the model on the validation data: ", rmse_val)

RMSE of the model on the validation data:  11.014283149148788
