In [1]:
!python -V

Python 3.12.3


In [3]:
import pandas as pd

In [91]:
import pickle
from pathlib import Path

In [53]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

In [78]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error, root_mean_squared_error

In [64]:
def read_df(filename: str) -> pd.DataFrame:
    """Load a trip-record parquet file and prepare it for duration modelling.

    The returned frame has an extra ``duration`` column (drop-off time minus
    pickup time, in minutes), contains only trips lasting between 1 and 60
    minutes inclusive, and has ``PULocationID`` / ``DOLocationID`` cast to
    strings.

    Parameters
    ----------
    filename : str
        Path of the parquet file; passed unchanged to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        An independent copy of the filtered rows. Index labels of the
        surviving rows are kept as they were in the file (no reset).
    """
    df = pd.read_parquet(filename)

    # get duration
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    # trip_time is a timedelta column; convert it to minutes with the
    # vectorised accessor rather than a per-row Python lambda
    df['duration'] = trip_time.dt.total_seconds() / 60

    # filter down to b/t 1 minute and 60 minutes (original note: >98% of data).
    # The explicit copy makes the result a standalone frame, so the column
    # assignments below (and in later cells) do not write into a slice of the
    # unfiltered data and do not trigger SettingWithCopyWarning.
    df = df.loc[df['duration'].between(1, 60)].copy()

    # build features: location IDs are treated as categories, so store them
    # as strings
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [65]:
# The 2024-01 file becomes the training frame and the 2024-02 file the
# validation frame; both go through the same read_df preparation.
df_train, df_val = map(
    read_df,
    (
        'data/green_tripdata_2024-01.parquet',
        'data/green_tripdata_2024-02.parquet',
    ),
)

In [66]:
# Peek at the first two prepared training rows (note the added `duration` column).
df_train.head(n=2)

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2024-01-01 00:46:55,2024-01-01 00:58:25,N,1.0,236,239,1.0,1.98,12.8,...,0.5,3.61,0.0,,1.0,21.66,1.0,1.0,2.75,11.5
1,2,2024-01-01 00:31:42,2024-01-01 00:52:34,N,1.0,65,170,5.0,6.54,30.3,...,0.5,7.11,0.0,,1.0,42.66,1.0,1.0,2.75,20.866667


In [86]:
# Combine pickup and drop-off location IDs (already strings after read_df)
# into one "<pickup>_<dropoff>" feature, for both frames.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [87]:
# Model inputs: the combined PU_DO pair as the only categorical feature
# (used in place of separate PULocationID / DOLocationID columns) plus the
# trip distance as the only numerical one.
categorical = ['PU_DO']
numerical = ['trip_distance']
dv = DictVectorizer()

# One {column: value} record per trip, for each frame.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# The vectorizer is fitted on the training records only and then applied
# unchanged to the validation records.
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [69]:
# Regression target: the `duration` column, pulled out as plain arrays.
target = 'duration'
y_train, y_val = (frame[target].to_numpy() for frame in (df_train, df_val))

In [92]:
# Baseline: plain linear regression fitted on the training matrix and
# scored on the validation matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE. Uses root_mean_squared_error, like the Lasso and Ridge
# cells, instead of mean_squared_error(..., squared=False): the `squared`
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
root_mean_squared_error(y_val, y_pred)



5.995211090245985

In [89]:
# Lasso with a small penalty (alpha=0.005), scored on the validation matrix.
# Note that this rebinds the shared name `lr`.
lr = Lasso(alpha=0.005).fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE for the Lasso fit.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.573247282099097

In [90]:
# Ridge with the same small penalty (alpha=0.005), scored on the validation
# matrix. Note that this rebinds the shared name `lr`.
lr = Ridge(alpha=0.005).fit(X_train, y_train)
y_pred = lr.predict(X_val)

# Validation RMSE for the Ridge fit.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

5.995273426641886

In [94]:
# Persist the fitted vectorizer together with the model so the same feature
# encoding can be reapplied when the model is loaded.
# NOTE(review): `lr` is whichever estimator was most recently assigned to
# that name in this kernel -- confirm it is the model the file name implies
# before relying on the saved artifact.
model_path = Path('models/linreg_model.bin')
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError on open().
model_path.parent.mkdir(parents=True, exist_ok=True)
with open(model_path, 'wb') as f_out:
    pickle.dump((dv, lr), f_out)