In [1]:
# Shell escape: print the version of the `python` executable found on the shell PATH
!python -V

Python 3.9.7


In [2]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import numpy as np

from sklearn.metrics import mean_squared_error

## Read Input Data

In [3]:
# Load the January 2021 trip records and preview the first five rows
df_january = pd.read_parquet(path="./data/fhv_tripdata_2021-01.parquet")
df_january.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [4]:
# (rows, columns) of the January frame as loaded, before any filtering
df_january.shape

(1154112, 7)

In [5]:
# Trip duration in minutes. Subtracting the two datetime columns yields a timedelta
# Series; the vectorised `.dt.total_seconds()` accessor converts it in a single pass
# instead of calling a Python lambda once per row.
df_january["duration"] = (
    df_january["dropOff_datetime"] - df_january["pickup_datetime"]
).dt.total_seconds() / 60
df_january.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.0
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667


In [6]:
# mean trip duration in minutes over all January trips (outliers not yet removed)
df_january["duration"].mean()

19.167224093791006

In [26]:
# keep only trips that lasted between 1 and 60 minutes (both bounds inclusive)
plausible_duration = df_january["duration"].between(1, 60)
df_jan_dropped = df_january[plausible_duration]

# how many rows the filter removed
print(len(df_january) - len(df_jan_dropped))

44286


In [27]:
# inspect per-column non-null counts to gauge how much of each column is missing
df_jan_dropped.info() # PUlocationID is ~83% missing (182818 of 1109826 rows are non-null)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1109826 entries, 0 to 1154111
Data columns (total 8 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   dispatching_base_num    1109826 non-null  object        
 1   pickup_datetime         1109826 non-null  datetime64[ns]
 2   dropOff_datetime        1109826 non-null  datetime64[ns]
 3   PUlocationID            182818 non-null   float64       
 4   DOlocationID            961919 non-null   float64       
 5   SR_Flag                 0 non-null        object        
 6   Affiliated_base_number  1109053 non-null  object        
 7   duration                1109826 non-null  float64       
dtypes: datetime64[ns](2), float64(3), object(3)
memory usage: 76.2+ MB


In [28]:
# NaN pickup/drop-off location IDs become the sentinel -1; other columns are left as-is
df_jan_dropped = df_jan_dropped.fillna(value={"PUlocationID": -1, "DOlocationID": -1})
df_jan_dropped.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,-1.0,-1.0,,B00009,17.0
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,-1.0,-1.0,,B00009,17.0
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,-1.0,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,-1.0,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,-1.0,71.0,,B00037,9.05


In [29]:
# Cast both location IDs to str so that DictVectorizer encodes each distinct ID as its
# own indicator column (string values are one-hot encoded) rather than as a number.
df_jan_dropped = df_jan_dropped.astype(dict.fromkeys(["PUlocationID", "DOlocationID"], str))

In [30]:
# Features: one dict per trip holding only the two location IDs
train_dicts = df_jan_dropped.loc[:, ["PUlocationID", "DOlocationID"]].to_dict(orient="records")

# Fit the vectorizer on the training month; the fitted `dv` is what later encodes
# the validation month, so the feature columns line up.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Regression target
target = "duration"
y_train = df_jan_dropped[target].to_numpy()

In [31]:
# (n_trips, n_encoded_features) of the training matrix produced by the DictVectorizer
X_train.shape

(1109826, 525)

## Train Model

In [33]:

# Fit an ordinary least-squares model on the encoded pickup/drop-off locations.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE on the training data itself, in the same unit as `duration` (minutes).
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the square root is taken explicitly; this form runs on every
# scikit-learn version.
np.sqrt(mean_squared_error(y_train, y_pred))

10.528519429398262

## Evaluate Model

In [42]:
# Load the February 2021 trip records (used for validation) and preview five rows
df_feb = pd.read_parquet(path="./data/fhv_tripdata_2021-02.parquet")
df_feb.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037


In [43]:
# Trip duration in minutes, computed exactly as for the January data. The vectorised
# `.dt.total_seconds()` accessor converts the timedelta Series in a single pass
# instead of calling a Python lambda once per row.
df_feb["duration"] = (
    df_feb["dropOff_datetime"] - df_feb["pickup_datetime"]
).dt.total_seconds() / 60
df_feb.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,duration
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014,92.0
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.95
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037,13.8


In [44]:
# keep only trips that lasted between 1 and 60 minutes (both bounds inclusive),
# the same window applied to the training month
df_feb = df_feb[df_feb["duration"].between(1, 60)]

In [45]:
# NaN pickup/drop-off location IDs become the sentinel -1, matching the training data
df_feb = df_feb.fillna(value={"PUlocationID": -1, "DOlocationID": -1})

In [46]:
# Cast both location IDs to str so they are encoded the same way as the training features
df_feb = df_feb.astype(dict.fromkeys(["PUlocationID", "DOlocationID"], str))

In [47]:
# Features: one dict per February trip holding only the two location IDs
val_dicts = df_feb.loc[:, ["PUlocationID", "DOlocationID"]].to_dict(orient="records")

# Encode with the vectorizer fitted on the training set (transform only, no re-fit),
# so validation columns match the ones the model was trained on.
X_val = dv.transform(val_dicts)

# Regression target
target = "duration"
y_val = df_feb[target].to_numpy()

In [48]:
# Predict February durations with the model trained on January.
y_val_pred = lr.predict(X_val)

# Validation RMSE in minutes. `mean_squared_error(..., squared=False)` was deprecated
# in scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly;
# this form runs on every scikit-learn version.
np.sqrt(mean_squared_error(y_val, y_val_pred))

11.01428571667195