https://github.com/DataTalksClub/mlops-zoomcamp/blob/main/01-intro/homework.md

In [33]:
import pandas as pd
import numpy as np
import datetime
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [200]:
df=pd.read_parquet('fhv_tripdata_2021-01.parquet', engine="pyarrow")
df_val=pd.read_parquet('fhv_tripdata_2021-02.parquet', engine="pyarrow")

# Q1. Downloading the data

How many records are there?

The df dataframe shows that there are 1154112 rows in the dataset.

In [201]:
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037
...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285


In [202]:
df_val

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037
...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,,31.0,,B01717
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,,169.0,,B01717
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28.0,171.0,,B03285
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16.0,252.0,,B03285


# Q2. Computing Duration

What is the average trip duration in January?

By calculating the difference between dropoff time and pickup time, a dataetime object indicating the trip duration for each ride is found. Then, the minutes can be extracted. The result is 18.61 minutes in average.

In [203]:
duration=df.dropOff_datetime-df.pickup_datetime
trips_in_minutes=duration.apply(lambda x: x.seconds/60)

duration_val=df_val.dropOff_datetime-df_val.pickup_datetime
trips_in_minutes_val=duration_val.apply(lambda x: x.seconds/60)

# Data preparation

In [204]:
df=pd.concat([df,trips_in_minutes],axis=1)
df=df.rename(columns={0:"DurationInMinutes"})

df_val=pd.concat([df_val,trips_in_minutes_val],axis=1)
df_val=df_val.rename(columns={0:"DurationInMinutes"})

In [205]:
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,DurationInMinutes
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.000000
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.000000
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013,110.000000
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,8.750000
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,57.600000
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,16.200000
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,19.433333


In [206]:
df_val

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,DurationInMinutes
0,B00013,2021-02-01 00:01:00,2021-02-01 01:33:00,,,,B00014,92.000000
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.950000
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037,13.800000
...,...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,,31.0,,B01717,13.533333
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,,169.0,,B01717,11.466667
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28.0,171.0,,B03285,25.383333
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16.0,252.0,,B03285,18.050000


Remove outliers as per the given rules

In [207]:
initial_rows=df.shape[0]
df=df[df.DurationInMinutes>=1]
df=df[df.DurationInMinutes<=60]
after_removal=df.shape[0]
print(f"{initial_rows-after_removal} rows were dropped.")


initial_rows_val=df_val.shape[0]
df_val=df_val[df_val.DurationInMinutes>=1]
df_val=df_val[df_val.DurationInMinutes<=60]
after_removal_val=df_val.shape[0]
# print(f"{initial_rows_val-after_removal_val} rows were dropped.")

44259 rows were dropped.


In [208]:
df

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,DurationInMinutes
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009,17.000000
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009,17.000000
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037,8.283333
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037,15.216667
5,B00037,2021-01-01 00:59:02,2021-01-01 01:08:05,,71.0,,B00037,9.050000
...,...,...,...,...,...,...,...,...
1154107,B03266,2021-01-31 23:43:03,2021-01-31 23:51:48,7.0,7.0,,B03266,8.750000
1154108,B03284,2021-01-31 23:50:27,2021-02-01 00:48:03,44.0,91.0,,,57.600000
1154109,B03285,2021-01-31 23:13:46,2021-01-31 23:29:58,171.0,171.0,,B03285,16.200000
1154110,B03285,2021-01-31 23:58:03,2021-02-01 00:17:29,15.0,15.0,,B03285,19.433333


In [209]:
df_val

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,DurationInMinutes
1,B00021,2021-02-01 00:55:40,2021-02-01 01:06:20,173.0,82.0,,B00021,10.666667
2,B00021,2021-02-01 00:14:03,2021-02-01 00:28:37,173.0,56.0,,B00021,14.566667
3,B00021,2021-02-01 00:27:48,2021-02-01 00:35:45,82.0,129.0,,B00021,7.950000
4,B00037,2021-02-01 00:12:50,2021-02-01 00:26:38,,225.0,,B00037,13.800000
5,B00037,2021-02-01 00:00:37,2021-02-01 00:09:35,,61.0,,B00037,8.966667
...,...,...,...,...,...,...,...,...
1037687,B03282,2021-02-28 23:01:16,2021-02-28 23:14:48,,31.0,,B01717,13.533333
1037688,B03282,2021-02-28 23:36:10,2021-02-28 23:47:38,,169.0,,B01717,11.466667
1037689,B03285,2021-02-28 23:18:36,2021-02-28 23:43:59,28.0,171.0,,B03285,25.383333
1037690,B03285,2021-02-28 23:26:34,2021-02-28 23:44:37,16.0,252.0,,B03285,18.050000


In [210]:
df.isnull().sum()

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927033
DOlocationID               147927
SR_Flag                   1109853
Affiliated_base_number        773
DurationInMinutes               0
dtype: int64

In [211]:
df=df.replace(np.NaN,-1)
df_val=df_val.replace(np.NaN,-1)

In [212]:
X=df[["PUlocationID","DOlocationID"]].astype(str)
Y=df["DurationInMinutes"].values

X_val=df_val[["PUlocationID","DOlocationID"]].astype(str)
Y_val=df_val["DurationInMinutes"].values

In [213]:
X

Unnamed: 0,PUlocationID,DOlocationID
0,-1.0,-1.0
1,-1.0,-1.0
3,-1.0,72.0
4,-1.0,61.0
5,-1.0,71.0
...,...,...
1154107,7.0,7.0
1154108,44.0,91.0
1154109,171.0,171.0
1154110,15.0,15.0


In [214]:
X_val

Unnamed: 0,PUlocationID,DOlocationID
1,173.0,82.0
2,173.0,56.0
3,82.0,129.0
4,-1.0,225.0
5,-1.0,61.0
...,...,...
1037687,-1.0,31.0
1037688,-1.0,169.0
1037689,28.0,171.0
1037690,16.0,252.0


In [215]:
X=X.to_dict(orient='records')
X_val=X_val.to_dict(orient='records')


In [216]:
X

[{'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '71.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '91.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '37.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '39.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '89.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '177.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '63.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '67.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '22.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '14.0'},
 {'PUlocationID': '-1.0', 'DO

In [217]:
X_val

[{'PUlocationID': '173.0', 'DOlocationID': '82.0'},
 {'PUlocationID': '173.0', 'DOlocationID': '56.0'},
 {'PUlocationID': '82.0', 'DOlocationID': '129.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '225.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '61.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '26.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '72.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '169.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '161.0'},
 {'PUlocationID': '13.0', 'DOlocationID': '182.0'},
 {'PUlocationID': '152.0', 'DOlocationID': '244.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '-1.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '265.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '237.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '248.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '248.0'},
 {'PUlocationID': '-1.0', 'DOlocationID': '159.0'},
 {'PUlocationID':

In [218]:
dict_vectorizer = DictVectorizer()
X_dict=dict_vectorizer.fit_transform(X)
X_dict_val=dict_vectorizer.transform(X_val)

You should use transform instead of fit_transform for validation dataset.

It is seen that there are 525 features in total.

# Q5. Train performance

In [219]:
lr = LinearRegression()
lr.fit(X_dict, Y)

y_pred = lr.predict(X_dict)

mean_squared_error(Y, y_pred, squared=False)

10.528643544992311

# Q6. Evaluation

In [221]:
lr = LinearRegression()
lr.fit(X_dict, Y)

y_pred_val = lr.predict(X_dict_val)

mean_squared_error(Y_val, y_pred_val, squared=False)

11.01459286499178