In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

import pickle

# Q1

In [2]:
# Read the two monthly trip files (file names indicate 2021-01 and 2021-02);
# the first is used as the training frame, the second as the validation frame.
train_path = '../../dataset/fhv_tripdata_2021-01.parquet'
val_path = '../../dataset/fhv_tripdata_2021-02.parquet'

df_train = pd.read_parquet(train_path)
df_val = pd.read_parquet(val_path)

In [3]:
# Row counts of the training and validation frames, in that order.
df_train.shape[0], df_val.shape[0]

(1154112, 1037692)

# Q2

In [4]:
# Preview the first five rows of the training frame.
df_train.head(n=5)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00009,2021-01-01 00:27:00,2021-01-01 00:44:00,,,,B00009
1,B00009,2021-01-01 00:50:00,2021-01-01 01:07:00,,,,B00009
2,B00013,2021-01-01 00:01:00,2021-01-01 01:51:00,,,,B00013
3,B00037,2021-01-01 00:13:09,2021-01-01 00:21:26,,72.0,,B00037
4,B00037,2021-01-01 00:38:31,2021-01-01 00:53:44,,61.0,,B00037


In [5]:
# Trip duration in minutes: drop-off minus pick-up timestamp, converted with the
# vectorized .dt accessor instead of a per-row Python lambda (the frame has
# more than a million rows, so the row-wise apply is needlessly slow).
df_train['duration'] = (
    (df_train.dropOff_datetime - df_train.pickup_datetime).dt.total_seconds() / 60
)

In [6]:
# Average trip duration (minutes) over the unfiltered training frame.
df_train['duration'].mean()

19.167224093791006

# Data preparation: records dropped outside the 1–60 minute duration range

In [7]:
# Remember the row count before the duration filter so the number of
# dropped rows can be derived afterwards.
orig_len = df_train.shape[0]
orig_len

1154112

In [8]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
# .copy() makes the filtered frame independent of the original one, so later
# column assignments on df_train do not raise SettingWithCopyWarning.
duration_in_range = (df_train.duration >= 1) & (df_train.duration <= 60)
df_train = df_train[duration_in_range].copy()
len(df_train)

1109826

In [9]:
# Rows removed by the duration filter: count before filtering minus count now.
rows_remaining = df_train.shape[0]
no_of_dropped = orig_len - rows_remaining
no_of_dropped

44286

# Q3

In [10]:
# Inspect the column dtypes of the (duration-filtered) training frame.
df_train.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                           object
Affiliated_base_number            object
duration                         float64
dtype: object

In [11]:
# Distinct pickup location IDs present in the training frame (NaN included).
df_train['PUlocationID'].unique()

array([ nan, 236., 196., 252.,  15., 165.,  70., 173., 260., 143., 115.,
       221., 206., 181., 189., 166., 129., 142., 211.,  72.,  56., 226.,
        92.,  73.,  51.,  55., 123., 150., 210.,  21., 245., 223.,   7.,
        17., 132., 233., 135., 121.,  29.,  35., 106.,  40.,  66., 256.,
        91., 179., 192., 171.,  11., 149., 214.,   6., 156., 235., 255.,
        82.,  16.,  22.,  62., 249.,  53., 108., 205., 187., 140.,  57.,
       230.,  71., 257., 228.,  83., 155., 213., 159., 175., 177.,  25.,
       225., 216., 243., 167., 193., 137., 119.,  50.,  69.,  89.,  61.,
       162.,  81., 204.,  10.,  36., 229., 247.,  76.,  39.,  85., 138.,
       238., 126., 231.,  23.,   9., 188., 109., 241., 118., 265., 237.,
        49., 141., 131.,  44.,   5.,  43.,  67.,  74., 264.,   1.,  28.,
        98., 176., 227., 191., 178.,  38.,  77., 107.,  95., 145.,  84.,
       101., 215., 201.,  65., 222., 232.,  26., 251., 172.,  54., 224.,
       125., 180., 212., 117.,   3.,  75., 100.,  7

In [12]:
# Number of missing values in each column of the training frame.
df_train.isnull().sum(axis=0)

dispatching_base_num            0
pickup_datetime                 0
dropOff_datetime                0
PUlocationID               927008
DOlocationID               147907
SR_Flag                   1109826
Affiliated_base_number        773
duration                        0
dtype: int64

In [13]:
# Number of rows whose pickup location ID is missing.
df_train['PUlocationID'].isnull().sum()

927008

In [14]:
# Share of rows with a missing pickup location ID. Despite the name, `frac`
# is expressed as a percentage (0-100), not as a 0-1 fraction.
n_missing_pu = df_train['PUlocationID'].isnull().sum()
frac = n_missing_pu / len(df_train) * 100
frac

83.52732770722618

# Q4

In [15]:
categorical = ['PUlocationID', 'DOlocationID']

# df_train is a filtered slice of the loaded frame; writing columns into it
# in place raises SettingWithCopyWarning. Take an explicit copy first so the
# assignment below targets an independent frame.
df_train = df_train.copy()

# Cast the location IDs to strings so DictVectorizer treats them as categories
# (one indicator column per distinct value) rather than as numeric features.
# Missing IDs become the string 'nan' and therefore form their own category.
df_train[categorical] = df_train[categorical].astype(str)

dv = DictVectorizer()

# One dict per row, e.g. {'PUlocationID': ..., 'DOlocationID': ...}.
train_dicts = df_train[categorical].to_dict(orient='records')
# fit_transform learns the feature vocabulary from the training rows.
X_train = dv.fit_transform(train_dicts)

In [16]:
# (rows, encoded feature columns) of the matrix returned by dv.fit_transform.
X_train.shape

(1109826, 525)

In [17]:
# Number of feature names learned by the fitted DictVectorizer.
n_features = len(dv.get_feature_names_out())
n_features

525

# Q5

In [18]:
# Regression target: the trip duration column, extracted as a NumPy array.
target = 'duration'
y_train = df_train.loc[:, target].values

In [19]:
# Fit an ordinary least-squares model on the encoded location features.
# The fit call stays as the cell's last expression so the estimator is displayed.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [20]:
# In-sample predictions: score the model on the same rows it was fitted on.
y_pred = lr.predict(X=X_train)

In [21]:
# Training RMSE. The `squared=False` argument was deprecated in scikit-learn 1.4
# and removed in 1.6, so take the square root of the MSE explicitly; this form
# works on every scikit-learn version.
mean_squared_error(y_train, y_pred) ** 0.5

10.528519427219633

# Q6

In [23]:
# Validation pass: prepare df_val exactly like the training frame, then score
# the already-fitted model on it. `categorical` and `target` are reused from
# the training cells rather than redefined, so both splits stay in sync.

# Trip duration in minutes, computed with the vectorized .dt accessor.
df_val['duration'] = (
    (df_val.dropOff_datetime - df_val.pickup_datetime).dt.total_seconds() / 60
)

# Keep trips lasting 1-60 minutes (inclusive). .copy() makes the filtered frame
# independent, so the column assignment below no longer raises
# SettingWithCopyWarning.
val_in_range = (df_val.duration >= 1) & (df_val.duration <= 60)
df_val = df_val[val_in_range].copy()

# Same string cast as for training so the values match the fitted vocabulary.
df_val[categorical] = df_val[categorical].astype(str)

val_dicts = df_val[categorical].to_dict(orient='records')
# transform (not fit_transform): reuse the vocabulary learned on training data.
X_val = dv.transform(val_dicts)

y_val = df_val[target].values

y_pred_val = lr.predict(X_val)
# Validation RMSE. `squared=False` was deprecated in scikit-learn 1.4 and
# removed in 1.6, so the square root of the MSE is taken explicitly.
mean_squared_error(y_val, y_pred_val) ** 0.5

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_val[categorical] = df_val[categorical].astype(str)


11.01428685575068