In [1]:
!which python

/Users/mmukherjee/Documents/LearningAndDevelopment/DataTalksClub/mlops-zoomcamp/01-intro/mlops_venv/bin/python


# 01-Intro Homework
NYK: Taxi data: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [3]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

### 01. Read the parquet file

In [4]:
# Load 2022-Jan data from file: yellow_tripdata_2022-01.parquet

In [5]:
df = pd.read_parquet('./data/yellow_tripdata_2022-01.parquet')
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [6]:
df.shape

(2463931, 19)

### 02. Find Duration

In [17]:
df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime

In [15]:
type(df['duration'])

pandas.core.series.Series

In [18]:
df['duration']

0         0 days 00:17:49
1         0 days 00:08:24
2         0 days 00:08:58
3         0 days 00:10:02
4         0 days 00:37:32
                ...      
2463926   0 days 00:05:58
2463927   0 days 00:10:39
2463928   0 days 00:11:00
2463929   0 days 00:12:03
2463930   0 days 00:27:00
Name: duration, Length: 2463931, dtype: timedelta64[ns]

In [19]:
df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)

In [20]:
df['duration']

0          17.816667
1           8.400000
2           8.966667
3          10.033333
4          37.533333
             ...    
2463926     5.966667
2463927    10.650000
2463928    11.000000
2463929    12.050000
2463930    27.000000
Name: duration, Length: 2463931, dtype: float64

In [22]:
# Find the standard deviation of the "duration" column
round(df['duration'].std(),2)

46.45

### 03. Remove outliers from the "duration" column

In [25]:
df = df[(df.duration >= 1) & (df.duration <=60)]
print(f"New shape of df: {df.shape}")

New shape of df: (2421440, 20)


 - Note: Previous shape of df was: (2463931, 19)

In [31]:
print(f"Percentage of records left in df after removing outliers: {round(df.shape[0]*100/2463931,2)}%")

Percentage of records left in df after removing outliers: 98.28%


### 04. One-hot encoding

In [32]:
cat_vars = ["PULocationID", "DOLocationID"]

In [35]:
df[cat_vars].astype('str').dtypes

PULocationID    object
DOLocationID    object
dtype: object

In [36]:
df[cat_vars] = df[cat_vars].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [37]:
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [47]:
train_dicts = df[cat_vars].to_dict(orient="records")

In [48]:
len(train_dicts)

2421440

In [43]:
dv = DictVectorizer()

In [49]:
#Model's feature metrics 
X_train = dv.fit_transform(train_dicts)

In [51]:
X_train
"""
<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
with 4842880 stored elements in Compressed Sparse Row format>
"""

<2421440x515 sparse matrix of type '<class 'numpy.float64'>'
	with 4842880 stored elements in Compressed Sparse Row format>

In [52]:
dv.feature_names_

['DOLocationID=1',
 'DOLocationID=10',
 'DOLocationID=100',
 'DOLocationID=101',
 'DOLocationID=102',
 'DOLocationID=105',
 'DOLocationID=106',
 'DOLocationID=107',
 'DOLocationID=108',
 'DOLocationID=109',
 'DOLocationID=11',
 'DOLocationID=111',
 'DOLocationID=112',
 'DOLocationID=113',
 'DOLocationID=114',
 'DOLocationID=115',
 'DOLocationID=116',
 'DOLocationID=117',
 'DOLocationID=118',
 'DOLocationID=119',
 'DOLocationID=12',
 'DOLocationID=120',
 'DOLocationID=121',
 'DOLocationID=122',
 'DOLocationID=123',
 'DOLocationID=124',
 'DOLocationID=125',
 'DOLocationID=126',
 'DOLocationID=127',
 'DOLocationID=128',
 'DOLocationID=129',
 'DOLocationID=13',
 'DOLocationID=130',
 'DOLocationID=131',
 'DOLocationID=132',
 'DOLocationID=133',
 'DOLocationID=134',
 'DOLocationID=135',
 'DOLocationID=136',
 'DOLocationID=137',
 'DOLocationID=138',
 'DOLocationID=139',
 'DOLocationID=14',
 'DOLocationID=140',
 'DOLocationID=141',
 'DOLocationID=142',
 'DOLocationID=143',
 'DOLocationID=144',

### Q5. Training a model


In [53]:
target = 'duration'
y_train = df[target].values

In [54]:
y_train.shape

(2421440,)

In [55]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

6.98619013594897

 - Train RMSE: 6.98619013594897

### 06. Evaluating the model

In [96]:
df_val = pd.read_parquet('./data/yellow_tripdata_2022-02.parquet')
df_val.shape

(2979431, 19)

In [99]:
df_val['duration'] = df_val.tpep_dropoff_datetime - df_val.tpep_pickup_datetime
df_val.duration = df_val.duration.apply(lambda td: td.total_seconds() / 60)
df_val.shape

(2979431, 20)

In [100]:
# filtering on the df_val_copy
df_val = df_val[(df_val.duration >= 1) & (df_val.duration <=60)]
print(f"New shape of df: {df_val.shape}")

New shape of df: (2918187, 20)


In [101]:
validation_dicts = df_val[cat_vars].to_dict(orient="records")

In [102]:
#Model's feature metrics 
X_Validation = dv.transform(validation_dicts)
X_Validation

<2918187x515 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [103]:
y_validation = df_val[target].values
y_validation.shape

(2918187,)

In [104]:
y_validation_pred = lr.predict(X_Validation)
# Generate MSE
mean_squared_error(y_validation, y_validation_pred, squared=False)

14.612694413410905