In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
!pip install pyarrow



In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [4]:
# Yellow-taxi trip records: the 2023-01 file becomes the training frame and the
# 2023-02 file the validation frame. Both are read directly from their URLs.
train_df, val_df = (
    pd.read_parquet(source)
    for source in (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    )
)

In [5]:
# Preview the first rows of the raw January frame.
train_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [6]:
# (rows, columns) of the raw January frame, before any derived columns or filtering.
train_df.shape

(3066766, 19)

In [7]:
# Q1: number of columns in the raw January data (second element of .shape).
print(f"No. of Columns for January: {train_df.shape[1]}")

No. of Columns for January: 19


In [8]:
# Trip duration in minutes. Subtracting the two datetime columns yields a
# timedelta column; the vectorized .dt accessor converts it to seconds in one
# pass instead of calling a Python lambda for every row.
train_df['duration'] = (
    train_df['tpep_dropoff_datetime'] - train_df['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [9]:
# Spot-check the computed durations (values are in minutes).
train_df['duration'].head()

0     8.433333
1     6.316667
2    12.750000
3     9.616667
4    10.833333
Name: duration, dtype: float64

In [11]:
# Q2: standard deviation of trip duration (minutes), computed before the
# outlier filter below is applied.
print(f"Standard Deviation of Duration: {train_df['duration'].std()}")

Standard Deviation of Duration: 42.594351241920904


In [12]:
# Keep only trips lasting between 1 and 60 minutes (inclusive), recording the
# row count before and after so the retained fraction can be reported.
past_records = train_df.shape[0]
# .copy() makes the filtered frame independent of the original, so the later
# column assignment on train_df does not raise SettingWithCopyWarning.
train_df = train_df[(train_df.duration >= 1) & (train_df.duration <= 60)].copy()
current_records = train_df.shape[0]

In [13]:
# Q3: share of January records that survive the 1-60 minute duration filter,
# shown both as a raw fraction and as a percentage rounded to two decimals.
print(f"Fraction of records left after dropping outliers: {current_records / past_records}")
print(f"In percentage: {((current_records / past_records) * 100):.2f}%")

Fraction of records left after dropping outliers: 0.9812202822125979
In percentage: 98.12%


In [14]:
# Column dtypes before the categorical cast; the location ID columns are still integers here.
train_df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [15]:
# Pickup and dropoff zone IDs are the only model features.
categorical = ['PULocationID', 'DOLocationID']
# Cast the IDs to strings: DictVectorizer one-hot encodes string values, whereas
# numeric values would be passed through as a single numeric feature.
train_df[categorical] = train_df[categorical].astype(str)

In [16]:
# Confirm the cast: the two location ID columns should now report dtype object.
train_df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
duration                        float64
dtype: object

In [17]:
# Turn each trip into a {column: value} record, then let the vectorizer learn
# its feature set from those records and encode them as the training matrix.
train_dicts = train_df[categorical].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [18]:
# Q4: number of feature columns produced by the DictVectorizer.
print(f"Dimensionality of the Matrix (No. of Columns): {X_train.shape[1]}")

Dimensionality of the Matrix (No. of Columns): 515


In [19]:
# Name of the column to predict, and its training values as an array.
target = 'duration'
y_train = train_df[target].values

In [20]:
# Fit a linear regression on the vectorized location features, then predict on
# the same training matrix so the in-sample error can be measured.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

In [21]:
# Q5: RMSE of the model on the data it was trained on.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
train_rmse = mean_squared_error(y_train, y_pred) ** 0.5
print(f"Train RMSE: {train_rmse}")

Train RMSE: 7.649261027792376


In [22]:
# Preview the first rows of the raw February (validation) frame.
val_df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,1,2023-02-01 00:32:53,2023-02-01 00:34:34,2.0,0.3,1.0,N,142,163,2,4.4,3.5,0.5,0.0,0.0,1.0,9.4,2.5,0.0
1,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,-3.0,-1.0,-0.5,0.0,0.0,-1.0,-5.5,0.0,0.0
2,2,2023-02-01 00:35:16,2023-02-01 00:35:30,1.0,0.0,1.0,N,71,71,4,3.0,1.0,0.5,0.0,0.0,1.0,5.5,0.0,0.0
3,1,2023-02-01 00:29:33,2023-02-01 01:01:38,0.0,18.8,1.0,N,132,26,1,70.9,2.25,0.5,0.0,0.0,1.0,74.65,0.0,1.25
4,2,2023-02-01 00:12:28,2023-02-01 00:25:46,1.0,3.22,1.0,N,161,145,1,17.0,1.0,0.5,3.3,0.0,1.0,25.3,2.5,0.0


In [23]:
# Shape of the raw February frame: no duration column yet, outliers still included.
val_df.shape

(2913955, 19)

In [24]:
# Trip duration in minutes for the validation frame, computed the same way as
# for the training frame. The vectorized .dt accessor converts the timedelta
# column in one pass instead of calling a Python lambda for every row.
val_df['duration'] = (
    val_df['tpep_dropoff_datetime'] - val_df['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [25]:
# Shape after adding the 'duration' column: same row count, one more column.
val_df.shape

(2913955, 20)

In [26]:
# Apply the same 1-60 minute (inclusive) duration filter used on the training data.
# .copy() makes the filtered frame independent of the original, so the later
# column assignment on val_df does not raise SettingWithCopyWarning.
val_df = val_df[(val_df.duration >= 1) & (val_df.duration <= 60)].copy()

In [27]:
# Shape after dropping trips outside the 1-60 minute range.
val_df.shape

(2855951, 20)

In [28]:
# Column dtypes of the filtered February frame before the categorical cast.
# NOTE(review): compare against the January dtypes -- integer widths and the
# capitalisation of the airport fee column differ between the two files.
val_df.dtypes

VendorID                          int32
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int32
DOLocationID                      int32
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
Airport_fee                     float64
duration                        float64
dtype: object

In [29]:
# Reminder of the feature columns defined earlier for the training data.
print(categorical)

['PULocationID', 'DOLocationID']


In [30]:
# Cast the location IDs to strings, mirroring the training preprocessing, so the
# already-fitted DictVectorizer receives the same kind of values it was fit on.
val_df[categorical] = val_df[categorical].astype(str)

In [31]:
# Confirm the cast: the two location ID columns should now report dtype object.
val_df.dtypes

VendorID                          int32
tpep_pickup_datetime     datetime64[ns]
tpep_dropoff_datetime    datetime64[ns]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                     object
DOLocationID                     object
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
Airport_fee                     float64
duration                        float64
dtype: object

In [32]:
# Convert validation trips to {column: value} records.
val_dicts = val_df[categorical].to_dict(orient='records')
# transform (not fit_transform): reuse the feature set learned from the training
# data so X_val has the same columns as X_train.
X_val = dv.transform(val_dicts)

In [33]:
# Sanity check: the column count should match the training matrix.
print(X_val.shape)

(2855951, 515)


In [34]:
# Reminder of the target column name defined earlier.
print(target)

duration


In [35]:
# Ground-truth durations for the validation trips.
y_val = val_df[target].values

In [36]:
# Predict durations for the validation features with the model fit on the training data.
# NOTE: this rebinds y_pred, replacing the training-set predictions made earlier.
y_pred = lr.predict(X_val)

In [37]:
# Q6: RMSE of the model on the held-out February data.
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# this form works on every version.
val_rmse = mean_squared_error(y_val, y_pred) ** 0.5
print(f"Validation RMSE: {val_rmse}")

Validation RMSE: 7.811832836304415
