In [1]:
# import libraries
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [2]:
# load the two monthly trip files: 2023-01 is used for training,
# 2023-02 for validation
train_path = './data/yellow_tripdata_2023-01.parquet'
val_path = './data/yellow_tripdata_2023-02.parquet'
data_train, data_val = pd.read_parquet(train_path), pd.read_parquet(val_path)

In [3]:
# Q1: how many columns does the raw training frame have?
column_count = data_train.shape[1]
print("Q1: Number of columns in the Jan 2023 dataset: ", column_count)

Q1: Number of columns in the Jan 2023 dataset:  19


In [4]:
# add a 'duration' column (trip length in minutes) to both frames,
# derived from the pickup and dropoff timestamps
for frame in (data_train, data_val):
    elapsed = frame['tpep_dropoff_datetime'] - frame['tpep_pickup_datetime']
    frame['duration'] = elapsed.dt.total_seconds() / 60

In [5]:
# Q2: standard deviation of the trip duration (minutes) in the training data
duration_std = data_train['duration'].std()
print('Q2: Standard Deviation of the trips in Jan 2023: ', duration_std.round(2))

Q2: Standard Deviation of the trips in Jan 2023:  42.59


In [6]:
# Q3: percentage of training records whose duration lies in [1, 60] minutes,
# i.e. the share that will remain once the filter is applied
total_records = len(data_train)
kept_records = len(data_train.query('1 <= duration <= 60'))
proportion_left = round(kept_records / total_records * 100, 2)
print("Q3: Proportion of Records Left: ", proportion_left, '%')

Q3: Proportion of Records Left:  98.12 %


In [7]:
# keep only trips lasting between 1 and 60 minutes (inclusive) in both splits.
# .copy() turns each filtered result into an independent DataFrame: without it,
# pandas (when copy-on-write is not enabled) still tracks the result as derived
# from the unfiltered frame, and overwriting columns on it afterwards emits a
# SettingWithCopyWarning.
data_train = data_train.query('1 <= duration <= 60').copy()
data_val = data_val.query('1 <= duration <= 60').copy()

In [8]:
# columns used as model inputs; both are treated as categorical
categorical_features = [
    'PULocationID',
    'DOLocationID',
]

In [9]:
# convert the categorical features to string
data_train[categorical_features] = data_train[categorical_features].astype(str)
data_val[categorical_features] = data_val[categorical_features].astype(str)

In [10]:
# turn the selected columns of each frame into a list of per-row dicts,
# the input format passed to DictVectorizer
train_dict, val_dict = (
    frame[categorical_features].to_dict(orient='records')
    for frame in (data_train, data_val)
)

In [11]:
# the vectorizer is fitted on the training records only; the validation
# records are transformed with that same fitted vectorizer
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# regression targets: the trip duration column of each split
y_train, y_val = data_train['duration'], data_val['duration']

In [12]:
# Q4: width (second dimension) of the vectorized training matrix
n_features = X_train.shape[1]
print('Q4: The number of columns in the matrix is: ', n_features)

Q4: The number of columns in the matrix is:  515


In [13]:
# fit a linear regression on the vectorized training features
model = LinearRegression().fit(X_train, y_train)

# predict durations for both the training and the validation matrix
y_pred_train, y_pred_val = model.predict(X_train), model.predict(X_val)

In [14]:
# Q5/Q6: report the root mean squared error on each split
for label, actual, predicted in (
    ('Q5: RMSE (Train): ', y_train, y_pred_train),
    ('Q6: RMSE (Validation): ', y_val, y_pred_val),
):
    print(label, root_mean_squared_error(actual, predicted))

Q5: RMSE (Train):  7.6492624397080675
Q6: RMSE (Validation):  7.81181211389241
