In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

Set default display format for pandas.

In [2]:
# Render every float in pandas output with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

Function that reads and prepares a monthly trip dataframe. Its printed output includes the homework answers for Q1, Q2 and Q3.

In [3]:
def read_dataframe(url, month):
    """Load one month of taxi trips and prepare it for modelling.

    Prints the raw column count, then the standard deviation and a
    percentile summary of the trip duration (in minutes) computed on the
    unfiltered data. Afterwards only trips lasting between 1 and 60 minutes
    (both inclusive) are kept and the pickup/dropoff location IDs are cast
    to strings.

    Args:
        url: Path or URL of the parquet file, passed to ``pd.read_parquet``.
        month: Label used only in the printed messages.

    Returns:
        A DataFrame holding the filtered trips, with an added ``duration``
        column (minutes) and string-typed ``PULocationID`` / ``DOLocationID``.
    """
    df = pd.read_parquet(url)

    print(f"Column count for month {month} is {len(df.columns)}.")

    # Vectorised timedelta -> minutes conversion; avoids a Python-level
    # call per row, which is slow on millions of trips.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Statistics are reported on the unfiltered durations on purpose.
    print(f"The standard deviation for month {month} is {df.duration.std()}")

    print(df.duration.describe(percentiles=[0.90, 0.92, 0.95, 0.98, 0.99, 0.995]))

    # Take an explicit copy of the filtered rows so the column assignment
    # below writes to an independent frame instead of a slice of `df`
    # (which raises SettingWithCopyWarning on pandas < 3).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']

    df[categorical] = df[categorical].astype(str)

    return df

Read dataframes from Yellow trip data 2023 January and February.

In [4]:
# January 2023 is the training month; February 2023 is used for validation.
df_train = read_dataframe(
    url='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet',
    month="January",
)
df_val = read_dataframe(
    url='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet',
    month="February",
)

Column count for month January is 19.
The standard deviation for month January is 42.594351241920904
count   3066766.00
mean         15.67
std          42.59
min         -29.20
50%          11.52
90%          27.94
92%          30.55
95%          36.47
98%          48.73
99%          57.25
99.5%        65.52
max       10029.18
Name: duration, dtype: float64
Column count for month February is 19.
The standard deviation for month February is 42.84210176105097
count   2913955.00
mean         16.02
std          42.84
min         -43.62
50%          11.80
90%          28.48
92%          31.12
95%          37.22
98%          49.90
99%          58.87
99.5%        67.38
max        7053.62
Name: duration, dtype: float64


Initialise the DictVectorizer and one-hot encode the categorical columns to get the feature matrices.

In [5]:
# Vectorizer shared by the training and validation cells below.
dv = DictVectorizer()
# Columns used as model features.
categorical = ['PULocationID', 'DOLocationID']

In [6]:
# One {column: value} dict per trip; fit_transform both fits the vectorizer
# on the training records and encodes them.
train_dicts = df_train[categorical].to_dict('records')
X_train = dv.fit_transform(train_dicts)

In [7]:
# Validation records are only transformed (no refit), so they are encoded
# with the vectorizer as it was fitted on the training records.
val_dicts = df_val[categorical].to_dict('records')
X_val = dv.transform(val_dicts)

Get the dimensionality of the X_train matrix. This is the homework answer for Q4.

In [8]:
# (rows, columns) of the encoded training matrix: one row per training
# record, one column per feature produced by the DictVectorizer.
X_train.shape

(3009173, 515)

In [9]:
# Column the model is trained to predict.
target = 'duration'

# Pull the target column out of each frame as a plain array.
Y_train, Y_val = (frame[target].values for frame in (df_train, df_val))

Train a linear regression model with trip duration as the target.

In [10]:
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, Y_train)

# Predictions on both splits, used for the RMSE cells below.
Y_pred_train, Y_pred_val = lr.predict(X_train), lr.predict(X_val)

RMSE on the training set. This is the homework answer for Q5.

In [11]:
# RMSE = sqrt(MSE). The root is taken explicitly because the `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6; this form
# works on every version.
mean_squared_error(Y_train, Y_pred_train) ** 0.5

7.649261027792376

RMSE on the validation set. This is the homework answer for Q6.

In [12]:
# RMSE = sqrt(MSE). The root is taken explicitly because the `squared=False`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6; this form
# works on every version.
mean_squared_error(Y_val, Y_pred_val) ** 0.5

7.811832836304415

Save the homework model.

In [13]:
# Persist the fitted vectorizer together with the model, since both are
# needed to score new data.
model_path = Path('models/lin_reg_homework.bin')
# Create the output directory if it is missing so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)