In [1]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Render floats with two decimal places whenever pandas displays them.
pd.options.display.float_format = '{:.2f}'.format

In [3]:
def read_dataframe(url, month):
    """Load one month of trip records and prepare them for modelling.

    Reads the parquet file at ``url``, adds a ``duration`` column holding the
    trip length in minutes (drop-off time minus pick-up time), prints two
    diagnostics (column count and the standard deviation of the unfiltered
    durations), keeps only trips lasting between 1 and 60 minutes inclusive,
    and casts the pick-up / drop-off location IDs to strings.

    Parameters
    ----------
    url : str
        Location handed directly to ``pd.read_parquet``.
    month : str
        Label used only inside the printed diagnostic messages.

    Returns
    -------
    pandas.DataFrame
        The filtered rows as an independent frame (original index labels are
        preserved), with ``duration`` in minutes and ``PULocationID`` /
        ``DOLocationID`` stored as strings.
    """
    df = pd.read_parquet(url)

    print(f"Column count for month {month} is {len(df.columns)}.")

    # Vectorised timedelta -> minutes conversion; avoids one Python call per row.
    trip_time = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Reported on the unfiltered data, i.e. before outliers are dropped.
    print(f"The standard deviation for month {month} is {df.duration.std()}")

    # .copy() detaches the filtered rows from the full frame so the column
    # assignment below writes to a real frame rather than a slice (which
    # triggers SettingWithCopyWarning and has ambiguous semantics).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [4]:
# Monthly trip files: January is used for training, February for validation.
TRAIN_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
VAL_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'

df_train = read_dataframe(TRAIN_URL, "January")
df_val = read_dataframe(VAL_URL, "February")

Column count for month January is 19.
The standard deviation for month January is 42.594351241920904
Column count for month February is 19.
The standard deviation for month February is 42.84210176105097


In [5]:
# Row counts of the prepared frames: (train, validation).
df_train.shape[0], df_val.shape[0]

(3009173, 2855951)

In [6]:
# Add a combined "<pickup>_<dropoff>" key column to both frames.
for trips in (df_train, df_val):
    trips['PU_DO'] = trips['PULocationID'] + '_' + trips['DOLocationID']

In [7]:
# Feature columns used by the model; ['PU_DO'] is the alternative
# combined feature and is not used in this run.
categorical = ['PULocationID', 'DOLocationID']

# One dict per training row, restricted to the feature columns.
train_dicts = df_train[categorical].to_dict('records')

# Fit the vectorizer on the training records and build the training matrix.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [8]:
# Encode the validation rows with the vectorizer fitted on the training rows
# (transform only, no re-fit).
val_dicts = df_val.loc[:, categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [9]:
# Name of the column the model predicts.
target = 'duration'

# Target vectors for the training and validation frames, as NumPy arrays.
Y_train, Y_val = (frame[target].to_numpy() for frame in (df_train, df_val))

In [10]:
# Fit a plain linear regression on the vectorized training features.
lr = LinearRegression()
lr.fit(X_train, Y_train)

# Predict trip durations for the validation month.
Y_pred = lr.predict(X_val)

# Training-set target values (same data as Y_train); left in place in case
# later cells refer to it.
orig = df_train[target].values

# Validation RMSE. The `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the square root
# of the MSE explicitly; this works on every scikit-learn version.
mean_squared_error(Y_val, Y_pred) ** 0.5

7.811832836304415

In [11]:
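# NOTE: leftover experiment kept for reference only. `X_pred` is not defined in
# the cells above, so this line would fail if uncommented.
# mean_squared_error(X_val, X_pred, squared = False)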

In [12]:
# Persist the fitted vectorizer and model together so they can be reloaded as
# a matching (dv, lr) pair.
model_path = Path('models/lin_reg_homework.bin')

# Create the output directory if needed; on a fresh checkout it may not exist,
# and opening the file for writing would then raise FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)