In [None]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [None]:
# URLs of the green_tripdata parquet files for 2021-01 and 2021-02.
# NOTE: despite the `df_` prefix, these names hold URL strings, not DataFrames;
# they are handed to read_dataframe() to be loaded.
df_jan, df_feb = (
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet',
    'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet',
)

In [None]:
def read_dataframe(url):
    """Load one parquet file of trip records and prepare it for modelling.

    Parameters
    ----------
    url : str or path-like
        Location handed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The loaded records with:

        * ``lpep_pickup_datetime`` / ``lpep_dropoff_datetime`` parsed with
          ``pd.to_datetime``;
        * a ``Duration`` column: drop-off minus pick-up, in minutes, rounded
          to 2 decimals;
        * only rows whose ``Duration`` lies in the closed range [1, 60];
        * a fresh 0-based index;
        * ``PULocationID`` / ``DOLocationID`` cast to ``str``;
        * a ``PU_DO`` column of the form ``"<PULocationID>_<DOLocationID>"``.
    """
    df = pd.read_parquet(url)
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes; rounding happens *before* the range
    # filter so the boundary comparison sees the rounded value.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['Duration'] = (elapsed.dt.total_seconds() / 60).round(2)

    # reset_index() returns a new frame, so the column assignments below act
    # on an independent object rather than on a slice of the loaded frame.
    in_range = (df['Duration'] >= 1) & (df['Duration'] <= 60)
    df = df.loc[in_range].reset_index(drop=True)

    # Location IDs are cast to strings so they can be concatenated into a
    # single pickup/drop-off pair key.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

# Run both files through the same loader: the first file becomes the training
# frame, the second the validation frame.
df_train = read_dataframe(url=df_jan)
df_val = read_dataframe(url=df_feb)

In [None]:
# Columns fed to the model.
categorical = ['PU_DO']
numerical = ['trip_distance']

# One {column: value} record per row, for each split.
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
val_dicts = df_val[categorical + numerical].to_dict(orient='records')

# Fit the vectorizer on the training records only, then apply that same
# fitted mapping to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [None]:
# Regression target: the 'Duration' column of each split, as plain arrays.
target = 'Duration'
y_train, y_val = df_train[target].values, df_val[target].values

In [None]:
# Fit a plain LinearRegression on the training split and score it on validation.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Fit a Lasso model (alpha=0.0001) on the training split and score it on
# validation. Note this rebinds `lr`, replacing any previously fitted model.
lr = Lasso(alpha=0.0001)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Fit a Ridge model with default settings on the training split and score it
# on validation. Note this rebinds `lr`, replacing any previously fitted model.
lr = Ridge()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)
# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`, which was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_val, y_pred) ** 0.5

In [None]:
# Persist the fitted vectorizer together with the model as one (dv, lr) tuple.
# `lr` is whichever model was assigned last; in the cells shown here that is
# the Ridge model, despite the `lin_reg` file name.
model_path = Path('models/lin_reg.bin')
# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f_out:
    pickle.dump((dv, lr), f_out)