In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error as mse

from matplotlib import pyplot as plt

In [87]:
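## The following columns must be dropped from the feature matrix for training, validation, and testing:
## 1. pickup_location_ID
## 2. dropoff_location_ID
## 3. pickup_date
## 4. avg_fare (this is the prediction target, so it must not appear among the features)
## 5. Unnamed: 0 (presumably the row index saved into the CSV -- confirm against the export step)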


In [88]:
def drop_cols(df):
    """Return a copy of ``df`` without the non-feature columns.

    The identifier/date columns and the ``avg_fare`` target are always
    dropped; a missing one raises ``KeyError`` so a malformed input is
    noticed. ``'Unnamed: 0'`` is dropped only when present, so frames read
    from a CSV that does not carry that column are accepted as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the features together with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the listed columns removed; ``df`` is not modified.

    Raises
    ------
    KeyError
        If any of the always-dropped columns is absent from ``df``.
    """
    required = ['pickup_location_ID', 'dropoff_location_ID', 'pickup_date', 'avg_fare']
    # Only drop the optional column when it exists, instead of failing on it.
    optional = [col for col in ['Unnamed: 0'] if col in df.columns]
    return df.drop(columns=required + optional)

def load_train_val_test(data_dir='./fare_prediction_data'):
    """Load the train, validation and test splits as features and targets.

    Reads ``avg_fare_train.csv``, ``avg_fare_val.csv`` and
    ``avg_fare_test.csv`` from ``data_dir``. For every file the feature
    frame is ``drop_cols(frame)`` and the target is the ``avg_fare`` column.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the three CSV files. Defaults to the location
        the notebook has always used, so existing calls are unaffected.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    def _load_split(split):
        # Single place for the read -> feature/target separation that was
        # previously repeated once per split.
        frame = pd.read_csv(f'{data_dir}/avg_fare_{split}.csv')
        return drop_cols(frame), frame['avg_fare']

    X_train, y_train = _load_split('train')
    X_val, y_val = _load_split('val')
    X_test, y_test = _load_split('test')
    return X_train, y_train, X_val, y_val, X_test, y_test

In [89]:
# Materialise all three splits once; each pair is (features, avg_fare target).
(
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = load_train_val_test()

In [90]:
# Baseline model: ordinary least squares fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [91]:
# Display the training feature frame (what remains after drop_cols) as the cell output.
X_train

Unnamed: 0,PU_time_2AM-5:59AM,PU_time_6AM-9:59AM,PU_time_10AM-1:59PM,PU_time_2PM-5:59PM,PU_time_6PM-9:59PM,PU_time_10PM-1:59AM,weekend/holiday,PU_longitude,PU_latitude,DO_longitude,DO_latitude,distance
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.436498,0.486340,0.685417,0.664506,0.263102
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.464140,0.666929,0.485334,0.591462,0.065125
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.506950,0.647527,0.658291,0.916533,0.258914
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.867058,0.309496,0.900212,0.356801,0.048747
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.867058,0.309496,0.686300,0.566794,0.265309
...,...,...,...,...,...,...,...,...,...,...,...,...
2991966,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.426022,0.481428,0.496258,0.595536,0.112665
2991967,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.482776,0.689307,0.502882,0.726237,0.035253
2991968,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.576828,0.558231,0.507943,0.615445,0.076664
2991969,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.502882,0.726237,0.476860,0.661925,0.057868


In [92]:
# Predict with the fitted linear model on every split first ...
y_pred_train_lr = lr.predict(X_train)
y_pred_val_lr = lr.predict(X_val)
y_pred_test_lr = lr.predict(X_test)

# ... then report the mean squared error per split, in train/val/test order.
for report_label, y_actual, y_hat in (
    ('Train MSE for Linear Regression: ', y_train, y_pred_train_lr),
    ('Validation MSE for Linear Regression: ', y_val, y_pred_val_lr),
    ('Test MSE for Linear Regression: ', y_test, y_pred_test_lr),
):
    print(report_label, mse(y_actual, y_hat))

Train MSE for Linear Regression:  28.643067913199495
Validation MSE for Linear Regression:  32.331054071162505
Test MSE for Linear Regression:  30.667541448681575


In [96]:
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
# Hyperparameter tuning: fit one Ridge model per alpha, report train and
# validation MSE, and remember every fitted model with its validation score
# so the best one can be selected explicitly afterwards.
ridge_by_alpha = {}
val_mse_by_alpha = {}
print('======================================================================')
for alpha in alphas:
    rr = Ridge(alpha=alpha)
    rr.fit(X_train, y_train)
    y_pred_train_rr = rr.predict(X_train)
    print('Train MSE for Ridge Regression with Alpha = ' + str(alpha) +':')
    print(mse(y_train, y_pred_train_rr))
    y_pred_val_rr = rr.predict(X_val)
    val_mse_rr = mse(y_val, y_pred_val_rr)
    print('Validation MSE for Ridge Regression with Alpha = ' + str(alpha) +':')
    print(val_mse_rr)
    print('======================================================================')
    ridge_by_alpha[alpha] = rr
    val_mse_by_alpha[alpha] = val_mse_rr

# Model selection: without this step `rr` would simply be the model for the
# last alpha in the list, whatever its validation score. Ties resolve to the
# alpha that appears first in `alphas`.
best_alpha = min(val_mse_by_alpha, key=val_mse_by_alpha.get)
rr = ridge_by_alpha[best_alpha]
# Keep the leftover prediction variables consistent with the selected model.
y_pred_train_rr = rr.predict(X_train)
y_pred_val_rr = rr.predict(X_val)
print('Selected Alpha = ' + str(best_alpha) + ' (lowest validation MSE)')

Train MSE for Ridge Regression with Alpha = 0.001:
28.64304995852185
Validation MSE for Ridge Regression with Alpha = 0.001:
32.33185577790173
Train MSE for Ridge Regression with Alpha = 0.01:
28.643049958541095
Validation MSE for Ridge Regression with Alpha = 0.01:
32.33184464731968
Train MSE for Ridge Regression with Alpha = 0.1:
28.64304996046725
Validation MSE for Ridge Regression with Alpha = 0.1:
32.331733344088434
Train MSE for Ridge Regression with Alpha = 1:
28.64305015306393
Validation MSE for Ridge Regression with Alpha = 1:
32.33062057068018
Train MSE for Ridge Regression with Alpha = 10:
28.643069396170766
Validation MSE for Ridge Regression with Alpha = 10:
32.31951870490219
Train MSE for Ridge Regression with Alpha = 100:
28.64497726493877
Validation MSE for Ridge Regression with Alpha = 100:
32.21106491392894


In [97]:
# Test-set predictions from the Ridge estimator currently bound to `rr`
# by the alpha sweep.
y_pred_test_rr = rr.predict(X=X_test)

In [98]:
# Test MSE of the Ridge predictions, shown as the cell output.
mse(y_true=y_test, y_pred=y_pred_test_rr)

30.5937669668545