In [60]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import missingno as msno

from lib.preprocessing import *

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

# NOTE(review): the filter below has no category or module restriction, so it
# silences every warning for the rest of the session, including any emitted by
# pandas or scikit-learn while fitting the models in this notebook.
import warnings # suppress warnings
warnings.filterwarnings('ignore')

In [11]:
# Raw competition files, read straight from the local data/ directory.
riders = pd.read_csv(filepath_or_buffer="data/riders.csv")
sampsub = pd.read_csv(filepath_or_buffer="data/sample_submission.csv")
test = pd.read_csv(filepath_or_buffer="data/test.csv")

# Modelling table used by every cell below.  `load_sample` arrives through the
# star import from lib.preprocessing.
# NOTE(review): presumably it reads the cleaned table out of this SQLite file
# and returns a DataFrame -- confirm against lib/preprocessing.
clean = load_sample(sql_db="data/cleaned_alex_emily_nelson.db")

In [12]:
# Remove the "Vehicle Type" column from `clean` in place.
# NOTE(review): not re-runnable -- a second execution raises KeyError because
# the column is already gone; re-load `clean` before re-running.
clean.drop(columns=["Vehicle Type"], inplace=True)

In [13]:
def get_seconds_from_dt_series(series: pd.Series) -> pd.Series:
    """Convert a datetime-like series to whole seconds elapsed since midnight.

    Only the time-of-day part (hour, minute, second) of each value is used;
    the date part and any sub-second fraction are ignored.

    Parameters
    ----------
    series : pd.Series
        Values accepted by ``pd.to_datetime`` (datetime64 values or parseable
        strings).

    Returns
    -------
    pd.Series
        ``hour * 3600 + minute * 60 + second`` for every element, on the same
        index as ``series``.
    """
    # Parse once and reuse the accessor: the original re-ran pd.to_datetime
    # for each of the three components.
    clock = pd.to_datetime(series).dt
    return clock.hour * 3600 + clock.minute * 60 + clock.second
    
# Overwrite each clock-time column with its seconds-since-midnight value.
# NOTE(review): not idempotent -- the columns are replaced by the converted
# integers, so running this cell a second time would push those integers back
# through pd.to_datetime.  Re-load `clean` before re-running.
for time_col in (
    "Placement - Time",
    "Confirmation - Time",
    "Arrival at Pickup - Time",
    "Pickup - Time",
):
    clean[time_col] = get_seconds_from_dt_series(clean[time_col])

# One indicator column per distinct value of "Personal or Business"; merged
# back into `clean` in the next cell.
clean_one = pd.get_dummies(clean["Personal or Business"])

In [57]:
# Swap the original "Personal or Business" column for a single indicator:
# put the dummy columns in front, drop the original column together with the
# redundant "Personal" dummy, and give the remaining "Business" dummy the
# original column name.  (Nothing is displayed by this cell.)
clean = (
    pd.concat((clean_one, clean), axis=1)
    .drop(["Personal or Business", "Personal"], axis=1)
    .rename(columns={"Business": "Personal or Business"})
)

In [15]:
# Standardise the predictors: every column of `clean` except the target
# 'Time from Pickup to Arrival' goes through StandardScaler, and the scaled
# array is wrapped back into a DataFrame carrying the same column labels.
# NOTE(review): the scaler is fit on all rows here, before the
# cross-validation cells below split the data, so each validation fold has
# already contributed to the scaling statistics.
scaler = StandardScaler()
X = pd.DataFrame(
    data=scaler.fit_transform(clean.loc[:, clean.columns != 'Time from Pickup to Arrival']),
    columns=clean.columns[clean.columns != 'Time from Pickup to Arrival'],
)

In [29]:
# Ordinary least squares on the scaled predictors; the cell then displays the
# fitted coefficients labelled by feature and sorted from most negative to
# most positive.
olsmod = LinearRegression().fit(X, clean["Time from Pickup to Arrival"])
pd.Series(data=olsmod.coef_, index=X.columns).sort_values(ascending=True)

Destination Lat                   -750.012597
Pickup Long                       -254.540081
Destination Long                  -182.516193
Platform Type                      -35.005923
average_rating                     -33.884939
Placement - Weekday (Mo = 1)        -6.479837
Placement - Day of Month            -2.420101
Fulfillment - Day of Month          -2.420101
Pickup - Time                       -0.173112
Confirmation - Time                 -0.084699
Placement - Time                    -0.061827
orders                              -0.036953
age                                 -0.023678
user_id                             -0.019455
rider_id                             0.002273
order_no                             0.002699
number_rating                        0.170925
Arrival at Pickup - Time             0.317324
Temperature                          5.206481
Fulfillment - Weekday (Su = 0)      16.223473
Personal or Business                49.724888
Distance (KM)                     

In [48]:
# Recursive feature elimination around the OLS estimator, with the number of
# retained features chosen by 100-fold cross-validated (negated) MSE.
rfe = RFECV(
    estimator=olsmod,
    scoring="neg_mean_squared_error",
    cv=100,
).fit(X, clean["Time from Pickup to Arrival"])

# Report how many features survived, then display their names.
print('Selected', rfe.n_features_, 'features:')
X.columns[rfe.support_]

Selected 17 features:


Index(['Personal or Business', 'Platform Type', 'Placement - Day of Month',
       'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time',
       'Distance (KM)', 'Temperature', 'Pickup Lat', 'Pickup Long',
       'Destination Lat', 'Destination Long', 'Fulfillment - Weekday (Su = 0)',
       'Fulfillment - Day of Month', 'average_rating'],
      dtype='object')

In [49]:
# Ridge regression: pick the penalty strength from three candidate alphas
# using 100-fold cross-validated (negated) MSE.
ridge = RidgeCV(
    alphas=[0.1, 1.0, 10.0],
    cv=100,
    scoring="neg_mean_squared_error",
)
# Kept as the final expression so the fitted estimator is displayed.
ridge.fit(X, clean["Time from Pickup to Arrival"])

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=100,
        scoring='neg_mean_squared_error')

In [50]:
# Penalty strength RidgeCV selected from the candidate alphas it was given.
ridge.alpha_

1.0

In [51]:
# Side-by-side table of OLS and ridge coefficients, one row per feature.
pd.DataFrame(
    data=dict(ols=olsmod.coef_, ridge=ridge.coef_),
    index=X.columns,
)

Unnamed: 0,ols,ridge
Personal or Business,49.724888,53.707309
order_no,0.002699,0.002711
user_id,-0.019455,-0.018462
Platform Type,-35.005923,-37.919203
Placement - Day of Month,-2.420101,-2.441202
Placement - Weekday (Mo = 1),-6.479837,-7.580231
Placement - Time,-0.061827,-0.060488
Confirmation - Time,-0.084699,-0.085388
Arrival at Pickup - Time,0.317324,0.318261
Pickup - Time,-0.173112,-0.174708


In [52]:
# Lasso regression: search an automatically generated path of 100 alphas with
# 100-fold cross-validation.
lasso = LassoCV(
    cv=100,
    n_alphas=100,
)
# Kept as the final expression so the fitted estimator is displayed.
lasso.fit(X, clean["Time from Pickup to Arrival"])

LassoCV(cv=100)

In [53]:
# Penalty strength LassoCV selected by cross-validation.
lasso.alpha_

429.4572965948736

In [54]:
# Lasso coefficients labelled by feature name, in the column order of X.
pd.Series(lasso.coef_, index=X.columns)

Personal or Business               0.000000
order_no                           0.002743
user_id                           -0.017897
Platform Type                     -0.000000
Placement - Day of Month          -0.000000
Placement - Weekday (Mo = 1)       0.000000
Placement - Time                  -0.061676
Confirmation - Time               -0.092462
Arrival at Pickup - Time           0.316738
Pickup - Time                     -0.164502
Distance (KM)                     84.151284
Temperature                        0.000000
Pickup Lat                         0.000000
Pickup Long                       -0.000000
Destination Lat                   -0.000000
Destination Long                   0.000000
rider_id                           0.000143
Fulfillment - Weekday (Su = 0)     0.000000
Fulfillment - Day of Month        -0.000000
orders                            -0.043686
age                               -0.014748
average_rating                    -0.000000
number_rating                   

In [65]:
# Compare the four candidate models under the same 100-fold cross-validation.
# The scorer is "neg_mean_absolute_error", so the negated mean of the fold
# scores is a mean ABSOLUTE error.  The printed labels therefore say MAE; the
# original labels said MSE, which did not match the metric being computed.
# NOTE(review): X was scaled, and the RFE mask and both alphas were chosen,
# on the full data set before these folds are drawn, so the figures below are
# likely optimistic.

# Plain Ridge / Lasso estimators pinned to the alphas picked by RidgeCV / LassoCV.
ridge_best = Ridge(alpha=ridge.alpha_)
lasso_best = Lasso(alpha=lasso.alpha_)


def _cv_neg_mae(estimator, features):
    """Return the per-fold negated-MAE scores of `estimator` on `features`.

    Every model is scored against the same target column and with the same
    fold count so the results are directly comparable.
    """
    return cross_val_score(
        estimator=estimator,
        X=features,
        y=clean["Time from Pickup to Arrival"],
        scoring="neg_mean_absolute_error",
        cv=100,
    )


cv_ols = _cv_neg_mae(olsmod, X)
cv_rfe = _cv_neg_mae(olsmod, X[X.columns[rfe.support_]])  # OLS on the RFE-selected columns only
cv_ridge = _cv_neg_mae(ridge_best, X)
cv_lasso = _cv_neg_mae(lasso_best, X)

# Scores are negated errors; flip the sign to report a positive MAE.
print('OLS MAE:', -cv_ols.mean())
print('RFE MAE:', -cv_rfe.mean())
print('Ridge MAE:', -cv_ridge.mean())
print('Lasso MAE:', -cv_lasso.mean())

OLS MSE: 541.4305024090005
RFE MSE: 541.2211138429134
Ridge MSE: 540.9267399216817
Lasso MSE: 546.4358026759803
