In [2]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import missingno as msno

from lib.preprocessing import *

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_absolute_error

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [3]:
riders = pd.read_csv("data/riders.csv")
sampsub = pd.read_csv("data/sample_submission.csv")
test = pd.read_csv("data/test.csv")
clean = load_sample(sql_db="data/cleaned_alex_emily_nelson.db")

In [4]:
clean.drop(["Vehicle Type"], axis=1, inplace=True)

In [5]:
def get_seconds_from_dt_series(series: pd.Series) -> pd.Series:
        return pd.to_datetime(series).dt.hour * 3600 + pd.to_datetime(series).dt.minute * 60 + pd.to_datetime(series).dt.second
    
clean["Placement - Time"]=get_seconds_from_dt_series(clean["Placement - Time"])
clean["Confirmation - Time"]=get_seconds_from_dt_series(clean["Confirmation - Time"])
clean["Arrival at Pickup - Time"]=get_seconds_from_dt_series(clean["Arrival at Pickup - Time"])
clean["Pickup - Time"]=get_seconds_from_dt_series(clean["Pickup - Time"])
clean_one = pd.get_dummies(clean["Personal or Business"])

In [6]:
# display result
clean = pd.concat((clean_one, clean), axis=1)
clean = clean.drop(["Personal or Business"], axis=1)
clean = clean.drop(["Personal"], axis=1)
clean = clean.rename(columns={"Business": "Personal or Business"})

In [7]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(clean[clean.columns[clean.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = clean.columns[clean.columns!='Time from Pickup to Arrival']) # save to dataframe

In [8]:
# fitting model
olsmod = LinearRegression()
olsmod.fit(X=X, y=clean["Time from Pickup to Arrival"])
pd.Series(olsmod.coef_, index=X.columns).sort_values()

Pickup - Time                    -1369.590988
Confirmation - Time              -1045.956770
Placement - Time                  -581.824411
orders                             -89.129723
Placement - Weekday (Mo = 1)       -60.964026
Pickup Long                        -33.153047
Personal or Business               -28.006401
average_rating                     -24.429199
age                                -14.907015
Destination Long                   -12.816730
order_no                            -8.099513
Fulfillment - Day of Month           3.643558
Placement - Day of Month             3.643558
Pickup Lat                          11.408046
Temperature                         15.731119
user_id                             16.932358
Destination Lat                     21.581774
rider_id                            21.835696
Platform Type                       64.044886
number_rating                       68.295757
Fulfillment - Weekday (Su = 0)      78.980112
Distance (KM)                     

In [9]:
# recursive feature elimination
rfe = RFECV(estimator=olsmod, cv=100, scoring="neg_mean_squared_error")
rfe.fit(X=X, y=clean["Time from Pickup to Arrival"])
print('Selected', rfe.n_features_, 'features:')
X.columns[rfe.support_]

Selected 11 features:


Index(['Platform Type', 'Placement - Weekday (Mo = 1)', 'Placement - Time',
       'Confirmation - Time', 'Arrival at Pickup - Time', 'Pickup - Time',
       'Distance (KM)', 'Pickup Long', 'Fulfillment - Weekday (Su = 0)',
       'orders', 'number_rating'],
      dtype='object')

In [10]:
# Ridge Regression
ridge=RidgeCV(alphas=[0.1, 1.0, 10.0], scoring="neg_mean_squared_error", cv=100)
ridge.fit(X=X, y=clean["Time from Pickup to Arrival"])

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=100,
        scoring='neg_mean_squared_error')

In [11]:
ridge.alpha_

0.1

In [12]:
pd.DataFrame({
    'ols':olsmod.coef_, 'ridge':ridge.coef_
}, index=X.columns)

Unnamed: 0,ols,ridge
Personal or Business,-28.006401,-28.496408
order_no,-8.099513,-8.02354
user_id,16.932358,17.055464
Platform Type,64.044886,64.179394
Placement - Day of Month,3.643558,3.560058
Placement - Weekday (Mo = 1),-60.964026,-60.727009
Placement - Time,-581.824411,-584.946349
Confirmation - Time,-1045.95677,-1006.334026
Arrival at Pickup - Time,2998.760756,2932.011942
Pickup - Time,-1369.590988,-1339.283253


In [13]:
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])

LassoCV(cv=100)

In [14]:
lasso.alpha_

0.5806626336730881

In [15]:
pd.Series(lasso.coef_, index=X.columns)

Personal or Business               -29.211831
order_no                            -7.079930
user_id                             17.175173
Platform Type                       63.105909
Placement - Day of Month             0.583743
Placement - Weekday (Mo = 1)       -53.456526
Placement - Time                  -614.421323
Confirmation - Time               -751.413646
Arrival at Pickup - Time          2569.194448
Pickup - Time                    -1200.976436
Distance (KM)                      594.571407
Temperature                         14.432947
Pickup Lat                          12.470923
Pickup Long                        -32.361804
Destination Lat                     20.537762
Destination Long                   -10.748426
rider_id                            22.046982
Fulfillment - Weekday (Su = 0)      71.680056
Fulfillment - Day of Month           5.034798
orders                             -91.159367
age                                -14.852583
average_rating                    

In [16]:
ridge_best = Ridge(alpha=ridge.alpha_)
lasso_best = Lasso(alpha=lasso.alpha_)
cv_ols = cross_val_score(estimator=olsmod, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
cv_rfe = cross_val_score(estimator=olsmod, X=X[X.columns[rfe.support_]], y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
cv_ridge = cross_val_score(estimator=ridge_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)

print('OLS MAE:', -cv_ols.mean())
print('RFE MAE:', -cv_rfe.mean())
print('Ridge MAE:', -cv_ridge.mean())
print('Lasso MAE:', -cv_lasso.mean())

OLS MSE: 545.9273608951255
RFE MSE: 544.8771960693266
Ridge MSE: 545.7661817781473
Lasso MSE: 544.8874312249457


# Data Demolition

How would our regressions be affected if we demolished some data?
### Oh I'm glad you asked!

In [17]:
demo = clean

In [25]:
demo.dtypes
demo.average_rating

0       14.6
1       14.3
2       14.4
3       14.4
4       13.8
        ... 
2115    12.7
2116    14.2
2117    14.3
2118    13.8
2119    14.2
Name: average_rating, Length: 2120, dtype: float64

In [20]:
demo=demo.drop(columns=["Temperature", "rider_id", "age"]) # dropped randomly selected columns

In [23]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MSE: 545.0075482121848


Ah! Our model is only damaged by an additional error of 0.12011698723! These columns must not have been very influential-- let's drop some more influential ones.

In [26]:
demo=clean
demo=demo.drop(columns=["Distance (KM)", "orders", "average_rating"]) # dropped randomly selected columns

In [27]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"]
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MSE:', -cv_lasso.mean())

Lasso MSE: 742.0418242996093


There we go-- an increase in error of 197.154393075. These must be greatly impactful.

What if we passed these through sin functions? 

In [40]:
demo = clean
sin: np.sin
demo=demo.apply(np.sin, columns=["Distance (KM)", "orders", "average_rating"])
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 0.6397024786433453


Woah! Dropped dramatically. 