In [34]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import re
from lib.preprocessing import *

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split


from sklearn.metrics import mean_absolute_error

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [4]:
riders = pd.read_csv("data/riders.csv")
sampsub = pd.read_csv("data/sample_submission.csv")
test = pd.read_csv("data/test.csv")
clean = load_sample(sql_db="data/cleaned_alex_emily_nelson.db")

In [5]:
clean.drop(["Vehicle Type"], axis=1, inplace=True)

In [6]:
def get_seconds_from_dt_series(series: pd.Series) -> pd.Series:
        return pd.to_datetime(series).dt.hour * 3600 + pd.to_datetime(series).dt.minute * 60 + pd.to_datetime(series).dt.second
    
clean["Placement - Time"]=get_seconds_from_dt_series(clean["Placement - Time"])
clean["Confirmation - Time"]=get_seconds_from_dt_series(clean["Confirmation - Time"])
clean["Arrival at Pickup - Time"]=get_seconds_from_dt_series(clean["Arrival at Pickup - Time"])
clean["Pickup - Time"]=get_seconds_from_dt_series(clean["Pickup - Time"])
clean_one = pd.get_dummies(clean["Personal or Business"])

In [26]:
# display result
clean = pd.concat((clean_one, clean), axis=1)
clean = clean.drop(["Personal or Business"], axis=1)
clean = clean.drop(["Personal"], axis=1)
clean = clean.rename(columns={"Business": "Personal or Business"})

In [43]:
train, test = train_test_split(clean, test_size=0.2, random_state=13)
model=Lasso()
train = train.drop(["Time from Pickup to Arrival"], axis=1)
model.fit(X=train, y=train["Time from Pickup to Arrival"],)

KeyError: 'Time from Pickup to Arrival'

In [27]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(clean[clean.columns[clean.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = clean.columns[clean.columns!='Time from Pickup to Arrival']) # save to dataframe

In [28]:
lasso = LassoCV(n_alphas=100, cv=2)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])
lasso.alpha_
lasso_best = Lasso(alpha=lasso.alpha_)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=2)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 553.0366755842736


In [29]:
lasso = LassoCV(n_alphas=100, cv=3)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=3)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 546.3732047710251


In [32]:
lasso = LassoCV(n_alphas=100, cv=4)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=4)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 546.669358319893


In [30]:
lasso = LassoCV(n_alphas=100, cv=5)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=5)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 543.1478376285393


In [31]:
lasso = LassoCV(n_alphas=100, cv=10)
lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)
cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=10)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 543.0976625645924


In [12]:
#fitting model
olsmod = LinearRegression()
olsmod.fit(X=X, y=clean["Time from Pickup to Arrival"])
pd.Series(olsmod.coef_, index=X.columns).sort_values()

Pickup - Time                    -1403.383288
Confirmation - Time              -1019.069379
Placement - Time                  -420.967003
orders                             -94.056081
Placement - Weekday (Mo = 1)       -57.367210
age                                -56.591129
Pickup Long                        -51.409263
average_rating                     -31.666084
Destination Lat                    -30.383823
Destination Long                   -21.500041
Placement - Day of Month           -13.157300
Fulfillment - Day of Month         -13.157300
rider_id                            -9.886796
Pickup Lat                          -5.289838
Temperature                         -3.888470
user_id                              1.435144
Personal or Business                 2.405252
Platform Type                        5.469233
order_no                            10.751105
Fulfillment - Weekday (Su = 0)      64.694787
number_rating                       74.222997
Distance (KM)                     

In [13]:
#recursive feature elimination
rfe = RFECV(estimator=olsmod, cv=100, scoring="neg_mean_squared_error")
rfe.fit(X=X, y=clean["Time from Pickup to Arrival"])
print('Selected', rfe.n_features_, 'features:')
X.columns[rfe.support_]

Selected 15 features:


Index(['Placement - Day of Month', 'Placement - Weekday (Mo = 1)',
       'Placement - Time', 'Confirmation - Time', 'Arrival at Pickup - Time',
       'Pickup - Time', 'Distance (KM)', 'Pickup Long', 'Destination Lat',
       'Destination Long', 'Fulfillment - Weekday (Su = 0)', 'orders', 'age',
       'average_rating', 'number_rating'],
      dtype='object')

In [14]:
#Ridge Regression
ridge=RidgeCV(alphas=[0.1, 1.0, 10.0], scoring="neg_mean_squared_error", cv=100)
ridge.fit(X=X, y=clean["Time from Pickup to Arrival"])

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=100,
        scoring='neg_mean_squared_error')

In [15]:
ridge.alpha_

0.1

In [16]:
pd.DataFrame({
    'ols':olsmod.coef_, 'ridge':ridge.coef_
}, index=X.columns)

Unnamed: 0,ols,ridge
Personal or Business,2.405252,1.989462
order_no,10.751105,10.659303
user_id,1.435144,1.463981
Platform Type,5.469233,5.459326
Placement - Day of Month,-13.1573,-13.1508
Placement - Weekday (Mo = 1),-57.36721,-57.20063
Placement - Time,-420.967003,-423.990054
Confirmation - Time,-1019.069379,-985.330813
Arrival at Pickup - Time,2866.718519,2808.834902
Pickup - Time,-1403.383288,-1376.258371


In [17]:
#lasso = LassoCV(n_alphas=100, cv=100)
#lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])

In [28]:
#lasso.alpha_

In [29]:
#pd.Series(lasso.coef_, index=X.columns)

In [30]:
#ridge_best = Ridge(alpha=ridge.alpha_)
#lasso_best = Lasso(alpha=lasso.alpha_)
#cv_ols = cross_val_score(estimator=olsmod, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_rfe = cross_val_score(estimator=olsmod, X=X[X.columns[rfe.support_]], y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_ridge = cross_val_score(estimator=ridge_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)

#print('OLS MAE:', -cv_ols.mean())
#print('RFE MAE:', -cv_rfe.mean())
#print('Ridge MAE:', -cv_ridge.mean())
#print('Lasso MAE:', -cv_lasso.mean())

# Data Demolition

How would our regressions be affected if we demolished some data?
### Oh I'm glad you asked!

In [17]:
demo = clean

In [25]:
demo.dtypes
demo.average_rating

0       14.6
1       14.3
2       14.4
3       14.4
4       13.8
        ... 
2115    12.7
2116    14.2
2117    14.3
2118    13.8
2119    14.2
Name: average_rating, Length: 2120, dtype: float64

In [20]:
demo=demo.drop(columns=["Temperature", "rider_id", "age"]) # dropped randomly selected columns

In [23]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MSE: 545.0075482121848


Ah! Our model is only damaged by an additional error of 0.12011698723! These columns must not have been very influential-- let's drop some more influential ones.

In [26]:
demo=clean
demo=demo.drop(columns=["Distance (KM)", "orders", "average_rating"]) # dropped randomly selected columns

In [27]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"]
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MSE:', -cv_lasso.mean())

Lasso MSE: 742.0418242996093


There we go-- an increase in error of 197.154393075. These must be greatly impactful.

What if we passed these through sin functions? 

In [40]:
demo = clean
sin: np.sin
demo=demo.apply(np.sin, columns=["Distance (KM)", "orders", "average_rating"])
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 0.6397024786433453


Woah! Dropped dramatically. 