In [37]:
# import all libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import re
from lib.preprocessing import *

import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import LeaveOneOut

from sklearn.metrics import mean_absolute_error

import warnings # supress warnings
warnings.filterwarnings('ignore')

In [38]:
riders = pd.read_csv("data/riders.csv")
sampsub = pd.read_csv("data/sample_submission.csv")
test = pd.read_csv("data/test.csv")
clean = load_sample(sql_db="data/cleaned_alex_emily_nelson.db")

In [39]:
clean.drop(["Vehicle Type"], axis=1, inplace=True)

In [40]:
def get_seconds_from_dt_series(series: pd.Series) -> pd.Series:
        return pd.to_datetime(series).dt.hour * 3600 + pd.to_datetime(series).dt.minute * 60 + pd.to_datetime(series).dt.second
    
clean["Placement - Time"]=get_seconds_from_dt_series(clean["Placement - Time"])
clean["Confirmation - Time"]=get_seconds_from_dt_series(clean["Confirmation - Time"])
clean["Arrival at Pickup - Time"]=get_seconds_from_dt_series(clean["Arrival at Pickup - Time"])
clean["Pickup - Time"]=get_seconds_from_dt_series(clean["Pickup - Time"])
clean_one = pd.get_dummies(clean["Personal or Business"])

In [41]:
# display result
clean = pd.concat((clean_one, clean), axis=1)
clean = clean.drop(["Personal or Business"], axis=1)
clean = clean.drop(["Personal"], axis=1)
clean = clean.rename(columns={"Business": "Personal or Business"})

In [47]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(clean[clean.columns[clean.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = clean.columns[clean.columns!='Time from Pickup to Arrival']) # save to dataframe

In [48]:
def get_model():
    model=Lasso()
    return model

def eval_model(cv):
    model=get_model()
    scores=cross_val_score(model, X, clean["Time from Pickup to Arrival"], scoring="accuracy", cv=cv, n_jobs=-1)
    return np.mean(scores), scores.min(), scores.max()
ideal, _, _ = eval_model(LeaveOneOut())
print('Ideal: %.3f' % ideal)

Ideal: nan


nan

In [22]:
# fitting model
#olsmod = LinearRegression()
#olsmod.fit(X=X, y=clean["Time from Pickup to Arrival"])
#pd.Series(olsmod.coef_, index=X.columns).sort_values()

In [23]:
# recursive feature elimination
#rfe = RFECV(estimator=olsmod, cv=100, scoring="neg_mean_squared_error")
#rfe.fit(X=X, y=clean["Time from Pickup to Arrival"])
#print('Selected', rfe.n_features_, 'features:')
#X.columns[rfe.support_]

In [24]:
# Ridge Regression
#ridge=RidgeCV(alphas=[0.1, 1.0, 10.0], scoring="neg_mean_squared_error", cv=100)
#ridge.fit(X=X, y=clean["Time from Pickup to Arrival"])

In [25]:
#ridge.alpha_

In [26]:
#pd.DataFrame({
#    'ols':olsmod.coef_, 'ridge':ridge.coef_
#}, index=X.columns)

In [27]:
#lasso = LassoCV(n_alphas=100, cv=100)
#lasso.fit(X=X, y=clean["Time from Pickup to Arrival"])

In [28]:
#lasso.alpha_

In [29]:
#pd.Series(lasso.coef_, index=X.columns)

In [30]:
#ridge_best = Ridge(alpha=ridge.alpha_)
#lasso_best = Lasso(alpha=lasso.alpha_)
#cv_ols = cross_val_score(estimator=olsmod, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_rfe = cross_val_score(estimator=olsmod, X=X[X.columns[rfe.support_]], y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_ridge = cross_val_score(estimator=ridge_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
#cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=clean["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)

#print('OLS MAE:', -cv_ols.mean())
#print('RFE MAE:', -cv_rfe.mean())
#print('Ridge MAE:', -cv_ridge.mean())
#print('Lasso MAE:', -cv_lasso.mean())

# Data Demolition

How would our regressions be affected if we demolished some data?
### Oh I'm glad you asked!

In [17]:
demo = clean

In [25]:
demo.dtypes
demo.average_rating

0       14.6
1       14.3
2       14.4
3       14.4
4       13.8
        ... 
2115    12.7
2116    14.2
2117    14.3
2118    13.8
2119    14.2
Name: average_rating, Length: 2120, dtype: float64

In [20]:
demo=demo.drop(columns=["Temperature", "rider_id", "age"]) # dropped randomly selected columns

In [23]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MSE: 545.0075482121848


Ah! Our model is only damaged by an additional error of 0.12011698723! These columns must not have been very influential-- let's drop some more influential ones.

In [26]:
demo=clean
demo=demo.drop(columns=["Distance (KM)", "orders", "average_rating"]) # dropped randomly selected columns

In [27]:
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"]
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MSE:', -cv_lasso.mean())

Lasso MSE: 742.0418242996093


There we go-- an increase in error of 197.154393075. These must be greatly impactful.

What if we passed these through sin functions? 

In [40]:
demo = clean
sin: np.sin
demo=demo.apply(np.sin, columns=["Distance (KM)", "orders", "average_rating"])
# Scaling data
scaler = StandardScaler()
X = scaler.fit_transform(demo[demo.columns[demo.columns!='Time from Pickup to Arrival']]) # all columns except time from pickup to arrival
X = pd.DataFrame(X, columns = demo.columns[demo.columns!='Time from Pickup to Arrival']) # save to dataframe
lasso = LassoCV(n_alphas=100, cv=100)
lasso.fit(X=X, y=demo["Time from Pickup to Arrival"])
lasso_best = Lasso(alpha=lasso.alpha_)

cv_lasso = cross_val_score(estimator=lasso_best, X=X, y=demo["Time from Pickup to Arrival"], scoring="neg_mean_absolute_error", cv=100)
print('Lasso MAE:', -cv_lasso.mean())

Lasso MAE: 0.6397024786433453


Woah! Dropped dramatically. 