# Data Pipeline

Outline of steps to take for bikeshare machine learning pipeline

In [1]:
import os 
import sys 

sys.path.append("/Users/benjamin/Repos/ddl/yellowbrick")
import numpy as np
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 

sns.set_context('notebook')
sns.set_style('whitegrid')

## Step 0: Data Load

In [30]:
data = pd.read_csv('data/day.csv')
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


## Step 1: y Selection
Select y either daily DC to DC total trips or casual trips 

In [31]:
y_cols = ['casual', 'registered', 'cnt']
y = data[y_cols[0]]

## Step 2: Feature Engineering

1. Define Features (final_db => Xraw): final_db (not September X 2017- March 31st, 2018) minus any obviously trip related features 
2. Split Data to Train and Test (Xraw => Xtrain, Xtest) 
3. Add Polynominal Features (Xtrain, Xtest => Xtrainpol, Xtestpol): Use only squared, not cubed etc because we have a lot of features
3. Split out binary from non-binary features (Xtrainpol, Xtestpol => Xtrainbin, Xtestbin, Xtraincon, Xtestcon): List comprehensions
4. Scale only non-binary features (Xtraincon, Xtestcon => Xtrainscl, Xtestscl): Use the standard scalar using the defaults
5. Concatenate binary and standarized non-binary features (Xtrainbin, Xtestbin, Xtrainscl, Xtestscl => Xptrain, Xptest)


In [34]:
# 1. Define Features (final_db => Xraw): final_db minus any obviously trip related features 
feature_cols = [col for col in data.columns if (col not in y_cols) & (col not in ['dteday'])]
Xraw = data[feature_cols]
Xraw.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,1,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446
1,2,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539
2,3,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309
3,4,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296
4,5,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869


In [42]:
# 2. Train Test Split (Xraw => Xtrain, Xtest): Split data 
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(Xraw, y, test_size=0.33)

Xtrain.head()


Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
178,179,3,0,6,0,2,1,1,0.744167,0.692558,0.634167,0.144283
233,234,3,0,8,0,1,1,1,0.691667,0.638254,0.47,0.27675
403,404,1,1,2,0,3,1,2,0.256667,0.266421,0.722917,0.133721
344,345,4,0,12,0,0,0,1,0.220833,0.253154,0.49,0.066542
59,60,1,0,3,0,2,1,1,0.266667,0.263879,0.535,0.216425


In [82]:
# 3. Add Polynominal Features (Xtrain, Xtest => Xtrainpol, Xtestpol): Use only squared, not cubed etc because we have a lot of features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(2, include_bias=False)
# poly fit transform will transform dataframe into an array of arrays, so tranform back
Xtrainpol = poly.fit_transform(Xtrain)
Xtestpol = poly.fit_transform(Xtest)
Xtrainpol_cols, Xtestpol_cols, = poly.get_feature_names(Xtrain.columns), poly.get_feature_names(Xtrain.columns)
Xtrainpol = pd.DataFrame(poly.fit_transform(Xtrain), columns=Xtrainpol_cols)
Xtestpol = pd.DataFrame(poly.fit_transform(Xtest), columns=Xtrainpol_cols)

print(Xtrainpol.head())

   instant  season   yr  mnth  holiday  weekday  workingday  weathersit  \
0    179.0     3.0  0.0   6.0      0.0      2.0         1.0         1.0   
1    234.0     3.0  0.0   8.0      0.0      1.0         1.0         1.0   
2    404.0     1.0  1.0   2.0      0.0      3.0         1.0         2.0   
3    345.0     4.0  0.0  12.0      0.0      0.0         0.0         1.0   
4     60.0     1.0  0.0   3.0      0.0      2.0         1.0         1.0   

       temp     atemp     ...         temp^2  temp atemp  temp hum  \
0  0.744167  0.692558     ...       0.553785    0.515379  0.471926   
1  0.691667  0.638254     ...       0.478403    0.441459  0.325083   
2  0.256667  0.266421     ...       0.065878    0.068381  0.185549   
3  0.220833  0.253154     ...       0.048767    0.055905  0.108208   
4  0.266667  0.263879     ...       0.071111    0.070368  0.142667   

   temp windspeed   atemp^2  atemp hum  atemp windspeed     hum^2  \
0        0.107371  0.479637   0.439197         0.099924  0.

In [84]:
# 4. Split out binary from non-binary features (Xtrainpol, Xtestpol => Xtrainbin, Xtestbin, Xtraincon, Xtestcon): List comprehensions
# Use only train data for define as test and train will be the same
binary_cols = [col for col in Xtrainpol.columns if Xtrainpol[col].nunique() == 2]
non_binary_cols = [col for col in Xtrainpol.columns if col not in binary_cols]
Xtrainbin, Xtestbin = Xtrainpol[binary_cols], Xtestpol[binary_cols]
Xtraincon, Xtestcon = Xtrainpol[non_binary_cols], Xtestpol[non_binary_cols]
Xtrainbin.head()

Unnamed: 0,yr,holiday,workingday,yr^2,yr holiday,yr workingday,holiday^2,workingday^2
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [87]:
# 5. Scale only non-binary features (Xtraincon, Xtestcon => Xtrainscl, Xtestscl): Use the standard scalar using the defaults
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
Xtrainsclfit = scaler.fit(Xtraincon)
Xtrainscl = pd.DataFrame(scaler.transform(Xtraincon), columns=Xtraincon.columns)
Xtestscl = pd.DataFrame(scaler.transform(Xtestcon), columns=Xtestcon.columns)

Xtrainscl.head()

Unnamed: 0,instant,season,mnth,weekday,weathersit,temp,atemp,hum,windspeed,instant^2,...,temp^2,temp atemp,temp hum,temp windspeed,atemp^2,atemp hum,atemp windspeed,hum^2,hum windspeed,windspeed^2
0,-0.897002,0.485449,-0.137932,-0.476825,-0.76078,1.352505,1.329207,0.032198,-0.613986,-0.925551,...,1.487577,1.477011,1.133932,0.328985,1.46136,1.087376,0.273517,-0.082719,-0.515463,-0.635267
1,-0.633645,0.485449,0.436099,-0.978259,-0.76078,1.067068,0.997958,-1.099073,1.103137,-0.781327,...,1.079115,1.040621,0.073049,2.172711,0.997072,-0.014935,2.08928,-1.072138,0.233537,0.985314
2,0.18037,-1.306132,-1.285993,0.02461,1.01075,-1.297978,-1.270184,0.643772,-0.750897,-0.092683,...,-1.156198,-1.161876,-0.935035,-1.273454,-1.164048,-0.865149,-1.248405,0.574688,-0.41508,-0.720586
3,-0.102141,1.38124,1.58416,-1.479694,-0.76078,-1.492804,-1.351112,-0.961254,-1.621718,-0.373264,...,-1.248915,-1.235533,-1.493793,-1.704006,-1.208333,-1.407957,-1.692941,-0.967338,-1.659054,-1.111501
4,-1.466812,-1.306132,-0.998978,-0.476825,-0.76078,-1.243609,-1.28569,-0.65116,0.321165,-1.106131,...,-1.127841,-1.150149,-1.244842,-0.760323,-1.172708,-1.272325,-0.739887,-0.715573,-0.043848,0.120853


In [90]:
# 6. Concatenate binary and standarized non-binary features (Xtrainbin, Xtestbin, Xtrainscl, Xtestscl => Xptrain, Xptest)
print(Xtrainscl.shape)
print(Xtrainbin.shape)
Xptrain = pd.concat([Xtrainscl, Xtrainbin], axis=1)
Xptest = pd.concat([Xtestscl, Xtestbin], axis=1)
print(Xptrain.shape, Xptest.shape)

(489, 82)
(489, 8)
(489, 90) (242, 90)


## Step 3: Feature Selection
Leverage LassoCV with fit_intercept=False (Xptrain => Xftrain) 
*Note: Other methods could potentially be used for feature selection such as stepwise regression

In [111]:
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel

clf = LassoCV(fit_intercept=False)
sfm = SelectFromModel(clf)
sfm.fit(Xptrain, ytrain)
Xftrain_array = sfm.transform(Xptrain)
mask = sfm.get_support()
selected_columns = Xptrain.columns[mask]
Xftrain = pd.DataFrame(Xftrain_array, columns=selected_columns)
Xftest = Xptest[selected_columns] 
Xftest.head()



Unnamed: 0,instant,season,mnth,weekday,temp,hum,instant yr,instant holiday,instant workingday,instant weathersit,...,weathersit temp,weathersit atemp,weathersit windspeed,temp^2,hum^2,windspeed^2,yr,workingday,yr^2,workingday^2
0,-1.399776,-1.306132,-0.998978,-0.476825,-0.967829,0.18025,-0.973576,-0.129756,-0.736323,-0.980715,...,-0.16872,-0.089678,0.553496,-0.96731,0.068541,-0.253107,0.0,1.0,0.0,1.0
1,-0.351134,1.38124,1.01013,0.526045,-0.106397,0.046552,-0.973576,-0.129756,0.169551,-0.599654,...,-0.623386,-0.614385,0.872967,-0.2863,-0.068275,3.941131,0.0,1.0,0.0,1.0
2,1.31999,1.38124,1.01013,0.02461,0.881306,1.13188,1.300397,-0.129756,1.613158,2.004704,...,1.77701,1.64639,-0.848815,0.829334,1.161077,-1.111486,1.0,1.0,1.0,1.0
3,1.554618,1.38124,1.297145,0.02461,-0.772416,-0.108495,1.473956,-0.129756,1.815842,0.446294,...,-0.973791,-0.928388,-1.026514,-0.836683,-0.22179,-0.930422,1.0,1.0,1.0,1.0
4,-1.007134,-0.410342,-0.137932,-1.479694,0.831466,0.15853,-0.973576,-0.129756,-1.042418,-0.549722,...,1.724567,1.755582,0.010294,0.76447,0.046034,-0.681239,0.0,0.0,0.0,0.0


## Step 4: Model Fit


In [108]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score 

In [122]:
from sklearn.linear_model import LinearRegression 

model = LinearRegression() 
model.fit(Xptrain, ytrain)

yhat = model.predict(Xptest)

r2 = r2_score(ytest, yhat)
me = mse(ytest, yhat)

print("r2={:0.3f} MSE={:0.3f}".format(r2,me))

r2=0.793 MSE=93982.294


In [119]:
# L2 and L1 Regularization 
alphas = np.logspace(-10, 0, 200)

In [120]:
from sklearn.linear_model import RidgeCV 

model = RidgeCV(alphas=alphas) 
model.fit(Xftrain, ytrain)

yhat = model.predict(Xftest)

r2 = r2_score(ytest, yhat)
me = mse(ytest, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.alpha_))

r2=0.829 MSE=77748.471 alpha=1.000


In [123]:
model = RidgeCV(alphas=alphas) 
model.fit(Xptrain, ytrain)

yhat = model.predict(Xptest)

r2 = r2_score(ytest, yhat)
me = mse(ytest, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.alpha_))

r2=0.827 MSE=78369.526 alpha=1.000


In [125]:
from sklearn.linear_model import LassoCV 
model = LassoCV(alphas=alphas) 
model.fit(Xptrain, ytrain)

yhat = model.predict(Xptest)

r2 = r2_score(ytest, yhat)
me = mse(ytest, yhat)

print("r2={:0.3f} MSE={:0.3f} alpha={:0.3f}".format(r2,me, model.alpha_))



r2=0.837 MSE=74236.814 alpha=1.000




## Step 5: Impact Analysis
