# ML Lab
In this notebook, I use the UCI bikeshare data to demonstrate preparing data and creating pipelines for machine learning.

The focus here is on fitting and scoring models.

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style='darkgrid', palette='colorblind')
pd.options.display.max_rows = None
pd.options.display.max_columns = None

	- instant: record index
	- dteday : date
	- season : season (1:springer, 2:summer, 3:fall, 4:winter)
	- yr : year (0: 2011, 1:2012)
	- mnth : month ( 1 to 12)
	- hr : hour (0 to 23)
	- holiday : weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
	- weekday : day of the week
	- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
	+ weathersit : 
		- 1: Clear, Few clouds, Partly cloudy, Partly cloudy
		- 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist
		- 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds
		- 4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog
	- temp : Normalized temperature in Celsius. The values are divided to 41 (max)
	- atemp: Normalized feeling temperature in Celsius. The values are divided to 50 (max)
	- hum: Normalized humidity. The values are divided to 100 (max)
	- windspeed: Normalized wind speed. The values are divided to 67 (max)
	- casual: count of casual users
	- registered: count of registered users
	- cnt: count of total rental bikes including both casual and registered

# 0. Load data

In [2]:
# What does our data look like?
df = pd.read_csv('~/Desktop/py/data/day.csv')
df.shape

(731, 16)

In [3]:
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
instant,731.0,366.0,211.165812,1.0,183.5,366.0,548.5,731.0
season,731.0,2.49658,1.110807,1.0,2.0,3.0,3.0,4.0
yr,731.0,0.500684,0.500342,0.0,0.0,1.0,1.0,1.0
mnth,731.0,6.519836,3.451913,1.0,4.0,7.0,10.0,12.0
holiday,731.0,0.028728,0.167155,0.0,0.0,0.0,0.0,1.0
weekday,731.0,2.997264,2.004787,0.0,1.0,3.0,5.0,6.0
workingday,731.0,0.683995,0.465233,0.0,0.0,1.0,1.0,1.0
weathersit,731.0,1.395349,0.544894,1.0,1.0,1.0,2.0,3.0
temp,731.0,0.495385,0.183051,0.05913,0.337083,0.498333,0.655417,0.861667
atemp,731.0,0.474354,0.162961,0.07907,0.337842,0.486733,0.608602,0.840896


In [4]:
df.corr()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,1.0,0.412224,0.866025,0.496702,0.016145,-1.6e-05,-0.004337,-0.021477,0.15058,0.152638,0.016375,-0.11262,0.275255,0.659623,0.62883
season,0.412224,1.0,-0.001844,0.83144,-0.010537,-0.00308,0.012485,0.019211,0.334315,0.342876,0.205445,-0.229046,0.210399,0.411623,0.4061
yr,0.866025,-0.001844,1.0,-0.001792,0.007954,-0.005461,-0.002013,-0.048727,0.047604,0.046106,-0.110651,-0.011817,0.248546,0.594248,0.56671
mnth,0.496702,0.83144,-0.001792,1.0,0.019191,0.009509,-0.005901,0.043528,0.220205,0.227459,0.222204,-0.207502,0.123006,0.293488,0.279977
holiday,0.016145,-0.010537,0.007954,0.019191,1.0,-0.10196,-0.253023,-0.034627,-0.028556,-0.032507,-0.015937,0.006292,0.054274,-0.108745,-0.068348
weekday,-1.6e-05,-0.00308,-0.005461,0.009509,-0.10196,1.0,0.03579,0.031087,-0.00017,-0.007537,-0.052232,0.014282,0.059923,0.057367,0.067443
workingday,-0.004337,0.012485,-0.002013,-0.005901,-0.253023,0.03579,1.0,0.0612,0.05266,0.052182,0.024327,-0.018796,-0.518044,0.303907,0.061156
weathersit,-0.021477,0.019211,-0.048727,0.043528,-0.034627,0.031087,0.0612,1.0,-0.120602,-0.121583,0.591045,0.039511,-0.247353,-0.260388,-0.297391
temp,0.15058,0.334315,0.047604,0.220205,-0.028556,-0.00017,0.05266,-0.120602,1.0,0.991702,0.126963,-0.157944,0.543285,0.540012,0.627494
atemp,0.152638,0.342876,0.046106,0.227459,-0.032507,-0.007537,0.052182,-0.121583,0.991702,1.0,0.139988,-0.183643,0.543864,0.544192,0.631066


Since we want to focus on total rides ('cnt'), we'll want to drop casual and registered.

We can also drop the datetime variable 'dteday' and keep 'instant' as a quasi-time variable (increments by 1 every day).

We may also want to drop one of temp and atemp since they are so highly correlated (r = 0.991). Atemp is probably more important since what the temperature 'feels like' is more likely than the actual temperature to affect a person's decision whether or not to bike.

# 1. Select our y variable
We are interested in total rides, so we choose 'cnt'.

In [5]:
y_cols = ['casual', 'registered', 'cnt']
y = df[y_cols[-1]]

# 2. Select our X for further processing
Drop unneeded variables

In [6]:
feature_cols = [col for col in df.columns if (col not in y_cols) & (col not in ['dteday', 'temp'])]
Xraw = df[feature_cols]
Xraw.shape

(731, 11)

# 3. Train-test-split

In [7]:
from sklearn.model_selection import train_test_split as tts

Xtrain, Xtest, ytrain, ytest = tts(Xraw, y, test_size=0.33)

Xtrain.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,atemp,hum,windspeed
49,50,1,0,2,0,6,0,1,0.391404,0.187917,0.507463
177,178,3,0,6,0,1,1,2,0.637004,0.658333,0.107588
322,323,4,0,11,0,6,0,1,0.324483,0.502083,0.224496
372,373,1,1,1,0,0,0,1,0.340258,0.465,0.191542
94,95,2,0,4,0,2,1,2,0.39835,0.642083,0.388067


# 4. Preprocessing pipeline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

class DenseTransformer(TransformerMixin):
    '''Custom class to deal with incompatibilities between 
    preprocessing pipeline stages.
    OneHotEncoder outputs a sparse matrix and the other
    steps need a dense matrix as input.
    '''
    def transform(self, X, y=None, **fit_params):
        return X.todense()
    
    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)
    
    def fit(self, X, y=None, **fit_params):
        return self

''' Not 100% sure this pipeline is kosher.
OneHotEncoder creates dummies but PolynomialFeatures creates more dummies,
some of which are redundant (year squared for example).
In theory, using Lasso/ElasticNet should take care of these, but does it? 
And should I manually drop them if I'm not using Lasso/ElasticNet?
'''

categorical_features = np.array([1, 3, 5, 7])

processpipe = Pipeline([
    ('ohe', OneHotEncoder(categorical_features=categorical_features)),
    ('to_dense', DenseTransformer()),
    ('poly', PolynomialFeatures(2, include_bias=False)),
    ('scaler', StandardScaler()),
])

# Run train and test X matrices through pipeline
'''Xtrain_scaled = processpipe.fit_transform(Xtrain)
Xtest_scaled = processpipe.transform(Xtest)

# Get all of the feature names created by poly
Xtrain_cols = processpipe.named_steps['poly'].get_feature_names(Xtrain.columns)
Xtest_cols = processpipe.named_steps['poly'].get_feature_names(Xtest.columns)

Xtrain = pd.DataFrame(Xtrain_scaled, columns=Xtrain_cols)
Xtest = pd.DataFrame(Xtest_scaled, columns=Xtest_cols)'''

# Temporary workaround because get_feature_names throws an IndexError. 
# Doesn't matter too much here but should fix in future.
Xtrain = processpipe.fit_transform(Xtrain)
Xtest = processpipe.transform(Xtest)

print(Xtrain.shape, Xtest.shape)

(489, 594) (242, 594)


# 5. Model fitting
Now that we have interaction/squared terms and scaled variables, we can fit and score models.

In [9]:
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import median_absolute_error as medae
from sklearn.metrics import explained_variance_score as evs
from sklearn.metrics import r2_score

In [10]:
def score_model(model, alpha=False):
    ''' 
    This function fits a model using the training set, predicts using the test set, and then calculates 
    and reports goodness of fit metrics and alpha if specified and available.
    
    All of the model parameters are also reported, which I find extremely useful.
    
    I wanted to include all of the available regression metrics to see how they compare and comove.
    I ran into an ValueError when trying to include MSLE (mean squared log error). 
    Could be related to ln0 being undefined?
    '''
    model.fit(Xtrain, ytrain)
    yhat = model.predict(Xtest)
    r2 = r2_score(ytest, yhat)
    me = mse(ytest, yhat)
    ae = mae(ytest, yhat)
    mede = medae(ytest, yhat)
    ev = evs(ytest, yhat)
    
    if alpha == True:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \
              \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f} \nalpha={:0.3f}".format(model, r2, me, 
                                                                                  ae, mede, ev, model.alpha_))
    else:
        print("Results from {}: \nr2={:0.3f} \nMSE={:0.3f} \
              \nMAE={:0.3f} \nMEDAE={:0.3f} \nEVS={:0.3f}".format(model, r2, me, ae, mede, ev))

Briefly: 

* R^2 ranges from 0 to 1. Variance in y explained by variance in X.
* Mean squared error is relative. Expected value of the squared error (L2).
* Mean absolute error is relative. L1 norm loss.
* Median absolute error is also relative but is robust to outliers. Does not support multioutput while many of the others do.
* Explained variance score ranges from 0 to 1. Similar to R^2.

In [11]:
'''OLS'''
from sklearn.linear_model import LinearRegression

score_model(LinearRegression())

Results from LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False): 
r2=-5580736537244458073194496.000 
MSE=21782447978565512238703361130496.000               
MAE=975088987255422.750 
MEDAE=489.410 
EVS=-5578848272397284065411072.000


OLS barfs, which is to be expected. Most likely a result of the OneHotEncoding + PolynomialFeatures blowing up p.
Since p is large, regularization is a good way to go. Same goes for SVM or RF which deal well with large p.

In [12]:
'''Elastic Net'''
from sklearn.linear_model import ElasticNetCV

# Alphas to search over
alphas = np.logspace(0, 100, 200)

# Suggested l1_ratio from docs
l1_ratio = [.1, .5, .7, .9, .95, .99, 1]

en = ElasticNetCV(l1_ratio=l1_ratio, alphas=alphas, fit_intercept=True, normalize=False)

score_model(en, alpha=True)
print("L1 ratio=",en.l1_ratio_)



Results from ElasticNetCV(alphas=array([1.00000e+000, 3.18063e+000, ..., 3.14404e+099, 1.00000e+100]),
       copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0): 
r2=0.860 
MSE=548227.769               
MAE=541.281 
MEDAE=417.140 
EVS=0.861 
alpha=32.176
L1 ratio= 1.0


ElasticNet works.
L1 ratio = 1 implies Lasso, so let's see what happens when we try Lasso and Ridge separately.

In [13]:
'''Lasso'''
from sklearn.linear_model import LassoCV

alphas = np.logspace(0, 100, 200)

lasso = LassoCV(alphas=alphas, fit_intercept=True, normalize=False)
score_model(lasso, alpha=True)



Results from LassoCV(alphas=array([1.00000e+000, 3.18063e+000, ..., 3.14404e+099, 1.00000e+100]),
    copy_X=True, cv=None, eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False): 
r2=0.860 
MSE=548227.769               
MAE=541.281 
MEDAE=417.140 
EVS=0.861 
alpha=32.176


As expected, the results are identical to ElasticNet with the chosen l1_ratio_ of 1.0.

In [14]:
'''Ridge'''
from sklearn.linear_model import RidgeCV

alphas = np.logspace(0, 100, 200)

rr = RidgeCV(alphas=alphas, fit_intercept=True, normalize=False)

score_model(rr, alpha=True)

Results from RidgeCV(alphas=array([1.00000e+000, 3.18063e+000, ..., 3.14404e+099, 1.00000e+100]),
    cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=False): 
r2=0.802 
MSE=770877.358               
MAE=604.524 
MEDAE=436.583 
EVS=0.808 
alpha=1.000


As expected, Ridge doesn't perform as well as Lasso.

In the following cells, I go over more regression models, usually using default hyperparameters since I don't understand the concepts as well as I do with regularization methods. 

Some of them are surprisingly high-performing even when untuned.
I've ordered them from worst to best performing. 

In [15]:
'''K Neighbors Regressor'''
from sklearn.neighbors import KNeighborsRegressor

kn = KNeighborsRegressor()
score_model(kn)

Results from KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'): 
r2=0.448 
MSE=2153181.982               
MAE=1193.931 
MEDAE=1073.600 
EVS=0.455


In [18]:
'''AdaBoost Regressor'''
from sklearn.ensemble import AdaBoostRegressor

ad = AdaBoostRegressor() 
score_model(ad)

Results from AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None): 
r2=0.823 
MSE=689925.541               
MAE=668.982 
MEDAE=592.138 
EVS=0.832


In [16]:
'''Support Vector Regression'''
from sklearn.svm import SVR

sv = SVR(kernel='linear')
score_model(sv)

Results from SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False): 
r2=0.844 
MSE=607382.002               
MAE=590.320 
MEDAE=464.988 
EVS=0.844


In [17]:
'''Bayesian Ridge'''
from sklearn.linear_model import BayesianRidge

br = BayesianRidge() 
score_model(br)

Results from BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False): 
r2=0.850 
MSE=585927.698               
MAE=564.858 
MEDAE=414.605 
EVS=0.852


In [19]:
'''RF'''
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor() 
score_model(rf)

Results from RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False): 
r2=0.877 
MSE=479600.875               
MAE=510.957 
MEDAE=396.400 
EVS=0.878
