In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
#  load a bike share toy dataset we will use for building these classifiers
data = pd.read_csv('bike_sharing_daily.csv')
print(data.dtypes)

desc_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum',
                 'windspeed', 'cnt']
data = data[desc_features]

instant         int64
dteday         object
season          int64
yr              int64
mnth            int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
casual          int64
registered      int64
cnt             int64
dtype: object


In [3]:
data.isnull().sum()

season        0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

In [8]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# two-step numeric imputer (using Simple mean imputing) and standard scaling)
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')),
                                      ('scaler', StandardScaler())
                                     ]
                              )

# two-step categorical imputer (using Simple imputing and ordinal encoder)
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant'))
                                          ,('encoder', OrdinalEncoder())
                                         ]
                                  )

In [9]:
# hard-coded numerical and categorical features
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

# create a preprocessing tranformer 
# ColumnTransformer will allow different features to be transformed separately 
# and the features generated by each transformer will be concatenated to form a single feature space. 
preprocessor = ColumnTransformer(transformers=[('numeric', numeric_transformer, numeric_features),
                                               ('categorical', categorical_transformer, categorical_features)
                                              ]
                                ) 
# the above preprocessor transformer will apply numeric_transformer to numeric_features 
# and categorical_transformer to categorical_features. These are pipelines defined earlier.

In [10]:
from sklearn.ensemble import RandomForestRegressor

# now we build a preprocessing+training pipeline to use for our analytics task.
# this pipeline will apply the preprocessor transformer for numeric and categorical features separately
# then it will use a random forest regressor initialized.
reg_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', RandomForestRegressor(n_estimators=20, max_depth=3))
                          ]
                   )

In [11]:
from sklearn.model_selection import train_test_split

# prepare training and testing data, cnt (count) is the target variable for the regression problem
X = data.drop('cnt',axis=1)
y = data['cnt']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [12]:
rf_pipe_model = reg_pipeline.fit(X_train, y_train)
print (rf_pipe_model)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                           

In [13]:
from sklearn.metrics import r2_score

# we can now test the pipelined random forest model we created with the test data
predictions = rf_pipe_model.predict(X_test)

# and check the explained variance
print("R2 score:{:.2f}".format(r2_score(y_test, predictions)))

R2 score:0.56


In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

# next we will test different regressors for the same task.
# we will use linear regression, elastic net, grad boosting and support vector machine regressors

regressors = [LinearRegression(), 
              ElasticNet(),
              GradientBoostingRegressor(),
              SVR()
             ]

# for each regressor, use the same preprocessor transformer and add the regressor to the custom pipe
for reg_ in regressors:
    
    reg_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor',reg_)
                              ]
                       )
    # train and test
    model = reg_pipe.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    print(reg_)
    print('Model R2 score:{:2f}\n'.format(r2_score(predictions, y_test)))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Model R2 score:-0.049862

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
Model R2 score:-0.823941

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbo

In [15]:
# Saving the model
import joblib

# using joblib library, you can pickle (save) your models [or data/settings]
# we will save our piped random forest model as it provided the best performance
joblib.dump(rf_pipe_model, './rf_reg_model.pkl')
# the pickled model will be stored in the same directory

['./rf_reg_model.pkl']

In [16]:
# you can load it back 
loaded_rf_model = joblib.load('./rf_reg_model.pkl')

new_data = X_test

new_prediction = loaded_rf_model.predict(new_data)
new_prediction

array([5628.00394627, 5850.78313231, 2024.44802212, 1913.27250348,
       3316.31885362, 3177.07212479, 5212.97002087, 5090.76857632,
       3869.86437687, 1913.27250348, 1852.18294955, 3105.93356654,
       3467.04658337, 5629.94504692, 5884.85642459, 5631.22977874,
       5732.73308504, 4182.00425613, 2817.21316038, 3723.02417573,
       6133.04678214, 2101.11325604, 4177.56720168, 5631.22977874,
       1774.01628289, 5735.82411949, 5826.45491919, 6133.04678214,
       5701.79446921, 5710.10126476, 1852.18294955, 5989.9434514 ,
       4773.90962297, 5667.90311117, 3467.04658337, 3165.57450676,
       5529.34538793, 5832.28079182, 3467.04658337, 4289.2962131 ,
       5315.57443625, 3467.04658337, 5626.98518664, 3467.04658337,
       5757.53523032, 5998.90400048, 2524.0892269 , 5297.96539595,
       2437.93635618, 2423.56214493, 5554.4075923 , 5715.11523817,
       5647.12602367, 5628.00394627, 5340.16354955, 4731.71146937,
       4065.3648451 , 5090.76857632, 5850.78313231, 2437.93635

In [17]:
# see the loaded rf pipeline
loaded_rf_model

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                           