# Libraries

In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import (RandomForestRegressor, ExtraTreesRegressor, 
                              AdaBoostRegressor, GradientBoostingRegressor)
from xgboost import XGBRegressor


# Fine tuning models
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit


# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


# Import
import pickle

# Data

In [2]:
df = pd.read_csv('sales_store_train_data.csv', index_col = 'Date', parse_dates = True)
df = df.drop(['Customers'], axis = 1)

In [3]:
# Separate target & features
y_train = np.log(df.Sales)

X_train = df.drop(['Sales'], axis = 1)
feat_names = X_train.columns

In [4]:
print(df.shape)
df.head()

(804056, 26)


Unnamed: 0_level_0,DayOfWeek,Sales,Promo,StateHoliday,SchoolHoliday,Year,Month,Week,Day,StoreType,...,Assortment_basic,Assortment_extended,Assortment_extra,PromoInterval_0,"PromoInterval_Jan,Apr,Jul,Oct","PromoInterval_Feb,May,Aug,Nov","PromoInterval_Mar,Jun,Sept,Dec",StateGDP,StateDensity,AvgWeaklyCusts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-06-19,5,7329,1,0,0,2015,6,25,19,348,...,0,1,0,1,0,0,0,70755,1743,6.604942
2015-06-19,5,9367,1,0,0,2015,6,25,19,148,...,0,1,0,1,0,0,0,22427,116,6.604942
2015-06-19,5,7856,1,0,0,2015,6,25,19,348,...,0,1,0,0,0,0,1,37509,287,6.604942
2015-06-19,5,8074,1,0,0,2015,6,25,19,602,...,1,0,0,0,1,0,0,22980,227,6.604942
2015-06-19,5,8626,1,0,0,2015,6,25,19,602,...,1,0,0,0,1,0,0,35443,178,6.604942


# Model

In [5]:
metrics = ['neg_root_mean_squared_error','neg_mean_absolute_error', 'r2']

In [6]:
models = []
scores = []

In [7]:
tscv = TimeSeriesSplit(n_splits=3)

In [8]:
%%time
classes = [DecisionTreeRegressor(random_state = 42),
              ExtraTreesRegressor(random_state = 42, n_estimators = 30),
              RandomForestRegressor(random_state = 42, n_estimators = 30),
              AdaBoostRegressor(random_state = 42, n_estimators = 30),
              GradientBoostingRegressor(random_state = 42, n_estimators = 30),
              XGBRegressor(random_state = 42, n_estimators = 30)
          ]


for model in classes:
    model.fit(X_train, y_train)
    
    cross_val = cross_validate(model, X_train, y_train, 
                               cv = tscv, scoring = metrics)

    models.append(f'{model.__class__.__name__}')
    scores.append(list(cross_val.values())[2:])

Wall time: 15min 44s


In [9]:
data = []
for i in range(len(models)):
    for j in range(len(metrics)):
        data.append(scores[i][j].mean())

data = np.reshape(data, (len(models), len(metrics)))
        
df_models = pd.DataFrame(data, index = models, columns = metrics)
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,neg_root_mean_squared_error,neg_mean_absolute_error,r2
DecisionTreeRegressor,-0.196637,-0.139703,0.787191
ExtraTreesRegressor,-0.161755,-0.116366,0.85595
RandomForestRegressor,-0.16051,-0.11565,0.858101
AdaBoostRegressor,-0.377376,-0.294779,0.216255
GradientBoostingRegressor,-0.35771,-0.277907,0.295693
XGBRegressor,-0.259303,-0.200037,0.629753


# Fine Tune

In [10]:
fine_tuned_models = []
new_models = []
new_scores = []

In [11]:
%%time

'''RandomForestRegressor
parameters: 
    n_estimators: number of trees in the forest; (default = 100)
                    generally random forests do not overfit as more trees are added, however, the increase on
                    performance does not necessarily justifies the increase on complexity (more time/resources)
                    
    max_depth: maximum depth of each tree in the forest; (default = None, all leaves are expanded as much as possible)
                    generally, the deeper the tree, the more splits, which can lead to higher performance, as it
                    allows the model to better fit the data (lower bias), however this can also lead to higher
                    variance as the model can start to overfit the training data
    
    min_samples_split: minimum number of samples required to split an internal node; (default = 2)
                    this can reduce the number of splits, which can reduce overfitting, however, if too large, it
                    can also lead to underfitting the training set
    
    min_samples_leaf: mininum number of samples that must be present in a leaf/terminal node (end of the tree); (default = 1)
                    as before, a higher value can reduce overfitting but a too high value can lead to underfitting.
                    Specially in regression, a higher value can also smooth the model, by avoiding leaves with only one value
                    
    max_features: number of features to consider for best split (deafault = 'auto', total number of features)
                    a higher number will lead to better performance, but a too high number will also lead to overfitting.
                    Moreover, the higher the number, the more computationally expensive the model becomes.
    
    max_samples: number (or %) of samples to draw from X to train each base estimator (default = None, total num of samples)
                     The lower the value, the more randomness is introduced to the data.
                     --Only possible if Bootstrap = True.
    '''

param_grid = {'max_depth':[32, 40, 48],
              'min_samples_split':[2, 3, 4],
              'min_samples_leaf':[1, 2, 3],
              'max_features':[16, 20, 24],
              'max_samples':[0.8, 0.9, 1.0]
             }

rf_gd = GridSearchCV(estimator=RandomForestRegressor(n_estimators=50, 
                                                       random_state=42, 
                                                       bootstrap=True,
                                                       n_jobs=-1),
                      param_grid=param_grid, 
                      cv=tscv, 
                      scoring=metrics,
                      refit='neg_mean_absolute_error',
                      error_score='raise', 
                      verbose=2)
                     
rf_gd.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=2; total time=   7.4s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=2; total time=  14.3s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=2; total time=  23.2s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=3; total time=   5.4s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=3; total time=  13.7s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=3; total time=  22.1s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=4; total time=   5.0s
[CV] END max_depth=32, max_features=16, max_samples=0.8, min_samples_leaf=1, min_samples_split=4; total time=  12.5s
[

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
             error_score='raise',
             estimator=RandomForestRegressor(n_estimators=50, n_jobs=-1,
                                             random_state=42),
             param_grid={'max_depth': [32, 40, 48],
                         'max_features': [16, 20, 24],
                         'max_samples': [0.8, 0.9, 1.0],
                         'min_samples_leaf': [1, 2, 3],
                         'min_samples_split': [2, 3, 4]},
             refit='neg_mean_absolute_error',
             scoring=['neg_root_mean_squared_error', 'neg_mean_absolute_error',
                      'r2'],
             verbose=2)

In [12]:
import pickle 
filename = 'RFRegressor.pkl'
pickle.dump(rf_gd.best_estimator_, open(filename, 'wb'))

In [13]:
fine_tuned_models.append(rf_gd.best_estimator_)
rf_gd.best_params_

{'max_depth': 48,
 'max_features': 24,
 'max_samples': 0.8,
 'min_samples_leaf': 2,
 'min_samples_split': 3}

### XGB Regressor

In [14]:
%%time
# https://xgboost.readthedocs.io/en/stable/python/python_api.html?highlight=xgboost%20xgbregressor#xgboost.XGBRegressor
'''XGBRegressor
parameters: 
    n_estimators: number of gradient boosted trees;
                    
    max_depth: maximum depth of each tree; (default = 6)
                    a higher value leads to a more complex model, more likely to overfit.
    
    learning_rate: boosting learning rate; (default = 0.3)
                    it shrinks the feature weights to make the boosting process more conservative. It prevents overfitting.
    
    subsample: subsample ratio if the training instances; (default = 1)
                    a lower value can help prevent overfitting.
   
    colsample_bytree: subsample ratio of columns (features) when constructing each tree; (default = 1)
                    similar to max_features in RandomForest/ExtraTrees
                    
    reg_alpha: L1 regularization term on weights; (default = 1)
                    increasing this value can prevent overfitting
                    
    reg_lambda: L2 regularization term on weights; (default = 0)
                    increasing this value can prevent overfitting
    '''

param_grid = {'max_depth':[20, 24],
              'learning_rate':[0.06, 0.08],
              'subsample':[0.8, 0.9],
              'colsample_bytree':[0.7, 0.8],
              'reg_alpha':[1.0, 1.3],
              'reg_lambda':[0.5, 0.8]}

xgb_gd = GridSearchCV(estimator = XGBRegressor(n_estimators = 50, random_state = 42, n_jobs = -1),
                      param_grid = param_grid, 
                      cv = tscv, 
                      scoring = metrics,
                      refit = 'neg_root_mean_squared_error',
                      error_score = 'raise', 
                      verbose = 2)
                     
xgb_gd.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.8; total time=   6.7s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.8; total time=  13.8s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.8; total time=  24.1s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.9; total time=   6.1s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.9; total time=  14.0s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.5, subsample=0.9; total time=  23.6s
[CV] END colsample_bytree=0.7, learning_rate=0.06, max_depth=20, reg_alpha=1.0, reg_lambda=0.8, subsample=0.8; total time=   6.3s
[CV] END colsample_bytree=0.

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
             error_score='raise',
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta...
                                    reg_lambda=None, scale_pos_weight=None,
                                    subsample=None, tree_method=None,
                                    validate_parameters=None, verbosity=None),
             param_grid={'colsample_bytree': [0.7, 0.8],
                         'learning_rate': [0.06, 0.08], 'max_depth': [

In [15]:
filename = 'XGBRegressor.pkl'
pickle.dump(xgb_gd.best_estimator_, open(filename, 'wb'))

In [16]:
fine_tuned_models.append(xgb_gd.best_estimator_)
xgb_gd.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.08,
 'max_depth': 24,
 'reg_alpha': 1.0,
 'reg_lambda': 0.5,
 'subsample': 0.9}

## Comparing models

In [17]:
%%time
for model in fine_tuned_models:
    cross_val = cross_validate(model, X_train, y_train, cv= tscv, scoring = metrics)

    new_models.append(f'{model.__class__.__name__}')
    new_scores.append(list(cross_val.values())[2:])
    print('')

data = []
for model in range(len(fine_tuned_models)):
    for score in range(len(metrics)):
        data.append(new_scores[model][score].mean())

data = np.reshape(data, (len(fine_tuned_models), len(metrics)))
        
df_models = pd.DataFrame(data, index = new_models, columns = metrics)



Wall time: 1min 56s


In [18]:
df_models.style.highlight_max(color = 'lightgreen', axis = 0)

Unnamed: 0,neg_root_mean_squared_error,neg_mean_absolute_error,r2
RandomForestRegressor,-0.15729,-0.113587,0.863776
XGBRegressor,-0.215799,-0.168316,0.743698
