## Sklearn Random Forest Regressor 
### We will perform a grid search to find the optimal hyperparameters for the Random Forest Regressor model used to predict sales. Hyperparameter tuning is based on the MSE value obtained.

In [1]:
#Importing relevant libraries
from data import train
from sklearn.svm import SVR
import numpy as np
from multiprocessing import cpu_count
from utils import StandardizedGridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split

In [2]:
# Split the training data into the predictors (every column except 'sales')
# and the target ('sales').
X = train.drop(columns='sales')
y = train['sales']

### We will first fit the default Random Forest Regressor model to the training data to see its MSE value before conducting the grid search for the optimal hyperparameters

In [3]:
# Hold out 20% of the rows as a test set for evaluating the default
# Random Forest Regressor model; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=20,
)

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: Random Forest with the library-default hyperparameters, fitted on
# the training split. Training a forest is randomized, so the seed is fixed to
# keep the baseline MSE reproducible across "Restart Kernel -> Run All".
RF_default = RandomForestRegressor(random_state=20)
RF_default.fit(X_train, y_train)

RandomForestRegressor()

In [5]:
# Test-set MSE of the default Random Forest Regressor model.
# Uses sklearn's mean_squared_error (imported as MSE) rather than a hand-rolled
# formula, so mismatched input lengths raise an error instead of passing silently.
RF_default_MSE = MSE(y_test, RF_default.predict(X_test))

RF_default_MSE

2497.918238345863

### We will now conduct GridSearchCV to see if a better model can be obtained

In [7]:
# Retrieve the saved grid-search object for the Random Forest and Bagging
# Regressor, run 1.
# NOTE(review): the '.p' extension suggests a pickle file -- confirm, and only
# load model files that come from a trusted source.
RF_run1 = StandardizedGridSearchCV.load('models/RF3.p')
RF_run1

GridSearchCV(cv=10, estimator=RandomForestRegressor(n_estimators=50), n_jobs=-1,
             param_grid={'ccp_alpha': array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 , 0.11,
       0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21, 0.22,
       0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32, 0.33,
       0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43, 0.44,
       0.45, 0.46, 0.47, 0.48, 0.49,...
       0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65, 0.66,
       0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76, 0.77,
       0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87, 0.88,
       0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99,
       1.  ]),
                         'criterion': ['mse'], 'max_depth': [10],
                         'max_features': (None, 'auto')},
             refit='neg_mean_squared_error', return_train_score=True,
             scoring=['neg_mean_squared_error', 

In [8]:
# First 10 rows of the run 1 results table (Random Forest and Bagging Regressor)
RF_run1.results.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,split4_test_neg_mean_squared_error,split5_test_neg_mean_squared_error,...,split2_train_r2,split3_train_r2,split4_train_r2,split5_train_r2,split6_train_r2,split7_train_r2,split8_train_r2,split9_train_r2,mean_train_r2,std_train_r2
ccp_alpha,criterion,max_depth,max_features,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
0.4,mse,10,auto,2.899884,0.190939,0.022561,0.002941,-755.626536,-1130.759937,-1694.508315,-1775.814405,-2172.628731,-1216.508806,...,0.960095,0.966893,0.957276,0.962863,0.953173,0.961897,0.966917,0.957073,0.960333,0.004164
0.26,mse,10,auto,2.864082,0.1307,0.028877,0.008176,-677.526201,-1104.905571,-1531.601132,-1823.970063,-1864.418735,-1135.12254,...,0.959303,0.957509,0.965415,0.963031,0.954947,0.955528,0.965571,0.950377,0.95898,0.004574
0.37,mse,10,auto,2.508874,0.259479,0.026801,0.012177,-780.85727,-1131.299373,-1545.118862,-1647.605753,-2340.118346,-1267.582515,...,0.950897,0.960197,0.963135,0.956457,0.952452,0.961652,0.96208,0.952311,0.957269,0.004271
0.09,mse,10,,2.408103,0.255367,0.023288,0.007993,-685.879815,-1247.368889,-1602.747915,-1816.489336,-2402.069959,-1168.383123,...,0.962631,0.9625,0.959854,0.953452,0.954563,0.956357,0.964922,0.961207,0.959815,0.004122
0.41,mse,10,auto,2.863,0.166757,0.022919,0.003531,-754.359279,-1195.760152,-1644.272291,-1679.493063,-2186.962712,-1240.085854,...,0.961984,0.954638,0.958405,0.958572,0.961221,0.961501,0.969827,0.949502,0.959334,0.005739
0.75,mse,10,,2.790311,0.155244,0.027522,0.011107,-691.196682,-1326.688986,-2246.760563,-1614.182492,-2031.203491,-1213.9811,...,0.961767,0.958783,0.96461,0.9576,0.952811,0.960372,0.964812,0.955528,0.959116,0.003725
0.16,mse,10,,2.820084,0.149843,0.024837,0.004359,-774.39289,-1150.255681,-1123.755966,-1666.610928,-2227.564881,-1144.760464,...,0.960626,0.960278,0.960421,0.961381,0.954694,0.96459,0.967074,0.962144,0.961276,0.003027
0.82,mse,10,auto,2.568868,0.14242,0.019984,0.007035,-708.267609,-1175.909523,-2141.931342,-1694.730043,-2768.783052,-1212.592417,...,0.958538,0.96596,0.961876,0.96384,0.959715,0.958723,0.963014,0.958853,0.960296,0.003123
0.34,mse,10,auto,2.511113,0.195342,0.030569,0.015252,-697.633161,-1077.643726,-1693.960023,-1688.088708,-2824.018827,-1200.085094,...,0.958557,0.961148,0.962986,0.96099,0.954738,0.964273,0.967007,0.952169,0.959687,0.004791
0.21,mse,10,,2.914651,0.204543,0.022568,0.005315,-729.137507,-1086.686377,-1887.835085,-1542.379802,-2225.723423,-1465.426693,...,0.961113,0.96239,0.961133,0.963046,0.954713,0.953662,0.962572,0.950453,0.957355,0.004958


In [9]:
# Mean cross-validated test score (negative MSE) for the first 10 models of
# run 1 (Random Forest and Bagging Regressor)
RF_run1.results['mean_test_neg_mean_squared_error'].head(n=10)

ccp_alpha  criterion  max_depth  max_features
0.40       mse        10         auto           -2367.260821
0.26       mse        10         auto           -2369.330841
0.37       mse        10         auto           -2394.247759
0.09       mse        10         NaN            -2411.766285
0.41       mse        10         auto           -2411.942395
0.75       mse        10         NaN            -2416.251007
0.16       mse        10         NaN            -2422.690418
0.82       mse        10         auto           -2435.520573
0.34       mse        10         auto           -2437.530603
0.21       mse        10         NaN            -2440.069211
Name: mean_test_neg_mean_squared_error, dtype: float64

In [10]:
# Aggregate the run 1 train and test MSE scores per ccp_alpha value to see
# whether ccp_alpha has a significant effect on MSE. The stored scores are
# negative MSE, so they are negated before averaging.
mse_columns = [
    'mean_train_neg_mean_squared_error',
    'mean_test_neg_mean_squared_error',
]
run1_mse = -RF_run1.results[mse_columns]
run1_mse.groupby('ccp_alpha').mean()

Unnamed: 0_level_0,mean_train_neg_mean_squared_error,mean_test_neg_mean_squared_error
ccp_alpha,Unnamed: 1_level_1,Unnamed: 2_level_1
0.01,415.459104,2457.605728
0.02,403.068408,2531.019440
0.03,405.120889,2595.617216
0.04,396.390804,2540.406584
0.05,385.953300,2533.883679
...,...,...
0.96,424.202244,2530.222280
0.97,422.735572,2700.666507
0.98,440.051091,2652.354658
0.99,425.853027,2604.285165


### From the above, we decided to search a coarser, wider grid of ccp_alpha values (0.1, 1, 10), together with other hyperparameters, to see if there is an effect on MSE

In [11]:
# Retrieve the saved grid-search object for the Random Forest and Bagging
# Regressor, run 2.
# NOTE(review): the '.p' extension suggests a pickle file -- confirm, and only
# load model files that come from a trusted source.
RF_run2 = StandardizedGridSearchCV.load('models/RFe50.p')
RF_run2

GridSearchCV(cv=10, estimator=RandomForestRegressor(n_estimators=50), n_jobs=-1,
             param_grid={'ccp_alpha': (0.1, 1, 10), 'criterion': ('mse', 'mae'),
                         'max_depth': (9, 10, 12),
                         'max_features': (None, 'auto'),
                         'max_leaf_nodes': array([ 199,  629, 1992])},
             refit='neg_mean_squared_error', return_train_score=True,
             scoring=['neg_mean_squared_error', 'r2'], verbose=10)

In [12]:
# First 10 rows of the run 2 results table (Random Forest and Bagging Regressor)
RF_run2.results.head(n=10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,split4_test_neg_mean_squared_error,split5_test_neg_mean_squared_error,...,split2_train_r2,split3_train_r2,split4_train_r2,split5_train_r2,split6_train_r2,split7_train_r2,split8_train_r2,split9_train_r2,mean_train_r2,std_train_r2
ccp_alpha,criterion,max_depth,max_features,max_leaf_nodes,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1.0,mse,10,auto,199,2.211581,0.171242,0.015584,0.001685,-683.491642,-1195.051484,-1285.306545,-1739.070524,-2362.208701,-1157.440121,...,0.948945,0.95924,0.957151,0.964259,0.95527,0.956784,0.961405,0.946941,0.957203,0.005313
1.0,mse,10,,629,2.150079,0.091926,0.015485,0.001876,-706.258133,-1208.535551,-1957.424978,-1500.246919,-2577.34054,-1292.303164,...,0.959216,0.957633,0.960571,0.953595,0.946787,0.956361,0.961713,0.959704,0.956851,0.004465
1.0,mse,9,auto,1992,1.94614,0.09162,0.017891,0.009554,-736.225889,-1258.048837,-1676.078044,-1730.420648,-2630.859167,-1187.081754,...,0.957354,0.96263,0.959547,0.961469,0.950129,0.957139,0.955888,0.951536,0.956335,0.003952
1.0,mse,9,auto,199,1.908923,0.082004,0.016939,0.002284,-723.570004,-1060.177203,-1930.354322,-1660.379301,-2509.13583,-1219.552813,...,0.957436,0.95823,0.964113,0.955169,0.946392,0.957835,0.961473,0.953314,0.956208,0.004758
1.0,mse,12,,199,2.291848,0.195587,0.015372,0.002282,-698.066201,-1210.917347,-1681.50772,-1931.949869,-2477.170536,-1212.206429,...,0.959113,0.96215,0.957668,0.961108,0.955416,0.9555,0.968553,0.956147,0.959559,0.003771
0.1,mae,10,auto,629,38.803116,0.750402,0.013823,0.001988,-1216.399332,-1422.906419,-1861.37564,-1631.72476,-2239.897759,-1221.762514,...,0.933389,0.940773,0.9358,0.944764,0.932035,0.93091,0.939954,0.943334,0.936917,0.004658
0.1,mse,9,auto,1992,2.037349,0.137142,0.015513,0.004097,-788.83381,-1239.292029,-1761.06517,-1798.97476,-2540.595108,-1171.446502,...,0.960188,0.957943,0.959833,0.963889,0.952492,0.959621,0.961377,0.952228,0.958185,0.003625
1.0,mse,10,auto,629,2.192411,0.183072,0.014795,0.001563,-692.011369,-1156.798389,-2006.554859,-2076.775409,-2244.279506,-1131.61038,...,0.960247,0.957954,0.959084,0.960403,0.948575,0.959368,0.963173,0.957985,0.958176,0.003737
0.1,mse,10,,629,2.269565,0.076147,0.019023,0.005073,-699.843315,-1093.099895,-1856.364686,-1814.059972,-2490.649525,-1185.844307,...,0.961597,0.960642,0.965414,0.957505,0.96091,0.964169,0.964339,0.96534,0.961915,0.002604
0.1,mse,10,,199,2.210904,0.107924,0.015755,0.00215,-870.393487,-1329.443813,-1864.775141,-1586.944005,-2606.601471,-1306.494765,...,0.963617,0.961862,0.965268,0.96409,0.950988,0.963521,0.966004,0.95248,0.960584,0.005558


In [13]:
# Mean cross-validated test score (negative MSE) for the first 10 models of
# run 2 (Random Forest and Bagging Regressor)
RF_run2.results['mean_test_neg_mean_squared_error'].head(n=10)

ccp_alpha  criterion  max_depth  max_features  max_leaf_nodes
1.0        mse        10         auto          199              -2428.587681
                                 NaN           629              -2454.831336
                      9          auto          1992             -2484.084729
                                               199              -2486.469285
                      12         NaN           199              -2488.630722
0.1        mae        10         auto          629              -2492.699447
           mse        9          auto          1992             -2495.357914
1.0        mse        10         auto          629              -2502.546570
0.1        mse        10         NaN           629              -2503.855931
                                               199              -2504.170200
Name: mean_test_neg_mean_squared_error, dtype: float64

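### Comparing the 2 GridSearchCV models as well as the default model, we see that RF_run1 is the best model as it gives the lowest test MSE score of 2367.26. Note that the default model's MSE (2497.92) was measured on a single 80/20 hold-out split, whereas the GridSearchCV scores are 10-fold cross-validation means, so the comparison is only indicative.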

In [14]:
# Display the best estimator selected by the run 1 grid search; its repr shows
# the hyperparameters that differ from the library defaults.

RF_run1.best_estimator_

RandomForestRegressor(ccp_alpha=0.4, max_depth=10, n_estimators=50)