# Random Forest model

## Import data and set up

In [14]:
# import relevant packages
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [2]:
# set up paths
data_dir = "../data"

In [27]:
# import data 
X_train = pd.read_csv(os.path.join(data_dir, 'X_train.csv'))
X_test = pd.read_csv(os.path.join(data_dir, 'X_test.csv'))
y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv'))

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(397900, 24)
(99476, 24)
(397900, 1)
(99476, 1)


In [4]:
def metric(preds, actuals):
    preds = preds.reshape(-1)
    actuals = actuals.reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [24]:
X_train.columns

Index(['Date', 'Store', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Month', 'Year',
       'Weekday', 'Holiday', 'StoreType_enc', 'Assortment_enc', 'Store_enc'],
      dtype='object')

In [28]:
train_cols = ['Open', 'Promo', 'SchoolHoliday', 'Month', 'Year',
       'Weekday', 'Holiday', 'StoreType_enc', 'Assortment_enc', 'Store_enc']

X_train = X_train[train_cols]
X_test = X_test[train_cols]
print(X_train.columns)
print(X_test.columns)

Index(['Open', 'Promo', 'SchoolHoliday', 'Month', 'Year', 'Weekday', 'Holiday',
       'StoreType_enc', 'Assortment_enc', 'Store_enc'],
      dtype='object')
Index(['Open', 'Promo', 'SchoolHoliday', 'Month', 'Year', 'Weekday', 'Holiday',
       'StoreType_enc', 'Assortment_enc', 'Store_enc'],
      dtype='object')


In [29]:
y_train = y_train.to_numpy().flatten()
y_test = y_test.to_numpy().flatten()

## Random Forest

In [15]:
model = RandomForestRegressor(criterion="mse", random_state=42)

In [38]:
# define parameter space for grid search
parameter_space = {
    'n_estimators': [100, 150],
    'max_features': ["auto", 0.8],
    'max_depth': [5, 8]
}

# n_estimators = number of trees in the foreset
# max_features = max number of features considered for splitting a node
# max_depth = max number of levels in each decision tree
# min_samples_split = min number of data points placed in a node before the node is split
# min_samples_leaf = min number of data points allowed in a leaf node
# bootstrap = method for sampling data points (with or without replacement)

In [39]:
my_scorer = make_scorer(metric, greater_is_better=False)

In [40]:
%%time
regr = GridSearchCV(model, parameter_space, scoring=my_scorer, 
                   n_jobs=None, cv=5, return_train_score = True)
regr.fit(X_train, y_train)

CPU times: user 14min 22s, sys: 1min 36s, total: 15min 58s
Wall time: 14min 13s


GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [5, 8], 'max_features': ['auto', 0.8],
                         'n_estimators': [100, 150]},
             return_train_score=True,
             scoring=make_scorer(metric, greater_is_better=False))

In [41]:
#Best estimator
print('Best estimator:\n', regr.best_estimator_)

# Best parameter setting
print('Best parameters found:\n', regr.best_params_)

# Scorer used on the held out data to choose best parameters for the model
print('Scorer used on the held out data to choose the best parameters for the model:', regr.scorer_)

# Mean cross-validated score of the best estimator
print('Best mean cross-validated score:', regr.best_score_)

Best estimator:
 RandomForestRegressor(max_depth=8, max_features=0.8, random_state=42)
Best parameters found:
 {'max_depth': 8, 'max_features': 0.8, 'n_estimators': 100}
Scorer used on the held out data to choose the best parameters for the model: make_scorer(metric, greater_is_better=False)
Best mean cross-validated score: -19.064212178229724


In [42]:
# All results
cv_results = pd.DataFrame.from_dict(regr.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,13.370069,0.151149,0.258208,0.007395,5,auto,100,"{'max_depth': 5, 'max_features': 'auto', 'n_es...",-21.415396,-21.759289,...,-21.545982,0.140817,7,-21.494293,-21.479182,-21.48807,-21.572581,-21.570554,-21.520936,0.041623
1,19.959316,0.311081,0.384933,0.003949,5,auto,150,"{'max_depth': 5, 'max_features': 'auto', 'n_es...",-21.422286,-21.767489,...,-21.551618,0.140744,8,-21.501236,-21.485892,-21.495927,-21.574965,-21.57317,-21.526238,0.039366
2,10.813128,0.059437,0.25326,0.002263,5,0.8,100,"{'max_depth': 5, 'max_features': 0.8, 'n_estim...",-20.755215,-21.126859,...,-20.903387,0.134139,5,-20.886677,-20.842708,-20.861567,-20.935301,-20.873964,-20.880043,0.031208
3,16.284521,0.074947,0.379905,0.003559,5,0.8,150,"{'max_depth': 5, 'max_features': 0.8, 'n_estim...",-20.755028,-21.1315,...,-20.907432,0.137396,6,-20.884581,-20.845261,-20.867934,-20.936503,-20.88409,-20.883674,0.030051
4,19.572977,0.182223,0.392423,0.003293,8,auto,100,"{'max_depth': 8, 'max_features': 'auto', 'n_es...",-19.157266,-19.379854,...,-19.242967,0.124519,4,-19.184469,-19.120182,-19.172973,-19.240086,-19.169185,-19.177379,0.038316
5,29.469988,0.186869,0.585345,0.004954,8,auto,150,"{'max_depth': 8, 'max_features': 'auto', 'n_es...",-19.152774,-19.383862,...,-19.242585,0.124261,3,-19.180643,-19.123344,-19.168198,-19.242979,-19.16678,-19.176389,0.038537
6,16.823943,1.605795,0.395529,0.002626,8,0.8,100,"{'max_depth': 8, 'max_features': 0.8, 'n_estim...",-18.95661,-19.213687,...,-19.064212,0.127687,1,-19.019551,-18.936606,-18.999889,-19.05041,-18.972073,-18.995706,0.039064
7,24.163557,0.169408,0.589849,0.004114,8,0.8,150,"{'max_depth': 8, 'max_features': 0.8, 'n_estim...",-18.960513,-19.228252,...,-19.068232,0.130292,2,-19.022454,-18.948996,-19.001489,-19.054326,-18.975435,-19.00054,0.0365


In [43]:
y_pred = regr.predict(X_test)
value = metric(y_pred, y_test)
print(value)

24.40313367243775
