# Random Forest Regressor
We focus on Hyperparameter Tuning on the model in this notebook. 
- EDA has been done in the previous notebooks, particularly in notebooks `1_SARIMA` and `2_custom_predictor`
- We use the `X` and `y` dataframe from previous notebooks, since they have added features we can use; we will simply re-run the previous code to recreate the dataframes

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from pathlib import Path

root_path = '../input/demand-forecasting-kernels-only/'
train_original = pd.read_csv(Path(root_path, 'train.csv'), low_memory=False, 
                    parse_dates=['date'], index_col=['date'])
test_original = pd.read_csv(Path(root_path, 'test.csv'), low_memory=False, 
                   parse_dates=['date'], index_col=['date'])
sample_sub_original = pd.read_csv(Path(root_path, 'sample_submission.csv'))

In [2]:
def SMAPE(forecast, actual):
    """
    Symmetric Mean Absolute Percentage Error: from https://www.kaggle.com/enolac5/time-series-arima-dnn-xgboost-comparison
    """
    masked_arr = ~((forecast==0)&(actual==0))
    diff = abs(forecast[masked_arr] - actual[masked_arr])
    avg = (abs(forecast[masked_arr]) + abs(actual[masked_arr]))/2
    
    print('SMAPE Error Score: ' + str(round(sum(diff/avg)/len(forecast) * 100, 2)) + ' %')

In [3]:
train = train_original.copy()
test = test_original.copy()

train['weekday'] = train.index.dayofweek
train['dayofyear'] = train.index.dayofyear
train['year'] = train.index.year
train['month'] = train.index.month
# train = train.reset_index()

test['weekday'] = test.index.dayofweek
test['dayofyear'] = test.index.dayofyear
test['year'] = test.index.year
test['month'] = test.index.month
# test = test.reset_index()

In [4]:
# One Hot Encode Months + drop first column to remove dummy variable trap
temp = pd.get_dummies(train['month'], prefix='is_month', drop_first=True)

for col in temp.columns:
    train[col] = temp[col]
    
# Determines if day is a weekend
# Assumption: Friday is NOT a weekend
train['is_weekend'] = train['weekday'] // 5 # 0 for Mon-Fri (0-4), 1 for Sat (5) and Sun (6)

from itertools import product

avg_sales = np.zeros(shape=(10, 50)) # (num_store, num_item)

for store, item in list(product(range(1, 11), range(1, 51))):
    avg_sales[store-1, item-1] = train.query(f'store == {store} & item == {item}')['sales'].mean()
    
avg_sales_col = []

for _, row in train.iterrows():
    store, item = row['store'], row['item']
    avg_sales_col.append(avg_sales[store-1, item-1])
    
train['avg_sales'] = avg_sales_col

In [5]:
# One Hot Encode Months + drop first column to remove dummy variable trap
temp = pd.get_dummies(test['month'], prefix='is_month', drop_first=True)

for col in temp.columns:
    test[col] = temp[col]
    
# add in columns manually (test data spans Jan-Mar only)
for i in range(4, 13):
    test[f'is_month_{i}'] = 0
    
# Determines if day is a weekend
# Assumption: Friday is NOT a weekend
test['is_weekend'] = test['weekday'] // 5 # 0 for Mon-Fri (0-4), 1 for Sat (5) and Sun (6)

# Recalculate avg_sales column for test dataframe
avg_sales_col = []

for _, row in test.iterrows():
    store, item = row['store'], row['item']
    avg_sales_col.append(avg_sales[store-1, item-1])
    
test['avg_sales'] = avg_sales_col

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
X = train.copy()
X.drop(['sales'], axis=1, inplace=True)
y = train['sales']

In [8]:
rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=7, random_state=42)

In [9]:
rf.fit(X, y)

RandomForestRegressor(min_samples_leaf=7, n_estimators=50, random_state=42)

In [10]:
X_test = test.copy()
X_test.drop(['id'], axis=1, inplace=True)

test_preds = rf.predict(X_test)

sample_sub = sample_sub_original.copy()
sample_sub['sales'] = test_preds
sample_sub['sales'] = np.round(sample_sub['sales']).astype(int)
sample_sub.head()

Unnamed: 0,id,sales
0,0,14
1,1,13
2,2,13
3,3,14
4,4,16


In [11]:
sample_sub.to_csv('./submission.csv', index=False) # Public Score of 14.61!

### Hyperparameter Tuning
Quite an extensive list of parameters from the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html) of the RandomForestRegressor from the scikit-learn package. 

Let us try a [RandomSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html) to narrow our search space. Not all parameter settings will be tried out.
- This section will be commented out after completion to reduce runtime on creating `submission.csv`

In [12]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 7,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 50,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [13]:
# from sklearn.model_selection import RandomizedSearchCV

# # Number of trees in random forest
# n_estimators = np.arange(50, 300, step=50).tolist()

# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']

# # Maximum number of levels in tree
# max_depth = np.arange(10, 60, step=10).tolist()
# max_depth.append(None)

# # Minimum number of samples required at each leaf node
# min_samples_leaf = [5, 7, 9]

# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# random_grid = {
#     'n_estimators': n_estimators,
#     'max_features': max_features,
#     'max_depth': max_depth,
#     'min_samples_leaf': min_samples_leaf,
#     'bootstrap': bootstrap
# }

# for i in random_grid.keys():
#     print(random_grid[i])

In [14]:
# rf_random = RandomizedSearchCV(estimator=RandomForestRegressor(), param_distributions=random_grid, cv=3, verbose=3, random_state=42)

# rf_random.fit(X, y)

In [15]:
# rf_random.best_params_

The `rf_random.best_params_` found above are as follows:

{'n_estimators': 150,
 'min_samples_leaf': 9,
 'max_features': 'auto',
 'max_depth': 20,
 'bootstrap': True}

Using the above parameters, we can narrow our search into a smaller grid and carry out [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html), this time iterating through all parameter settings in the grid.

In [16]:
from sklearn.model_selection import GridSearchCV

In [17]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [10, 20, 30, 40],
    'max_features': ['auto'],
    'min_samples_leaf': [9],
    'n_estimators': [100, 150, 200]
}

rf_gs = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid, cv=3, verbose=3)

rf_gs.fit(X, y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=100; total time= 2.0min
[CV 2/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=100; total time= 2.0min
[CV 3/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=100; total time= 2.0min
[CV 1/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=150; total time= 3.0min
[CV 2/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=150; total time= 3.0min
[CV 3/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=150; total time= 3.0min
[CV 1/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=200; total time= 4.0min
[CV 2/3] END bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=9, n_estimators=200; total 

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'bootstrap': [True], 'max_depth': [10, 20, 30, 40],
                         'max_features': ['auto'], 'min_samples_leaf': [9],
                         'n_estimators': [100, 150, 200]},
             verbose=3)

In [18]:
rf_gs.best_params_

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 9,
 'n_estimators': 200}

`rf_gs.best_params_` gave us: 

{'bootstrap': True,
 'max_depth': 20,
 'max_features': 'auto',
 'min_samples_leaf': 9,
 'n_estimators': 200}

In [19]:
X_test = test.copy()
X_test.drop(['id'], axis=1, inplace=True)

test_preds = rf_gs.predict(X_test)

sample_sub = sample_sub_original.copy()
sample_sub['sales'] = test_preds
sample_sub['sales'] = np.round(sample_sub['sales']).astype(int)
sample_sub.head()

Unnamed: 0,id,sales
0,0,13
1,1,13
2,2,13
3,3,14
4,4,16


In [20]:
sample_sub.to_csv('./submission.csv', index=False) # Public Score of 14.51!!