# Model: Random Forest Regressor
This notebook documents our process of approaching the given regression task using a RandomForestRegressor. The following approaches are documented:
* paramter optimization using RandomizedSearchCV
* paramter optimization using Hyperopt
* each of the mentioned approaches is tried with K-Fold CrossValidation and TimeSeriesSplit CrossValidation

In [2]:
from sklearn.metrics import mean_squared_error
import data_preprocessing
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, space_eval, tpe, fmin, Trials
from math import sqrt
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import uniform, randint

In [3]:
# import preprocessed data
train, test = data_preprocessing.preprocess_data("Energy Consumption Dataset.csv")

In [4]:
# separate features from target variable
def split_x_y(df):
    y = df["Energy Consumption"]
    X = df.drop("Energy Consumption", axis=1)
    return X, y

X_train, y_train = split_x_y(train)
X_test, y_test = split_x_y(test)

## Using RandomizedSearchCV
First, we need to create a search grid / parameter distribution for RandomizedSearchCV to select from

In [5]:
param_dist = {
    'min_samples_leaf': uniform(),
    'min_samples_split': uniform(),
    'max_depth': randint(2, 18),
    'max_features': randint(1, 18),
    'ccp_alpha': uniform(),
    'max_leaf_nodes': randint(10, 100),
    'min_impurity_decrease': uniform(),
    'min_weight_fraction_leaf': uniform(0.01, 0.5),

    'n_estimators': randint(100, 500),
    'bootstrap': [True, False],
    'oob_score': [True, False], 
    'max_samples': uniform()
}

### Using K-Fold CrossVal

In [6]:
# Create the RF Regressor
reg = RandomForestRegressor(random_state=42)
# Create the RandomizedSearchCV instance
random_search = RandomizedSearchCV(reg,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=5,
                                   random_state=42,
                                   scoring="neg_mean_squared_error")

# perform the hyperparameter optimization
random_search.fit(X_train, y_train)

230 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 402, in fit
    raise ValueError(
ValueError: `max_sample` canno

In [7]:
# inspect the model's performance
best_score = random_search.best_score_
rmse = sqrt(-best_score)
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11334.54540420434
NRMSE:  0.16735884895319877


In [38]:
# inspect the tuned hyperparameters
best_params = random_search.best_params_
best_params

{'bootstrap': True,
 'ccp_alpha': 0.04771612769164879,
 'max_depth': 11,
 'max_features': 16,
 'max_leaf_nodes': 60,
 'max_samples': 0.7578461104643691,
 'min_impurity_decrease': 0.014393488629755868,
 'min_samples_leaf': 0.11607264050691624,
 'min_samples_split': 0.04600264202175275,
 'min_weight_fraction_leaf': 0.03036440115948507,
 'n_estimators': 337,
 'oob_score': False}

In [39]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance
rf = RandomForestRegressor(**best_params, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  12126.06815997252
NRMSE:  0.17904598174958686


### Using TimeSeriesSplit

In [10]:
# Create TimeSeriesSplit instance
tscv = TimeSeriesSplit(n_splits=5)

In [11]:
# Create the RF Regressor
reg = RandomForestRegressor(random_state=42)
# Create the RandomizedSearchCV instance
random_search = RandomizedSearchCV(reg,
                                   param_distributions=param_dist,
                                   n_iter=100,
                                   cv=tscv, # here, we pass TSCV instead of regular CV
                                   random_state=42,
                                   scoring="neg_mean_squared_error")

# perform the hyperparameter optimization
random_search.fit(X_train, y_train)

230 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/ensemble/_forest.py", line 402, in fit
    raise ValueError(
ValueError: `max_sample` canno

In [12]:
# inspect the model's performance
best_score = random_search.best_score_
rmse = sqrt(-best_score)
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11429.96106248341
NRMSE:  0.16876769722829355


In [13]:
# inspect the tuned hyperparameters
best_params = random_search.best_params_
best_params

{'bootstrap': True,
 'ccp_alpha': 0.04771612769164879,
 'max_depth': 11,
 'max_features': 16,
 'max_leaf_nodes': 60,
 'max_samples': 0.7578461104643691,
 'min_impurity_decrease': 0.014393488629755868,
 'min_samples_leaf': 0.11607264050691624,
 'min_samples_split': 0.04600264202175275,
 'min_weight_fraction_leaf': 0.03036440115948507,
 'n_estimators': 337,
 'oob_score': False}

In [14]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance
rf = RandomForestRegressor(**best_params, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  12126.06815997252
NRMSE:  0.17904598174958686


## Using Hyperopt

In [15]:
from hyperopt import hp, space_eval, tpe, fmin, Trials
from sklearn.model_selection import cross_val_score

The following functions are passed to Hyperopt's `fmin` func - one using K-Fold CV and one using TimeSeriesSplit

In [16]:
# optimization function for K-Fold Cross Validation
def random_forest_regressor_kfold(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf, n_estimators, bootstrap, oob_score, max_samples = args

    reg = RandomForestRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),

                                n_estimators=int(n_estimators),
                                bootstrap=bootstrap,
                                oob_score=oob_score if bootstrap else False,
                                max_samples=max_samples if bootstrap else None
                               )

    # we return the best score resulting from cross validation - KF-fold here
    return sqrt(-cross_val_score(reg, X_train, y_train, cv=5, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [17]:
# optimization function for TimeSeriesSplit Cross Validation
def random_forest_regressor_tscv(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf, n_estimators, bootstrap, oob_score, max_samples = args

    tscv = TimeSeriesSplit(n_splits=5)

    reg = RandomForestRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),

                                n_estimators=int(n_estimators),
                                bootstrap=bootstrap,
                                oob_score=oob_score if bootstrap else False,
                                max_samples=max_samples if bootstrap else None
                               )

    # we return the best score resulting from cross validation - TimeSeriesSplit here
    return sqrt(-cross_val_score(reg, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [18]:
# define the search space
space = [
    hp.uniform('min_samples_leaf', 0.01, 1.0),
    hp.uniform('min_samples_split', 0.01, 1.0),
    hp.quniform('max_depth', 2, 18, 1),
    hp.quniform('max_features', 1, 18, 2),
    hp.quniform('ccp_alpha', 0.01, 1.0, 0.05),
    hp.quniform('max_leaf_nodes', 10, 100, 5),
    hp.quniform('min_impurity_decrease', 0.01, 1.0, 0.05),
    hp.quniform('min_weight_fraction_leaf', 0.01, 0.5, 0.05),


    hp.quniform("n_estimators", 100, 500, 10),
    hp.choice("bootstrap", [True, False]),
    hp.choice("oob_score", [True, False]), 
    hp.quniform("max_samples", 0.01, 1.0, 0.05)
]

### Using regular K-Fold CV

In [20]:
trials = Trials()

# to perform a hyperparameter optimization, we pass the optimization function to Hyperopt
# Here, we're using K-Fold CV
best = fmin(fn=random_forest_regressor_kfold, space=space, algo=tpe.suggest, verbose=True, max_evals=100, trials=trials)

100%|██████████| 100/100 [02:41<00:00,  1.62s/trial, best loss: 10792.451800430958]


In [21]:
# inspect the model's performance
rmse = trials.best_trial['result']['loss']
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  10792.451800430958
NRMSE:  0.15935463190548618


In [23]:
# inspect the tuned hyperparameters
best

{'bootstrap': 0,
 'ccp_alpha': 0.45,
 'max_depth': 4.0,
 'max_features': 6.0,
 'max_leaf_nodes': 20.0,
 'max_samples': 0.8,
 'min_impurity_decrease': 0.25,
 'min_samples_leaf': 0.04672972635371414,
 'min_samples_split': 0.026563574755124053,
 'min_weight_fraction_leaf': 0.1,
 'n_estimators': 100.0,
 'oob_score': 1}

In [24]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance

# get optimized hyperparameters from result
bootstrap, ccp_alpha, max_depth, max_features, max_leaf_nodes, max_samples, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf, n_estimators, oob_score = best.values()

# train the Random Forest
reg = RandomForestRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes),

                            n_estimators=int(n_estimators),
                            bootstrap=bootstrap,
                            oob_score=oob_score if bootstrap else False,
                            max_samples=max_samples if bootstrap else None
                        )

# predict on whole dataset
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  12126.06815997252
NRMSE:  0.17904598174958686


### using TimeSeriesSplit

In [26]:
trials = Trials()

# to perform a hyperparameter optimization, we pass the optimization function to Hyperopt
# Here, we're using K-Fold CV
best = fmin(fn=random_forest_regressor_tscv, space=space, algo=tpe.suggest, verbose=True, max_evals=100, trials=trials)

100%|██████████| 100/100 [03:59<00:00,  2.39s/trial, best loss: 10689.787200612356]


In [27]:
# inspect the model's performance
rmse = trials.best_trial['result']['loss']
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  10689.787200612356
NRMSE:  0.1578387502674358


In [28]:
# inspect the tuned hyperparameters
best

{'bootstrap': 1,
 'ccp_alpha': 0.30000000000000004,
 'max_depth': 13.0,
 'max_features': 4.0,
 'max_leaf_nodes': 95.0,
 'max_samples': 0.9500000000000001,
 'min_impurity_decrease': 0.25,
 'min_samples_leaf': 0.027646921143988423,
 'min_samples_split': 0.06002148092561177,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 250.0,
 'oob_score': 1}

In [33]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance

# get optimized hyperparameters from result
bootstrap, ccp_alpha, max_depth, max_features, max_leaf_nodes, max_samples, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf, n_estimators, oob_score = best.values()

# train the Random Forest
reg = RandomForestRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes),

                            n_estimators=int(n_estimators),
                            bootstrap=bootstrap,
                            oob_score=oob_score if bootstrap else False,
                            max_samples=max_samples if bootstrap else None
                        )

# predict on whole dataset
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  12126.06815997252
NRMSE:  0.17904598174958686
