In [81]:
from sklearn.metrics import mean_squared_error
import data_preprocessing
from sklearn.ensemble import RandomForestRegressor
from hyperopt import hp, space_eval, tpe, fmin, Trials
from math import sqrt
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
import numpy as np

In [82]:
# import preprocessed data
train, test = data_preprocessing.preprocess_data("Energy Consumption Dataset.csv")

In [83]:
# separate features from target variable
def split_x_y(df):
    y = df["Energy Consumption"]
    X = df.drop("Energy Consumption", axis=1)
    return X, y

X_train, y_train = split_x_y(train)
X_test, y_test = split_x_y(test)

In [84]:
def random_forest_regressor_kfold(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf, n_estimators, bootstrap, oob_score, max_samples = args

    tscv = TimeSeriesSplit(n_splits=5)
    reg = RandomForestRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),

                                n_estimators=int(n_estimators),
                                bootstrap=bootstrap,
                                oob_score=oob_score if bootstrap else False,
                                max_samples=max_samples if bootstrap else None
                            )

    return sqrt(-cross_val_score(reg, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [85]:
def random_forest_regressor_tscv(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf, n_estimators, bootstrap, oob_score, max_samples = args
    tscv = TimeSeriesSplit(n_splits=5)

    reg = RandomForestRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),

                                n_estimators=int(n_estimators),
                                bootstrap=bootstrap,
                                oob_score=oob_score if bootstrap else False,
                                max_samples=max_samples if bootstrap else None
                            )

    return sqrt(-cross_val_score(reg, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [86]:
space = [
    hp.uniform('min_samples_leaf', 0.01, 1.0),
    hp.uniform('min_samples_split', 0.01, 1.0),
    hp.quniform('max_depth', 2, 18, 1),
    hp.quniform('max_features', 1, 18, 2),
    hp.quniform('ccp_alpha', 0.01, 1.0, 0.05),
    hp.quniform('max_leaf_nodes', 10, 100, 5),
    hp.quniform('min_impurity_decrease', 0.01, 1.0, 0.05),
    hp.quniform('min_weight_fraction_leaf', 0.01, 0.5, 0.05),


    hp.quniform("n_estimators", 100, 500, 10),
    hp.choice("bootstrap", [True, False]),
    hp.choice("oob_score", [True, False]), 
    hp.quniform("max_samples", 0.01, 1.0, 0.05)
]

### using regular kfold

In [87]:
trials = Trials()

best = fmin(fn=random_forest_regressor_kfold, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials, catch_eval_exceptions=True)
RMSE = trials.best_trial['result']['loss']
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

 46%|████▌     | 460/1000 [26:54<31:23,  3.49s/trial, best loss: 10326.76345584245]   

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_co

 49%|████▉     | 492/1000 [28:53<33:25,  3.95s/trial, best loss: 10326.76345584245]  

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_co

 52%|█████▏    | 517/1000 [30:27<40:17,  5.01s/trial, best loss: 10326.76345584245]

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_co

100%|█████████▉| 997/1000 [1:13:26<00:13,  4.42s/trial, best loss: 10296.729455355495]
RMSE:  10296.729455355495
NRMSE:  0.15203510402733802


In [88]:
best

{'bootstrap': 1,
 'ccp_alpha': 0.0,
 'max_depth': 10.0,
 'max_features': 10.0,
 'max_leaf_nodes': 55.0,
 'max_samples': 0.8,
 'min_impurity_decrease': 0.4,
 'min_samples_leaf': 0.01091407213625034,
 'min_samples_split': 0.011078280889464581,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 220.0,
 'oob_score': 0}

In [89]:
bootstrap, ccp_alpha, max_depth, max_features, max_leaf_nodes, max_samples, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf, n_estimators, oob_score = best.values()
reg = RandomForestRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes),

                            n_estimators=int(n_estimators),
                            bootstrap=bootstrap,
                            oob_score=oob_score if bootstrap else False,
                            max_samples=max_samples if bootstrap else None
                        )
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))



RMSE:  10296.729455355495
NRMSE:  0.15203510402733802


### using TimeSeriesSplit

In [90]:
trials = Trials()

best = fmin(fn=random_forest_regressor_tscv, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials, catch_eval_exceptions=True)
RMSE = trials.best_trial['result']['loss']
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

  5%|▍         | 47/1000 [09:33<15:26:21, 58.32s/trial, best loss: 11187.71506965727]

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_co

  5%|▌         | 52/1000 [09:41<2:56:57, 11.20s/trial, best loss: 11187.71506965727] 

job exception: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_co

 15%|█▍        | 146/1000 [23:32<2:17:41,  9.67s/trial, best loss: 10392.796083532252]


KeyboardInterrupt: 

In [None]:
best

In [None]:
bootstrap, ccp_alpha, max_depth, max_features, max_leaf_nodes, max_samples, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf, n_estimators, oob_score = best.values()
reg = RandomForestRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes),

                            n_estimators=int(n_estimators),
                            bootstrap=bootstrap,
                            oob_score=oob_score if bootstrap else False,
                            max_samples=max_samples if bootstrap else None
                        )
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))