In [13]:
from sklearn.metrics import mean_squared_error
import data_preprocessing
from sklearn.tree import DecisionTreeRegressor
from hyperopt import hp, space_eval, tpe, fmin, Trials
from math import sqrt
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
import numpy as np

In [7]:
# import preprocessed data
train, test = data_preprocessing.preprocess_data("Energy Consumption Dataset.csv")

In [8]:
# separate features from target variable
def split_x_y(df):
    y = df["Energy Consumption"]
    X = df.drop("Energy Consumption", axis=1)
    return X, y

X_train, y_train = split_x_y(train)
X_test, y_test = split_x_y(test)

In [59]:
def decision_tree_regressor_kfold(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf = args

    reg = DecisionTreeRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),
                               )

    return sqrt(-cross_val_score(reg, X_train, y_train, cv=5, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [60]:
def decision_tree_regressor_tscv(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf = args

    tscv = TimeSeriesSplit(n_splits=5)

    reg = DecisionTreeRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),
                               )

    return sqrt(-cross_val_score(reg, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [9]:
# define the search space
space = [
    hp.uniform('min_samples_leaf', 0.01, 1.0),
    hp.uniform('min_samples_split', 0.01, 1.0),
    hp.quniform('max_depth', 2, 18, 1),
    hp.quniform('max_features', 1, 18, 2),
    hp.quniform('ccp_alpha', 0.01, 1.0, 0.05),
    hp.quniform('max_leaf_nodes', 10, 100, 5),
    hp.quniform('min_impurity_decrease', 0.01, 1.0, 0.05),
    hp.quniform('min_weight_fraction_leaf', 0.01, 0.5, 0.05)
]

### Using regular kfold cv

In [62]:
trials = Trials()

best = fmin(fn=decision_tree_regressor_kfold, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials)
RMSE = trials.best_trial['result']['loss']
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

100%|██████████| 1000/1000 [04:42<00:00,  3.54trial/s, best loss: 10551.677248021653]
RMSE:  10551.677248021653
NRMSE:  0.1557995045923523


In [63]:
best

{'ccp_alpha': 0.5,
 'max_depth': 8.0,
 'max_features': 10.0,
 'max_leaf_nodes': 85.0,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 0.010234684635058989,
 'min_samples_split': 0.010337604867686253,
 'min_weight_fraction_leaf': 0.0}

In [64]:
ccp_alpha, max_depth, max_features, max_leaf_nodes, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf = best.values()
reg = DecisionTreeRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes))
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

RMSE:  10551.677248021653
NRMSE:  0.1557995045923523


### using time series split

In [65]:
trials = Trials()

best = fmin(fn=decision_tree_regressor_tscv, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials)
RMSE = trials.best_trial['result']['loss']
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

100%|██████████| 1000/1000 [04:29<00:00,  3.72trial/s, best loss: 10618.986342855369]
RMSE:  10618.986342855369
NRMSE:  0.15679334882992305


In [66]:
ccp_alpha, max_depth, max_features, max_leaf_nodes, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf = best.values()
reg = DecisionTreeRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes))
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

RMSE:  10618.986342855369
NRMSE:  0.15679334882992305


### using BlockingTimeSeries Split

In [18]:
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits
    
    def get_n_splits(self, X, y, groups):
        return self.n_splits
    
    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.5 * (stop - start)) + start
            yield indices[start: mid], indices[mid + margin: stop]


In [11]:
def decision_tree_regressor_btscv(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf = args

    btscv = BlockingTimeSeriesSplit(n_splits=5)

    reg = DecisionTreeRegressor(random_state=42,

                                min_samples_leaf=float(min_samples_leaf),
                                min_samples_split=float(min_samples_split),
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),
                               )

    return sqrt(-cross_val_score(reg, X_train, y_train, cv=btscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [17]:
trials = Trials()

best = fmin(fn=decision_tree_regressor_btscv, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials)
RMSE = trials.best_trial['result']['loss']
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

100%|██████████| 1000/1000 [04:06<00:00,  4.06trial/s, best loss: 10848.987774545836]
RMSE:  10848.987774545836
NRMSE:  0.16018940694188105


In [19]:
ccp_alpha, max_depth, max_features, max_leaf_nodes, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf = best.values()
reg = DecisionTreeRegressor(random_state=42,

                            min_samples_leaf=float(min_samples_leaf),
                            min_samples_split=float(min_samples_split),
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes))
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
score = mean_squared_error(y_test, y_pred)
print("RMSE: ", RMSE)
print("NRMSE: ", RMSE / (y_test.max() - y_test.min()))

RMSE:  10848.987774545836
NRMSE:  0.16018940694188105
