# Model: Decision Tree Regressor
This notebook documents our process of approaching the given regression task using a DecisionTreeRegressor. The following approaches are documented:
* paramter optimization using RandomizedSearchCV
* paramter optimization using Hyperopt
* each of the mentioned approaches is tried with K-Fold CrossValidation and TimeSeriesSplit CrossValidation

In [2]:
from sklearn.metrics import mean_squared_error
import data_preprocessing
from sklearn.tree import DecisionTreeRegressor
from hyperopt import hp, space_eval, tpe, fmin, Trials
from math import sqrt
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from scipy.stats import uniform, randint

In [3]:
# import preprocessed data
train, test = data_preprocessing.preprocess_data("Energy Consumption Dataset.csv")

In [4]:
# separate features from target variable
def split_x_y(df):
    y = df["Energy Consumption"]
    X = df.drop("Energy Consumption", axis=1)
    return X, y

X_train, y_train = split_x_y(train)
X_test, y_test = split_x_y(test)

## Using RandomizedSearchCV
First, we need to create a search grid / parameter distribution for RandomizedSearchCV to select from

In [5]:
param_dist = {
    'min_samples_leaf': uniform(),
    'min_samples_split': uniform(),
    'max_depth': randint(2, 18),
    'max_features': randint(1, 18),
    'ccp_alpha': uniform(),
    'max_leaf_nodes': randint(10, 100),
    'min_impurity_decrease': uniform(),
    'min_weight_fraction_leaf': uniform(0.01, 0.5)
}

### Using K-Fold CrossVal

In [7]:
# Create the DT Regressor
reg = DecisionTreeRegressor(random_state=42)
# Create the RandomizedSearchCV instance
random_search = RandomizedSearchCV(reg,
                                   param_distributions=param_dist,
                                   n_iter=1000,
                                   cv=5,
                                   random_state=42,
                                   scoring="neg_mean_squared_error")

# perform the hyperparameter optimization
random_search.fit(X_train, y_train)

100 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/

In [8]:
# inspect the model's performance
best_score = random_search.best_score_
rmse = sqrt(-best_score)
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11052.766802091313
NRMSE:  0.1631982813408634


In [9]:
# inspect the tuned hyperparameters
best_params = random_search.best_params_
best_params

{'ccp_alpha': 0.9954215129254561,
 'max_depth': 6,
 'max_features': 7,
 'max_leaf_nodes': 90,
 'min_impurity_decrease': 0.8826154575172458,
 'min_samples_leaf': 0.09697876251609538,
 'min_samples_split': 0.055706836115245184,
 'min_weight_fraction_leaf': 0.07461190005137468}

In [11]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance
rf = DecisionTreeRegressor(**best_params, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11533.000197356256
NRMSE:  0.17028910901804706


### Using TimeSeriesSplit

In [12]:
# Create TimeSeriesSplit instance
tscv = TimeSeriesSplit(n_splits=5)

In [13]:
# Create the DT Regressor
reg = DecisionTreeRegressor(random_state=42)
# Create the RandomizedSearchCV instance
random_search = RandomizedSearchCV(reg,
                                   param_distributions=param_dist,
                                   n_iter=1000,
                                   cv=tscv, # here, we pass TSCV instead of regular CV
                                   random_state=42,
                                   scoring="neg_mean_squared_error")

# perform the hyperparameter optimization
random_search.fit(X_train, y_train)

100 fits failed out of a total of 5000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/

In [14]:
# inspect the model's performance
best_score = random_search.best_score_
rmse = sqrt(-best_score)
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  10958.794202509443
NRMSE:  0.16181074037311288


In [15]:
# inspect the tuned hyperparameters
best_params = random_search.best_params_
best_params

{'ccp_alpha': 0.15041689110352818,
 'max_depth': 6,
 'max_features': 17,
 'max_leaf_nodes': 86,
 'min_impurity_decrease': 0.06351182959000135,
 'min_samples_leaf': 0.03682186763599138,
 'min_samples_split': 0.1338521188006332,
 'min_weight_fraction_leaf': 0.016835982413498644}

In [16]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance
rf = DecisionTreeRegressor(**best_params, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11093.479600694762
NRMSE:  0.1637994212074353


## Using Hyperopt

In [19]:
from hyperopt import hp, space_eval, tpe, fmin, Trials
from sklearn.model_selection import cross_val_score

The following functions are passed to Hyperopt's `fmin` func - one using K-Fold CV and one using TimeSeriesSplit

In [20]:
# optimization function for K-Fold Cross Validation
def decision_tree_regressor_kfold(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf = args

    reg = DecisionTreeRegressor(random_state=42,

                                min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split,
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),
                               )

    # we return the best score resulting from cross validation - KF-fold here
    return sqrt(-cross_val_score(reg, X_train, y_train, cv=5, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [21]:
# optimization function for TimeSeriesSplit Cross Validation
def decision_tree_regressor_tscv(args):
    min_samples_leaf, min_samples_split, max_depth, max_features, ccp_alpha, max_leaf_nodes, min_impurity_decrease, min_weight_fraction_leaf = args

    tscv = TimeSeriesSplit(n_splits=5)

    reg = DecisionTreeRegressor(random_state=42,

                                min_samples_leaf=min_samples_leaf,
                                min_samples_split=min_samples_split,
                                ccp_alpha=ccp_alpha,
                                min_impurity_decrease=min_impurity_decrease,
                                min_weight_fraction_leaf=min_weight_fraction_leaf,

                                max_depth=int(max_depth),
                                max_features=int(max_features),
                                max_leaf_nodes=int(max_leaf_nodes),
                               )

    # we return the best score resulting from cross validation - TimeSeriesSplit here
    return sqrt(-cross_val_score(reg, X_train, y_train, cv=tscv, scoring="neg_mean_squared_error", n_jobs=-1).mean())

In [22]:
# define the search space
space = [
    hp.uniform('min_samples_leaf', 0.01, 1.0),
    hp.uniform('min_samples_split', 0.01, 1.0),
    hp.quniform('max_depth', 2, 18, 1),
    hp.quniform('max_features', 1, 18, 2),
    hp.quniform('ccp_alpha', 0.01, 1.0, 0.05),
    hp.quniform('max_leaf_nodes', 10, 100, 5),
    hp.quniform('min_impurity_decrease', 0.01, 1.0, 0.05),
    hp.quniform('min_weight_fraction_leaf', 0.01, 0.5, 0.05)
]

### Using regular K-Fold CV

In [24]:
trials = Trials()

# to perform a hyperparameter optimization, we pass the optimization function to Hyperopt
# Here, we're using K-Fold CV
best = fmin(fn=decision_tree_regressor_kfold, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials)

100%|██████████| 1000/1000 [03:52<00:00,  4.29trial/s, best loss: 10561.653255260968]


In [25]:
# inspect the model's performance
rmse = trials.best_trial['result']['loss']
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  10561.653255260968
NRMSE:  0.1559468041115815


In [26]:
# inspect the tuned hyperparameters
best

{'ccp_alpha': 0.45,
 'max_depth': 9.0,
 'max_features': 14.0,
 'max_leaf_nodes': 95.0,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 0.01040378956496385,
 'min_samples_split': 0.01015424055167271,
 'min_weight_fraction_leaf': 0.0}

In [27]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance

# get optimized hyperparameters from result
ccp_alpha, max_depth, max_features, max_leaf_nodes, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf = best.values()

# train the Decision Tree
reg = DecisionTreeRegressor(random_state=42,

                            min_samples_leaf=min_samples_leaf,
                            min_samples_split=min_samples_split,
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes))

# predict on whole dataset
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11093.479600694762
NRMSE:  0.1637994212074353


### using TimeSeriesSplit

In [28]:
trials = Trials()

# to perform a hyperparameter optimization, we pass the optimization function to Hyperopt
# Here, we're using K-Fold CV
best = fmin(fn=decision_tree_regressor_tscv, space=space, algo=tpe.suggest, verbose=True, max_evals=1000, trials=trials)

100%|██████████| 1000/1000 [03:35<00:00,  4.64trial/s, best loss: 10588.439863554451]


In [29]:
# inspect the model's performance
rmse = trials.best_trial['result']['loss']
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  10588.439863554451
NRMSE:  0.15634231851215857


In [30]:
# inspect the tuned hyperparameters
best

{'ccp_alpha': 0.6000000000000001,
 'max_depth': 10.0,
 'max_features': 14.0,
 'max_leaf_nodes': 80.0,
 'min_impurity_decrease': 0.75,
 'min_samples_leaf': 0.010062040532295273,
 'min_samples_split': 0.010087827648561807,
 'min_weight_fraction_leaf': 0.0}

In [31]:
# Train the model on the whole dataset and predict unseen data to assess it's real performance

# get optimized hyperparameters from result
ccp_alpha, max_depth, max_features, max_leaf_nodes, min_impurity_decrease, min_samples_leaf, min_samples_split, min_weight_fraction_leaf = best.values()

# train the Decision Tree
reg = DecisionTreeRegressor(random_state=42,

                            min_samples_leaf=min_samples_leaf,
                            min_samples_split=min_samples_split,
                            ccp_alpha=ccp_alpha,
                            min_impurity_decrease=min_impurity_decrease,
                            min_weight_fraction_leaf=min_weight_fraction_leaf,

                            max_depth=int(max_depth),
                            max_features=int(max_features),
                            max_leaf_nodes=int(max_leaf_nodes))

# predict on whole dataset
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rmse = sqrt(mean_squared_error(y_test, y_pred))
print("RMSE: ", rmse)
print("NRMSE: ", rmse / (y_test.max() - y_test.min()))

RMSE:  11093.479600694762
NRMSE:  0.1637994212074353
