In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
import data_preprocessing

energy_consumption_data_train, energy_consumption_data_test = data_preprocessing.preprocess_data("Energy Consumption Dataset.csv")

In [4]:
X_train = pd.DataFrame(energy_consumption_data_train.iloc[:, :18])
Y_train = pd.DataFrame(energy_consumption_data_train.iloc[:, -1])

X_test = pd.DataFrame(energy_consumption_data_test.iloc[:, :18])
Y_test = pd.DataFrame(energy_consumption_data_test.iloc[:, -1])
_range = (Y_test.max() - Y_test.min()).item()

In [5]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

In [6]:
# try a default decision tree without hyperparameter tuning
regressor = DecisionTreeRegressor()
regressor.fit(X_train, Y_train)
Y_pred = regressor.predict(X_test)

In [7]:
Y_test.values[:5].swapaxes(0, 1), Y_pred[:5]

(array([[36169.  , 34264.75, 31731.5 , 32394.75, 32294.25]]),
 array([39047.75, 55805.5 , 43994.  , 52247.  , 40987.75]))

In [8]:
# estimate example using (normalized) RMSE
from sklearn.metrics import mean_squared_error

rsme = np.sqrt(mean_squared_error(Y_test, Y_pred))
rsme_norm = rsme / _range

print(f"{rsme=}")
print(f"{rsme_norm=}")

rsme=14855.722627344107
rsme_norm=0.21935036215551054


In [12]:
# inspect decision tree
import math

print(f"{regressor.tree_.node_count=}")
print(f"{regressor.get_depth()=}")
print(f"{regressor.get_n_leaves()=}")

print(f"{(math.fsum(regressor.tree_.impurity) / len(regressor.tree_.impurity))=}")
print(f"{regressor.tree_.max_n_classes=}")

regressor.tree_.node_count=80581
regressor.get_depth()=40
regressor.get_n_leaves()=40291
(math.fsum(regressor.tree_.impurity) / len(regressor.tree_.impurity))=7840993.941107945
regressor.tree_.max_n_classes=1


In [13]:
# utilize randomized search with a parameter grid to finde the best hyperparameters for
# the decision tree regressor.

regressor = DecisionTreeRegressor()

param_dist = {"max_depth":          [i for i in range(0, 9)],
              "max_features":       [i for i in range(0, 9)],
              "min_samples_leaf":   [i for i in range(0, 9)],
              "criterion":          ['squared_error', 'poisson', 'friedman_mse', 'absolute_error'],
              "ccp_alpha":          [i for i in range(0, 9)]}

# Create the GridSearchCV object
rand_search_dt = RandomizedSearchCV(estimator=regressor,
                                    param_distributions=param_dist,
                                    cv=KFold(n_splits=5, shuffle=True, random_state=42))

# Fit the GridSearchCV object to the training data
rand_search_dt.fit(X_train, Y_train)

# Get the best hyperparameters
best_regressor = rand_search_dt.best_estimator_
best_hyperparams = rand_search_dt.best_params_

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/Users/I539028/.pyenv/versions/3.11.6/lib/python3.11/site-packages/sklearn/uti

In [14]:
f"{best_hyperparams=}"

"best_hyperparams={'min_samples_leaf': 6, 'max_features': 4, 'max_depth': 8, 'criterion': 'squared_error', 'ccp_alpha': 3}"

In [15]:
regressor = best_regressor

In [16]:
# inspect decision tree
import math

print(f"{regressor.tree_.node_count=}")
print(f"{regressor.get_depth()=}")
print(f"{regressor.get_n_leaves()=}")

print(f"{(math.fsum(regressor.tree_.impurity) / len(regressor.tree_.impurity))=}")
print(f"{regressor.tree_.max_n_classes=}")

regressor.tree_.node_count=429
regressor.get_depth()=8
regressor.get_n_leaves()=215
(math.fsum(regressor.tree_.impurity) / len(regressor.tree_.impurity))=101609480.7186063
regressor.tree_.max_n_classes=1


In [17]:
# predict on testset using best regressor
regressor.fit(X_train, Y_train)

# predict values
Y_pred = regressor.predict(X_test)

Y_test.values[:5].swapaxes(0, 1), Y_pred[:5]

(array([[36169.  , 34264.75, 31731.5 , 32394.75, 32294.25]]),
 array([44095.10598377, 45539.18421053, 45539.18421053, 45539.18421053,
        44095.10598377]))

In [18]:
# evaluate using (normalized) RMSE
rsme = np.sqrt(mean_squared_error(Y_test, Y_pred))
print(f"{rsme=}")
print(f"{rsme_norm=}")

rsme=11634.101467363196
rsme_norm=0.21935036215551054
