# SMAC: Decision Tree Regressor

# Imports

In [34]:
from ConfigSpace import Categorical, Configuration, ConfigurationSpace, Float, Integer
from ConfigSpace.conditions import InCondition
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from smac import HyperparameterOptimizationFacade, Scenario
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import LogNorm
from scipy.stats import randint
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import pandas as pd
import numpy as np
import time

# Read data

In [18]:
# Load part 1 of the smog dataset; the first CSV column becomes the row index.
dataset = pd.read_csv(
    '../../../dataset/smog_part1.csv',
    index_col=0,
)

In [19]:
# Keep only the columns used below, and only the rows whose humidity reading
# is at least 60, in a single .loc selection.
humid_rows = dataset['humiditySht'] >= 60
kept_columns = ['pm25_x', 'pm25_y', 'temperatureSht', 'humiditySht', 'pressure']
dataset = dataset.loc[humid_rows, kept_columns]

dataset.head()

Unnamed: 0,pm25_x,pm25_y,temperatureSht,humiditySht,pressure
0,19.0,13.8,6.1,72,986
1,16.7,14.4,5.9,74,986
2,21.8,15.1,5.7,75,986
3,21.8,18.0,5.7,76,986
4,25.0,19.0,5.5,77,986


# Train and test split

Split the data into train and test sets and normalize it.

In [20]:
# The target (Y) is the 'pm25_x' column; the single input feature (X) is 'pm25_y'.
# Selecting with a one-element column list yields the (n_samples, 1) layout directly.
Y = dataset[['pm25_x']].to_numpy()
X = dataset[['pm25_y']].to_numpy()

In [21]:
# Hold out 20% of the samples for testing; the fixed seed keeps the shuffled
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [22]:
# Fit the feature scaler on the training split only, so that no test-set
# statistics are used when standardising the inputs.
scaler = StandardScaler().fit(x_train)

# Targets are divided by the largest training target. ndarray.max() is one
# vectorised reduction that returns a scalar; the builtin max() walked the
# (n, 1) array row by row in Python and produced a one-element array instead.
y_scaler = y_train.max()

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# NOTE: this cell rebinds its own inputs. Re-running it without re-running the
# split cell recomputes y_scaler from already-scaled targets (i.e. ~1.0), which
# breaks the later conversion of predictions back to original units.
y_train = y_train / y_scaler
y_test = y_test / y_scaler

# SKLearn Decision Tree Regressor
Baseline model trained with the default hyperparameter values.

In [23]:
# Baseline: a decision tree with scikit-learn's default hyperparameters.
dtm = DecisionTreeRegressor().fit(x_train, y_train)

# score() returns R^2; it is reported below multiplied by 100.
train_dataset_score = dtm.score(x_train, y_train)
test_dataset_score = dtm.score(x_test, y_test)

for label, score in (
    ('R2 on train dataset: ', train_dataset_score),
    ('R2 on test dataset: ', test_dataset_score),
):
    print(label, round(score * 100, 2))

R2 on train dataset:  81.95
R2 on test dataset:  77.14


In [24]:
class TREE:
    """Search space and objective for tuning a DecisionTreeRegressor with SMAC.

    `configspace` describes the hyperparameters to explore and `train` is the
    target function that scores a single configuration.
    """

    @property
    def configspace(self) -> ConfigurationSpace:
        """Build the hyperparameter search space.

        Returns:
            A ConfigurationSpace (seeded with 0) holding `criterion` plus five
            further hyperparameters, each tied to `criterion` by an InCondition.
        """
        criteria = ("squared_error", "friedman_mse", "absolute_error", "poisson")

        space = ConfigurationSpace(seed=0)

        # Parent hyperparameter that every condition below refers to.
        criterion = Categorical("criterion", list(criteria), default="absolute_error")

        # NOTE(review): default=None is handed to ConfigSpace, not to sklearn;
        # presumably ConfigSpace then derives a default from the bounds rather
        # than meaning "unlimited" -- confirm against the ConfigSpace docs.
        dependents = [
            Categorical("splitter", ["best", "random"], default="best"),
            Integer("max_depth", (1, 8192), default=None),
            Integer("min_samples_split", (2, 512), default=2),
            Integer("min_samples_leaf", (1, 512), default=1),
            Integer("max_leaf_nodes", (2, 131072), default=None),
        ]

        # Each condition lists every possible criterion value, so the child
        # hyperparameters are active for all criteria.
        conditions = [
            InCondition(child=hyperparameter, parent=criterion, values=list(criteria))
            for hyperparameter in dependents
        ]

        space.add_hyperparameters([criterion, *dependents])
        space.add_conditions(conditions)

        return space

    def train(self, config: Configuration, seed: int = 0) -> float:
        """Score one configuration with 5-fold cross-validation.

        A DecisionTreeRegressor is built from `config` and evaluated with
        `cross_val_score` on the notebook-level `x_train` / `y_train` arrays.

        Args:
            config: Hyperparameter values, unpacked as keyword arguments of
                DecisionTreeRegressor.
            seed: Passed to the regressor as `random_state`.

        Returns:
            One minus the mean cross-validation score, so lower is better.
        """
        regressor = DecisionTreeRegressor(random_state=seed, **dict(config))
        fold_scores = cross_val_score(regressor, x_train, y_train.ravel(), cv=5)

        return 1 - np.mean(fold_scores)

In [25]:
%%time

# Run the SMAC hyperparameter search for the decision tree and report the cost
# (1 - mean cross-validation score, as returned by TREE.train) of both the
# default configuration and the best configuration found.
if __name__ == "__main__":
    classifier = TREE()

    # General information about the run: the search space and a budget of 500 trials.
    # NOTE(review): n_workers=-1 looks like the joblib "use all cores" convention;
    # SMAC may only parallelise when n_workers > 1 -- confirm this is not running serially.
    scenario = Scenario(
        classifier.configspace,
        n_trials=500,
        n_workers=-1
    )

    # We want to run the facade's default initial design, but we want to change the number
    # of initial configs to 5.
    initial_design = HyperparameterOptimizationFacade.get_initial_design(scenario, n_configs=5)

    # Now we use SMAC to find the best hyperparameters.
    # NOTE(review): overwrite=True presumably discards results of an earlier run
    # with the same scenario instead of resuming it -- confirm.
    smac = HyperparameterOptimizationFacade(
        scenario,
        classifier.train,
        initial_design = initial_design,
        overwrite = True,
    )

    # The returned incumbent is reused by the cells that follow.
    incumbent = smac.optimize()

    # Get cost of default configuration
    default_cost = smac.validate(classifier.configspace.get_default_configuration())
    print(f"Default cost: {default_cost}")

    # Let's calculate the cost of the incumbent
    incumbent_cost = smac.validate(incumbent)
    print(f"Incumbent cost: {incumbent_cost}")

[INFO][abstract_initial_design.py:82] Using `n_configs` and ignoring `n_configs_per_hyperparameter`.
[INFO][abstract_initial_design.py:147] Using 5 initial design configurations and 0 additional configurations.
[INFO][abstract_intensifier.py:515] Added config 7205e8 as new incumbent because there are no incumbents yet.
[INFO][abstract_intensifier.py:590] Added config 2a6786 and rejected config 7205e8 as incumbent because it is not better than the incumbents on 3 instances:
[INFO][configspace.py:175] --- criterion: 'friedman_mse' -> 'absolute_error'
[INFO][configspace.py:175] --- max_depth: 5084 -> 1051
[INFO][configspace.py:175] --- max_leaf_nodes: 14093 -> 33436
[INFO][configspace.py:175] --- min_samples_leaf: 109 -> 36
[INFO][configspace.py:175] --- min_samples_split: 442 -> 92
[INFO][smbo.py:319] Finished 50 trials.
[INFO][smbo.py:319] Finished 100 trials.
[INFO][abstract_intensifier.py:590] Added config a34ecc and rejected config 2a6786 as incumbent because it is not better than th

In [26]:
# Show the configuration returned by smac.optimize().
print(incumbent)

Configuration(values={
  'criterion': 'absolute_error',
  'max_depth': 2399,
  'max_leaf_nodes': 14154,
  'min_samples_leaf': 24,
  'min_samples_split': 51,
  'splitter': 'best',
})


In [27]:
# Build the final model from the incumbent itself rather than from hand-copied
# values, so this cell cannot drift out of sync when the search is re-run.
# The incumbent is unpacked the same way TREE.train unpacks a configuration.
best_params = dict(incumbent)

# random_state is pinned so that refitting the final model is repeatable.
dtr = DecisionTreeRegressor(**best_params, random_state=0)

# Targets are flattened to 1-D, matching how they are passed during the search.
dtr.fit(x_train, y_train.ravel())

In [35]:
# Undo the target scaling on both the true and the predicted values so the
# error metrics are expressed in the original units of the target column.
y_true = y_test * y_scaler
y_pred = dtr.predict(x_test) * y_scaler

# R2 is reported multiplied by 100; MSE and MAE are reported as-is.
r2 = r2_score(y_true, y_pred) * 100
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

for label, value in (('R2: ', r2), ('MSE: ', mse), ('MAE: ', mae)):
    print(label, round(value, 2))

R2:  78.58
MSE:  11.22
MAE:  1.55
