# Boosted Trees

In [1]:
import pandas as pd
import pylab
import seaborn as sns
import numpy as np
import datetime
import copy
import shap
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
import utils
df = utils.get_data()
len(df)

348

In [3]:
X = np.array(df[["temp_2", "temp_1", "average"]])
y = np.array(df["actual"])
print(X.shape)
print(y.shape)

(348, 3)
(348,)


### Standard Gradient Boosted trees
Based off XGBoost. As for Random Forest, the out-of-the-box values give a mean abs error just below 4 Farenheit (although the errors coming out of the cross-validation show less variance on repeated runs)

In [4]:
regr = GradientBoostingRegressor()
scores = utils.get_cross_val_scores(regr, X, y, 5)
print((scores, np.mean(scores)))

(array([-4.65454863, -4.11422532, -3.49592022, -3.82909805, -4.01827258]), -4.022412958898516)


Tweaking parameters (e.g. min_samples_leaf, min_samples_split) to pull the trees back from overfitting yields some slight performance improvements as before.

In [5]:
regr = GradientBoostingRegressor(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    min_samples_split=5,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    min_impurity_decrease=0.0,
    max_features=None,
    alpha=0.9,
    max_leaf_nodes=None,
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=1.0e-4,
    ccp_alpha=0.0
)
scores = utils.get_cross_val_scores(regr, X, y, 5)
print((scores, np.mean(scores)))

(array([-4.6863613 , -3.44053954, -3.49802911, -3.9556169 , -3.52534958]), -3.821179286887902)


### LightGBM Gradient Boosted Trees
Based on LightGBM. As for Random Forest, the out-of-the-box values give a mean abs error just below 4 Farenheit (although the errors coming out of the cross-validation show less variance on repeated runs).

In [6]:
regr = HistGradientBoostingRegressor()
scores = utils.get_cross_val_scores(regr, X, y, 5)
print((scores, np.mean(scores)))

(array([-3.51845481, -4.22037392, -3.76393088, -3.73469817, -3.98598955]), -3.8446894674118197)


Tweaking parameters to pull the trees back from overfitting yields some slight performance improvements as before.

In [16]:
regr = HistGradientBoostingRegressor(
    learning_rate=0.1,
    max_iter=100,
    max_leaf_nodes=None,
    max_depth=3,
    min_samples_leaf=10,
    l2_regularization=0,
    max_bins=255,
    categorical_features=None,
    monotonic_cst=None,
    early_stopping="auto",
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1.0e-4
)
scores = utils.get_cross_val_scores(regr, X, y, 5)
print((scores, np.mean(scores)))

(array([-3.9521812 , -3.41632257, -4.2550038 , -3.95979037, -3.81842395]), -3.8803443790349705)


### Tweaking hyperparams with grid search

We can try and see if there is any mileage in tweaking the model parameters.

In [30]:
X_train, X_test, y_train, y_test = utils.get_train_test_split(X, y)

In [45]:
regr = HistGradientBoostingRegressor(max_iter=25)
grid_values = {
    "learning_rate": [0.07, 0.1, 0.13, 0.17],
    "max_leaf_nodes": [10, 12, 15],
    "max_depth": [4,6,8,10],
    "min_samples_leaf": [10, 12, 15, 20],
    "l2_regularization": [0,1],
}
grid_regr_acc = GridSearchCV(regr, param_grid = grid_values,scoring = 'neg_mean_squared_error')
grid_regr_acc.fit(X_train, y_train)  # this will use a default CV scheme on the training data of 5-fold

GridSearchCV(estimator=HistGradientBoostingRegressor(max_iter=25),
             param_grid={'l2_regularization': [0, 1],
                         'learning_rate': [0.07, 0.1, 0.13, 0.17],
                         'max_depth': [4, 6, 8, 10],
                         'max_leaf_nodes': [10, 12, 15],
                         'min_samples_leaf': [10, 12, 15, 20]},
             scoring='neg_mean_squared_error')

In [46]:
grid_regr_acc.best_estimator_

HistGradientBoostingRegressor(l2_regularization=1, learning_rate=0.13,
                              max_depth=8, max_iter=25, max_leaf_nodes=10,
                              min_samples_leaf=12)

In [47]:
scores = utils.get_cross_val_scores(grid_regr_acc.best_estimator_, X_train, y_train, 5)
print((scores, np.mean(scores)))
# this will test against most of the data used to train over the grid search; 
# so effectively the performance on the training data

(array([-3.49763578, -4.46351324, -3.89038819, -3.88515278, -3.62760239]), -3.872858476134982)


In [48]:
scores = utils.get_cross_val_scores(grid_regr_acc.best_estimator_, X_test, y_test, 5)
print((scores, np.mean(scores)))

(array([-4.19739768, -4.10158286, -4.32926636, -4.65390794, -3.36886148]), -4.130203264458821)


Even with a search, the performance does not really improve.