# Boosted Trees

In [4]:
import pandas as pd
import pylab
import seaborn as sns
import numpy as np
import datetime
import copy
import shap
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [2]:
# Project-local helper module; later cells also rely on utils.get_cross_val_scores.
import utils
# Load the dataset that the rest of the notebook works from.
df = utils.get_data()
# Last expression of the cell, so the size of the loaded data is shown as output.
len(df)

348

In [3]:
# Feature matrix: the three predictor columns, kept in this fixed order.
feature_columns = ["temp_2", "temp_1", "average"]
X = np.array(df[feature_columns])

# Target vector handed to the regressors alongside X.
y = np.array(df["actual"])

# Sanity check: show the dimensions of both arrays.
for array in (X, y):
    print(array.shape)

(348, 3)
(348,)


### Standard Gradient Boosted Trees
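Uses scikit-learn's `GradientBoostingRegressor`, the classic gradient boosting algorithm (the same family of method that XGBoost implements). As with Random Forest, the default hyperparameters give a mean absolute error just below 4 degrees Fahrenheit (although the errors coming out of the cross-validation show less variance on repeated runs).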

In [30]:
# Baseline: GradientBoostingRegressor with every hyperparameter left at its default.
regr = GradientBoostingRegressor()

# Score through the project helper. The trailing 5 is presumably the number of
# cross-validation folds, and the scores look like negated errors -- confirm in utils.
scores = utils.get_cross_val_scores(regr, X, y, 5)

# Report the individual scores together with their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-3.79999022, -3.59903   , -3.49938456, -4.29404266, -4.3860435 ]), -3.915698191896193)


Tweaking parameters (e.g. `min_samples_leaf`, `min_samples_split`) to pull the trees back from overfitting yields a slight performance improvement, as before.

In [116]:
# Hyperparameters for the tuned model, spelled out in full so every knob is visible.
# Per the accompanying text, min_samples_split / min_samples_leaf are the values
# being tweaked to rein in overfitting.
gbr_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    min_samples_split=5,
    min_samples_leaf=10,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    min_impurity_decrease=0.0,
    max_features=None,
    # NOTE(review): per the scikit-learn docs alpha is only used by the huber and
    # quantile losses; no loss is passed here, so it is likely inert -- confirm.
    alpha=0.9,
    max_leaf_nodes=None,
    # NOTE(review): validation_fraction and tol only matter for early stopping,
    # which n_iter_no_change=None leaves switched off.
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=1.0e-4,
    ccp_alpha=0.0,
)
regr = GradientBoostingRegressor(**gbr_params)

# Score through the project helper. The trailing 5 is presumably the number of
# cross-validation folds -- confirm in utils.
scores = utils.get_cross_val_scores(regr, X, y, 5)

# Report the individual scores together with their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-3.98207312, -4.35697679, -3.50835749, -3.77251924, -3.33010732]), -3.7900067899071628)


### LightGBM Gradient Boosted Trees
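Uses scikit-learn's `HistGradientBoostingRegressor`, a histogram-based implementation inspired by LightGBM. As with Random Forest, the default hyperparameters give a mean absolute error just below 4 degrees Fahrenheit (although the errors coming out of the cross-validation show less variance on repeated runs).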

In [120]:
# Baseline: HistGradientBoostingRegressor with every hyperparameter left at its default.
regr = HistGradientBoostingRegressor()

# Score through the project helper. The trailing 5 is presumably the number of
# cross-validation folds, and the scores look like negated errors -- confirm in utils.
scores = utils.get_cross_val_scores(regr, X, y, 5)

# Report the individual scores together with their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-4.17764297, -3.77253216, -3.62568237, -4.34194688, -3.7586422 ]), -3.9352893155416284)


Tweaking parameters to pull the trees back from overfitting yields some slight performance improvements as before.

In [140]:
# Hyperparameters for the tuned histogram-based model, spelled out in full so
# every knob is visible.
hgbr_params = dict(
    learning_rate=0.1,
    max_iter=100,
    # With max_leaf_nodes=None the tree size is bounded by max_depth alone.
    max_leaf_nodes=None,
    max_depth=3,
    min_samples_leaf=10,
    l2_regularization=0,
    max_bins=255,
    categorical_features=None,
    monotonic_cst=None,
    # NOTE(review): per the scikit-learn docs, "auto" enables early stopping only
    # above 10000 samples; this dataset is far smaller, so validation_fraction,
    # n_iter_no_change and tol are likely inert here -- confirm.
    early_stopping="auto",
    validation_fraction=0.1,
    n_iter_no_change=10,
    tol=1.0e-4,
)
regr = HistGradientBoostingRegressor(**hgbr_params)

# Score through the project helper. The trailing 5 is presumably the number of
# cross-validation folds -- confirm in utils.
scores = utils.get_cross_val_scores(regr, X, y, 5)

# Report the individual scores together with their mean.
mean_score = np.mean(scores)
print((scores, mean_score))

(array([-3.86177589, -3.67483445, -3.70281161, -3.61859548, -4.03036255]), -3.777675997142036)
