# Example of Using a Gradient Boosting Model
Demonstrate Gradient Boosting on the Boston housing dataset.

This example fits a Gradient Boosting model with least squares loss and 500 regression trees of depth 4.

https://scikit-learn.org/stable/modules/ensemble.html

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py

https://stats.stackexchange.com/questions/277399/gradient-boosting-regression-boston-housing-data-example


In [None]:
# NOTE(review): this line looks carried over from the stand-alone script
# version of the example, where `__doc__` holds the script's own module
# docstring. This notebook defines no docstring of its own, so confirm
# whether the line is still wanted here.
print(__doc__)
# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>
#
# License: BSD 3 clause

import pandas as pd
import datetime as dt

import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

In [None]:
# #############################################################################
# Load data
# NOTE: `datasets.load_boston` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so this cell needs scikit-learn < 1.2 to run as written.
boston = datasets.load_boston()

# Shuffle features and targets together with a fixed seed so that the
# train/test split below is reproducible.
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# Keep the first 90% of the shuffled rows for training and hold out the rest.
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# Wrap the raw (unshuffled) arrays in DataFrames for a quick visual check.
df_d = pd.DataFrame(boston.data)
df_t = pd.DataFrame(boston.target)
df_fs = pd.DataFrame(boston.feature_names)

# A notebook cell renders only its final bare expression, so intermediate
# `.head()` calls would be silently discarded. `display` (provided by the
# Jupyter/IPython kernel) renders all three previews from this one cell.
display(df_d.head(), df_t.head(), df_fs.head())

# df_d.describe()

In [None]:
# #############################################################################
# Fit regression model
# The loss is deliberately left at its default. Least squares is the default
# loss of GradientBoostingRegressor in every scikit-learn release, whereas the
# explicit alias 'ls' was renamed to 'squared_error' in 1.0 and removed in 1.2.
# Omitting the key keeps the model identical and the cell portable.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01}
clf = ensemble.GradientBoostingRegressor(**params)

# Train on the 90% split and report the error on the held-out 10%.
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

In [None]:
# #############################################################################
# Plot training deviance

# Compute the test-set deviance after every boosting stage.
# The model is fitted with least-squares loss, whose deviance is exactly the
# mean squared error, so `mean_squared_error` is used instead of the
# estimator's private `loss_` callable (deprecated in scikit-learn 1.1 and
# removed in 1.3).
n_stages = params['n_estimators']
test_score = np.zeros((n_stages,), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = mean_squared_error(y_test, y_pred)

# 1-based iteration numbers for the x axis.
iterations = np.arange(n_stages) + 1

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
# `train_score_` holds the training loss recorded at each stage during fit.
plt.plot(iterations, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(iterations, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')

In [None]:
# #############################################################################
# Plot feature importance
# Rescale the importances so the most important feature scores 100.
raw_importance = clf.feature_importances_
feature_importance = 100.0 * (raw_importance / raw_importance.max())

# Ascending order, so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

# Draw into the right-hand panel of a 1x2 grid.
ax = plt.subplot(1, 2, 2)
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(boston.feature_names[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance')
plt.show()

Another, simpler and more straightforward example of using gradient boosting for regression:
https://scikit-learn.org/stable/modules/ensemble.html

In [3]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

# Generate a synthetic regression problem with a fixed seed.
X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)

# The first 200 samples train the model; the remaining 1000 evaluate it.
X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

# Boosted decision stumps (max_depth=1). The loss is left at its default,
# which is least squares in every scikit-learn release; the explicit alias
# 'ls' was renamed to 'squared_error' in 1.0 and removed in 1.2.
est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                max_depth=1, random_state=0).fit(X_train, y_train)

# Test-set mean squared error, shown as the cell's output.
mean_squared_error(y_test, est.predict(X_test))

5.009154859960321

In [2]:
# Show the held-out targets (`y[200:]` from the Friedman example's split).
y_test

array([15.49230505,  6.57114734, 10.37579443, 20.59690679,  9.46865402,
       12.2249463 , 13.46770655,  6.77099675, 13.99472406, 13.00220127,
       14.80966762,  6.96343743, 19.88643914, 12.67035772, 23.38408406,
       17.55949076, 19.98624112, 10.86230425,  7.5876299 , 26.58358565,
       10.41935049, 14.88679072, 26.8679316 , 15.3904829 , 13.48134564,
       10.19557285, 15.0949473 , 11.55955201, 17.0435683 , 12.10118855,
       15.21257905, 16.9270714 , 17.59847137,  9.82770839, 11.35562157,
        8.04806718, 13.95536071, 13.48218917,  9.18304908, 15.82944998,
       20.57680662, 13.95615034, 17.65148542, 14.11734029, 10.43568716,
       16.08225541, 24.94581331, 10.61163278,  8.84999584, 10.15520117,
       17.97192248, 25.12853324, 15.09652962, 13.65832049, 22.4078155 ,
       22.26310279, 16.86961838, 22.01849168, 20.88683737, 13.61498623,
       12.24839106, 15.56222229, 16.55543915, 12.07656541, 17.28603182,
       18.503831  , 17.92014109, 22.37944116, 17.91580877, 13.54