In [1]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import r2_score, mean_squared_error

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

### AdaBoost

AdaBoost: each learner pays more attention to the training instances that the previous weak learner underfitted

In [3]:
# Load the concrete data set from the CSV file.
concrete_data = pd.read_csv('dataset/concrete_data.csv')

# 'csMPa' is the regression target; every other column is used as a feature.
X = concrete_data.drop('csMPa', axis=1)
Y = concrete_data['csMPa']

# Hold out 20% of the rows for testing. The seed is fixed so that the same rows
# land in the test set on every Restart & Run All; without it each run scores
# the models on a different partition.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [4]:
# AdaBoost ensemble of 100 depth-4 regression trees.
# random_state is pinned because the regressor is stochastic: without a seed the
# fitted ensemble, its estimator weights and the test score change on every run.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=100,
                           learning_rate=1.0,  # weight applied to each learner; lower values shrink its contribution
                           random_state=42)

ada_reg.fit(x_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',
                                                       max_depth=4,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort=False,
                                                       random_state=None,
                                                       splitter='best'),
                  learning_rate=1.0, loss='linear', n_estimators=100,
                  random_

In [5]:
# Inspect the weight attached to each fitted learner in the boosted ensemble.
ada_reg.estimator_weights_

array([1.70036247, 1.36254712, 1.42035226, 1.19954412, 1.26750077,
       1.19688718, 0.92264566, 1.10433941, 1.01040021, 1.12585885,
       1.37222715, 0.67406631, 1.04942413, 1.55446746, 1.10637091,
       1.02583973, 1.19732748, 0.91087722, 0.99717036, 0.82595785,
       1.39374631, 1.01661025, 0.81198543, 0.97427925, 0.7127864 ,
       0.75710123, 0.6233892 , 0.76471728, 0.76729913, 0.71631205,
       0.78545807, 1.13766242, 0.97835529, 0.88213221, 1.31667113,
       1.04231765, 0.60093768, 0.66914071, 0.91573149, 0.74007343,
       0.8106807 , 0.66889768, 0.86014538, 0.77128369, 0.92109397,
       1.32609714, 0.60069815, 0.68126018, 0.5812236 , 0.45779257,
       1.32307873, 1.07215821, 1.03447888, 0.4777055 , 0.85699819,
       0.97581956, 1.31794882, 0.78259009, 0.35531549, 0.64938267,
       0.85471191, 0.71992136, 0.67748182, 0.84652377, 0.81536883,
       1.15839826, 1.14803217, 0.81255634, 0.93805571, 0.65536008,
       0.94491915, 0.96326968, 0.65815734, 0.4525151 , 0.63443

In [6]:
# Score the fitted AdaBoost ensemble on the held-out test rows (R^2).
y_pred = ada_reg.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8263992998594876

In [7]:
# Second AdaBoost configuration: shallower trees (depth 2), twice as many of
# them (200), and a halved learning rate.
# random_state is pinned because the regressor is stochastic: without a seed the
# fitted ensemble, its estimator weights and the test score change on every run.
ada_reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2),
                           n_estimators=200,
                           learning_rate=0.5,  # weight applied to each learner; lower values shrink its contribution
                           random_state=42)

ada_reg.fit(x_train, y_train)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse',
                                                       max_depth=2,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                                                       presort=False,
                                                       random_state=None,
                                                       splitter='best'),
                  learning_rate=0.5, loss='linear', n_estimators=200,
                  random_

In [8]:
# Inspect the weight attached to each fitted learner in the re-trained ensemble.
ada_reg.estimator_weights_

array([0.64878187, 0.57709925, 0.69374779, 0.42889581, 0.53548137,
       0.35437436, 0.50646875, 0.41864503, 0.45502503, 0.41715165,
       0.43190165, 0.49922636, 0.30560577, 0.25704685, 0.31398949,
       0.46462399, 0.31317408, 0.56139081, 0.37064983, 0.21320009,
       0.29440961, 0.3590134 , 0.35806752, 0.45168917, 0.17345379,
       0.3810493 , 0.35326684, 0.48748187, 0.35013357, 0.24628105,
       0.18417025, 0.50007814, 0.15748358, 0.46014733, 0.36041138,
       0.28787901, 0.56996214, 0.30062073, 0.29009669, 0.38278062,
       0.30001574, 0.46209871, 0.39874138, 0.47596928, 0.33405925,
       0.27117352, 0.23500152, 0.42470806, 0.36313898, 0.17495144,
       0.41770502, 0.5477699 , 0.40939985, 0.28673516, 0.18668511,
       0.3444175 , 0.23415821, 0.45228335, 0.29500306, 0.30468814,
       0.36698985, 0.46551182, 0.2882642 , 0.41578311, 0.28891542,
       0.51266078, 0.36798033, 0.30323682, 0.24251906, 0.27722982,
       0.23562604, 0.04515496, 0.2965756 , 0.15199597, 0.31283

In [9]:
# Score the re-trained AdaBoost ensemble on the held-out test rows (R^2).
y_pred = ada_reg.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.6752949868638068

### Gradient Boosting

Gradient boosting sequentially adds predictors to an ensemble where each weak learner learns from the residuals of previous weak learners

model1: y=A1+B1x+e1
model2: e1=A2+B2x+e2
model3: e2=A3+B3x+e3
CombinedModel: y=A1+A2+A3+(B1+B2+B3)x+e3

In [10]:
# Stage 1: fit a depth-3 tree directly to the training target.
# fit() returns the estimator itself, so construction and fitting are chained.
tree_reg1 = DecisionTreeRegressor(max_depth=3).fit(x_train, y_train)
tree_reg1

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [11]:
# Stage 2: the new target is whatever the first tree failed to explain
# on the training rows (target minus stage-1 prediction).
stage1_fit = tree_reg1.predict(x_train)
y2 = y_train - stage1_fit

tree_reg2 = DecisionTreeRegressor(max_depth=3).fit(x_train, y2)
tree_reg2

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [12]:
# Stage 3: fit another depth-3 tree to the residuals left after stage 2.
stage2_fit = tree_reg2.predict(x_train)
y3 = y2 - stage2_fit

tree_reg3 = DecisionTreeRegressor(max_depth=3).fit(x_train, y3)
tree_reg3

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [13]:
# The hand-built ensemble predicts by adding up the three stage predictions.
y_pred = (
    tree_reg1.predict(x_test)
    + tree_reg2.predict(x_test)
    + tree_reg3.predict(x_test)
)

In [14]:
# R^2 of the three-stage hand-built ensemble on the test rows.
r2_score(y_true=y_test, y_pred=y_pred)

0.7474566980305538

In [15]:
# Stage 4: one more depth-3 tree on the residuals left after stage 3.
y4 = y3 - tree_reg3.predict(x_train)
tree_reg4 = DecisionTreeRegressor(max_depth=3).fit(x_train, y4)

# Ensemble prediction is the sum of all four stage predictions.
stages = (tree_reg1, tree_reg2, tree_reg3, tree_reg4)
y_pred = sum(stage.predict(x_test) for stage in stages)

r2_score(y_true=y_test, y_pred=y_pred)

0.7958107727816512

In [16]:
# Library equivalent of the hand-built ensemble: three depth-3 trees,
# each added at full weight (learning_rate=1.0).
gbr = GradientBoostingRegressor(
    n_estimators=3,
    learning_rate=1.0,
    max_depth=3,
)
gbr.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [17]:
# R^2 of the three-tree gradient boosting model on the test rows.
y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.7474566980305538

In [18]:
# Same three-tree model, but each tree now contributes at learning_rate=0.1.
gbr = GradientBoostingRegressor(
    n_estimators=3,
    learning_rate=0.1,
    max_depth=3,
).fit(x_train, y_train)

y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.2999961312914349

In [19]:
# Keep learning_rate=0.1 but grow the ensemble to 50 trees.
gbr = GradientBoostingRegressor(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
).fit(x_train, y_train)

y_pred = gbr.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8927949990269481

### hyperparameter tuning of gradient boosting trees

1) grid search
2) warm start and early stopping

#### using grid search

In [20]:
# Build the reduced feature matrix from the raw frame rather than dropping
# columns from X itself: `X = X.drop(...)` only works once, and re-running the
# cell would raise a KeyError because the columns are already gone.
# The result has the same columns, in the same order, as the in-place version.
X = concrete_data.drop(['csMPa', 'flyash', 'coarseaggregate', 'fineaggregate'], axis=1)

# Re-split on the reduced features. The seed is fixed so the partition is the
# same on every run.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [21]:
# Baseline to compare the tuned models against: 50 depth-3 trees,
# all other settings left at their defaults.
baseline = GradientBoostingRegressor(n_estimators=50, max_depth=3)
baseline.fit(X=x_train, y=y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=50,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [22]:
# R^2 of the baseline model on the test rows.
y_pred = baseline.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.9072646307921569

In [23]:
# Label each importance score with its column name and list the most
# important features first.
importance_features = (
    pd.Series(data=baseline.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)
importance_features

age                 0.366452
cement              0.337295
water               0.100123
slag                0.099232
superplasticizer    0.096898
dtype: float64

In [24]:
# Candidate ensemble sizes to try; only n_estimators is tuned.
parameters = {
    'n_estimators': [1, 5, 10, 50, 100, 200, 300, 400, 500],
}

# Depth-3 gradient boosting model whose n_estimators the search will vary.
gbr = GradientBoostingRegressor(max_depth=3)

# Exhaustive search over the grid, scored with 3-fold cross-validation.
gridSearch_reg = GridSearchCV(gbr, parameters, cv=3)

In [25]:
# Run the cross-validated search over the parameter grid on the training split.
gridSearch_reg.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=GradientBoostingRegressor(alpha=0.9,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_no_change=None,
                

In [26]:
# Parameter setting that scored best during the cross-validated search.
gridSearch_reg.best_params_

{'n_estimators': 500}

In [27]:
# Draw a fresh random 80/20 partition before the final fit.
# NOTE(review): rows in this new test set may have been in the training split
# that the grid search was fitted on - confirm that is acceptable for the
# score reported below.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Refit a depth-3 model using the ensemble size the grid search selected.
best_size = gridSearch_reg.best_params_['n_estimators']
gbr_best = GradientBoostingRegressor(n_estimators=best_size, max_depth=3)
gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=500,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [28]:
# R^2 of the grid-search-tuned model on the test rows.
y_pred = gbr_best.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.9266305949351323

#### using warm start and early stopping

In [29]:
# Depth-3 model created with warm_start enabled; the early-stopping loop
# below refits it repeatedly with an increasing n_estimators.
gbr = GradientBoostingRegressor(warm_start=True, max_depth=3)

In [31]:
# Manual early stopping: grow a warm-started ensemble one tree at a time and
# stop once the validation error has failed to improve for PATIENCE rounds in a row.
#
# NOTE(review): x_test / y_test serve as the validation set here, so the stopping
# point is chosen on the same rows that are later called "test" - confirm.
PATIENCE = 10

# Re-create the model inside this cell so that it is safe to re-run: a
# warm-started ensemble that already holds trees cannot be refitted with a
# smaller n_estimators, so restarting the loop at 1 on a fitted model would fail.
gbr = GradientBoostingRegressor(max_depth=3, warm_start=True)

min_val_error = float('inf')
best_n_estimators = 0
error_increasing = 0

for n_trees in range(1, 1000):
    # With warm_start the existing trees are kept and only the new ones are fitted.
    gbr.n_estimators = n_trees
    gbr.fit(x_train, y_train)

    y_pred = gbr.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)

    print('No. of estimators: ', gbr.n_estimators_)
    print('Validation error: ', val_error)

    if val_error < min_val_error:
        # New best: remember the ensemble size that achieved it and reset the counter.
        min_val_error = val_error
        best_n_estimators = n_trees
        error_increasing = 0
    else:
        error_increasing += 1
        if error_increasing == PATIENCE:
            break

# The following cell reads `n_estimators`. Previously that was the loop variable,
# i.e. the size at which the loop *stopped* (best size + PATIENCE, or 999 if it
# never stopped early). Bind it to the size with the lowest validation error instead.
n_estimators = best_n_estimators

No. of estimators:  1
Validation error:  247.09671411536897
No. of estimators:  2
Validation error:  217.85131825208632
No. of estimators:  3
Validation error:  195.05598925393718
No. of estimators:  4
Validation error:  177.25936988023236
No. of estimators:  5
Validation error:  159.25391003470958
No. of estimators:  6
Validation error:  146.66132620161983
No. of estimators:  7
Validation error:  133.0130213725849
No. of estimators:  8
Validation error:  123.2145332667508
No. of estimators:  9
Validation error:  115.0040553143462
No. of estimators:  10
Validation error:  107.2372076475894
No. of estimators:  11
Validation error:  98.6093070101871
No. of estimators:  12
Validation error:  92.99796956110289
No. of estimators:  13
Validation error:  87.99477434001977
No. of estimators:  14
Validation error:  82.68805620695122
No. of estimators:  15
Validation error:  78.29380826946138
No. of estimators:  16
Validation error:  73.94983643062263
No. of estimators:  17
Validation error:  70

Validation error:  26.456510048725807
No. of estimators:  154
Validation error:  26.411304563287125
No. of estimators:  155
Validation error:  26.38895063600104
No. of estimators:  156
Validation error:  26.36842628480943
No. of estimators:  157
Validation error:  26.360027876734026
No. of estimators:  158
Validation error:  26.330698564345106
No. of estimators:  159
Validation error:  26.280812878951924
No. of estimators:  160
Validation error:  26.316954219408657
No. of estimators:  161
Validation error:  26.26242394911281
No. of estimators:  162
Validation error:  26.200579696928585
No. of estimators:  163
Validation error:  26.167266089735296
No. of estimators:  164
Validation error:  26.10075449615608
No. of estimators:  165
Validation error:  26.07290407638351
No. of estimators:  166
Validation error:  26.02252130341703
No. of estimators:  167
Validation error:  25.998899673786564
No. of estimators:  168
Validation error:  25.986731965788646
No. of estimators:  169
Validation err

No. of estimators:  290
Validation error:  23.238561277379684
No. of estimators:  291
Validation error:  23.20957362804191
No. of estimators:  292
Validation error:  23.16467800029386
No. of estimators:  293
Validation error:  23.141705446290555
No. of estimators:  294
Validation error:  23.157763915545637
No. of estimators:  295
Validation error:  23.132795675781747
No. of estimators:  296
Validation error:  23.12356772727035
No. of estimators:  297
Validation error:  23.083925048840793
No. of estimators:  298
Validation error:  23.065526463731974
No. of estimators:  299
Validation error:  23.054567839041777
No. of estimators:  300
Validation error:  23.017684767297087
No. of estimators:  301
Validation error:  23.016249354145092
No. of estimators:  302
Validation error:  22.994517288631837
No. of estimators:  303
Validation error:  22.987374938549745
No. of estimators:  304
Validation error:  22.959609774539427
No. of estimators:  305
Validation error:  22.942807383657268
No. of esti

No. of estimators:  425
Validation error:  21.288058068868782
No. of estimators:  426
Validation error:  21.288302089035806
No. of estimators:  427
Validation error:  21.265048790078765
No. of estimators:  428
Validation error:  21.26051420243857
No. of estimators:  429
Validation error:  21.242299380597917
No. of estimators:  430
Validation error:  21.24054729247757
No. of estimators:  431
Validation error:  21.23947303115706
No. of estimators:  432
Validation error:  21.243040306200214
No. of estimators:  433
Validation error:  21.241832639685548
No. of estimators:  434
Validation error:  21.214487381378095
No. of estimators:  435
Validation error:  21.197769704564358
No. of estimators:  436
Validation error:  21.179398065845668
No. of estimators:  437
Validation error:  21.1678734545952
No. of estimators:  438
Validation error:  21.167586548656598
No. of estimators:  439
Validation error:  21.166259547393395
No. of estimators:  440
Validation error:  21.168615195292535
No. of estima

In [33]:
# Draw a fresh random 80/20 partition before the final fit.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

# Fit a depth-3 model whose size is the `n_estimators` value left behind by
# the early-stopping cell, then score it on the new test rows (R^2).
gbr_best = GradientBoostingRegressor(
    n_estimators=n_estimators,
    max_depth=3,
).fit(x_train, y_train)

y_pred = gbr_best.predict(x_test)
r2_score(y_true=y_test, y_pred=y_pred)

0.8985604135183776