In [2]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

import data_prep as dp

### Gradient Booster and XGBoost

Boosting models are trained sequentially: we first train a model, then train the next model on the errors of the previous one, and so on.

- from [here](https://vitalflux.com/bagging-vs-boosting-machine-learning-methods/)

The idea behind boosting is to train a series of weak models and then combine the predictions of those models to create a strong model. Unlike bagging, which trains multiple models independently, boosting trains each new model such that it focuses on correcting the errors made by the previous model. By training a series of weak models and combining their predictions, you can create a strong model that has high accuracy.

One of the benefits of using boosting in machine learning is that it can help to improve the accuracy of a classifier. This is because boosting can be used to combine the predictions of a number of different weak classifiers, which can result in a more accurate overall classification. Additionally, boosting can also be used to improve the robustness of a classifier, meaning that it is less likely to be affected by noise or inaccuracies in the data. Boosting can also be used to improve the generalization ability of the classifiers.

![image](https://vitalflux.com/wp-content/uploads/2022/11/boosting-vs-bagging-differences-examples.png)

In [3]:
# train / validation / test splits prepared by the data_prep module
X_train, y_train = dp.X_train, dp.y_train
X_val, y_val = dp.X_val, dp.y_val
X_test, y_test = dp.X_test, dp.y_test

# object that later supplies the feature names for the DMatrix
dv = dp.dv

In [4]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on old installs.
# NOTE(review): assumes dv is a scikit-learn vectorizer -- confirm in data_prep.
if hasattr(dv, 'get_feature_names_out'):
    # get_feature_names_out() returns an array; hand xgboost a plain list of str
    features = [str(name) for name in dv.get_feature_names_out()]
else:
    features = dv.get_feature_names()

# wrap the train and validation data in xgboost's DMatrix container,
# attaching the labels and the column names
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [5]:
# parameters handed to xgb.train
xgb_params = dict(
    # tree booster settings
    eta=0.3,              # learning rate
    max_depth=6,
    min_child_weight=1,   # roughly the counterpart of sklearn's min_samples_leaf

    # learning task
    objective='binary:logistic',  # binary classification
    eval_metric='auc',

    # runtime
    nthread=8,
    seed=1,
    verbosity=1,          # logging level (0 = silent, 1 = warning, 2 = info, 3 = debug)
)

In [15]:
# baseline booster: 20 boosting rounds on the training DMatrix
model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=20,
)

In [16]:
# score the validation DMatrix with the trained booster
# (with objective 'binary:logistic' these are predicted probabilities)
y_pred = model.predict(dval)

In [17]:
# validation ROC AUC of the 20-round model
# (roc_auc_score is imported in the imports cell at the top of the notebook)
roc_auc_score(y_val, y_pred)

0.8181110895836865

In [18]:
# (DMatrix, label) pairs passed to xgb.train as `evals`; the label prefixes
# the metric name in the training log (e.g. train-auc, val-auc)
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [21]:
# booster settings (same values as the dictionary defined earlier in the notebook)
xgb_params = {
    # tree booster
    'eta': 0.3,  # learning rate
    'max_depth': 6,
    'min_child_weight': 1,  # roughly the counterpart of sklearn's min_samples_leaf
    # learning task
    'objective': 'binary:logistic',  # binary classification
    'eval_metric': 'auc',
    # runtime
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,  # logging level
}

# 200 rounds this time, reporting AUC on both sets while training
model = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=200,
    evals=watchlist,  # evaluate on the train and validation sets
    verbose_eval=5,   # print the metrics only every 5th round
)

[0]	train-auc:0.86730	val-auc:0.77938
[5]	train-auc:0.93086	val-auc:0.80858
[10]	train-auc:0.95447	val-auc:0.80851
[15]	train-auc:0.96554	val-auc:0.81334
[20]	train-auc:0.97464	val-auc:0.81729
[25]	train-auc:0.97953	val-auc:0.81686
[30]	train-auc:0.98579	val-auc:0.81543
[35]	train-auc:0.99011	val-auc:0.81206
[40]	train-auc:0.99421	val-auc:0.80922
[45]	train-auc:0.99548	val-auc:0.80842
[50]	train-auc:0.99653	val-auc:0.80918
[55]	train-auc:0.99765	val-auc:0.81114
[60]	train-auc:0.99817	val-auc:0.81172
[65]	train-auc:0.99887	val-auc:0.80798
[70]	train-auc:0.99934	val-auc:0.80870
[75]	train-auc:0.99965	val-auc:0.80555
[80]	train-auc:0.99979	val-auc:0.80549
[85]	train-auc:0.99988	val-auc:0.80374
[90]	train-auc:0.99993	val-auc:0.80409
[95]	train-auc:0.99996	val-auc:0.80548
[100]	train-auc:0.99998	val-auc:0.80509
[105]	train-auc:0.99999	val-auc:0.80629
[110]	train-auc:1.00000	val-auc:0.80637
[115]	train-auc:1.00000	val-auc:0.80494
[120]	train-auc:1.00000	val-auc:0.80574
[125]	train-auc:1.0000

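The model with 200 iterations is definitely overfitting: the training AUC climbs to 1.0 while the validation AUC peaks around iteration 20 (≈0.817) and then drifts down. To capture the long output we can use the cell magic `%%capture`, which saves the output into a variable.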

In [22]:
%%capture output
model = xgb.train(xgb_params, 
                    dtrain, 
                    evals=watchlist, # to print train and val sets auc score
                    verbose_eval = 5, # to print out only every 5th step
                    num_boost_round=200)

In [26]:
# show the text captured by %%capture: one line of train/val AUC per logged round
print(output.stdout)

[0]	train-auc:0.86730	val-auc:0.77938
[5]	train-auc:0.93086	val-auc:0.80858
[10]	train-auc:0.95447	val-auc:0.80851
[15]	train-auc:0.96554	val-auc:0.81334
[20]	train-auc:0.97464	val-auc:0.81729
[25]	train-auc:0.97953	val-auc:0.81686
[30]	train-auc:0.98579	val-auc:0.81543
[35]	train-auc:0.99011	val-auc:0.81206
[40]	train-auc:0.99421	val-auc:0.80922
[45]	train-auc:0.99548	val-auc:0.80842
[50]	train-auc:0.99653	val-auc:0.80918
[55]	train-auc:0.99765	val-auc:0.81114
[60]	train-auc:0.99817	val-auc:0.81172
[65]	train-auc:0.99887	val-auc:0.80798
[70]	train-auc:0.99934	val-auc:0.80870
[75]	train-auc:0.99965	val-auc:0.80555
[80]	train-auc:0.99979	val-auc:0.80549
[85]	train-auc:0.99988	val-auc:0.80374
[90]	train-auc:0.99993	val-auc:0.80409
[95]	train-auc:0.99996	val-auc:0.80548
[100]	train-auc:0.99998	val-auc:0.80509
[105]	train-auc:0.99999	val-auc:0.80629
[110]	train-auc:1.00000	val-auc:0.80637
[115]	train-auc:1.00000	val-auc:0.80494
[120]	train-auc:1.00000	val-auc:0.80574
[125]	train-auc:1.0000

In [28]:
# keep the captured log as a plain string and peek at its first five lines
s = output.stdout
s.split('\n')[:5]

['[0]\ttrain-auc:0.86730\tval-auc:0.77938',
 '[5]\ttrain-auc:0.93086\tval-auc:0.80858',
 '[10]\ttrain-auc:0.95447\tval-auc:0.80851',
 '[15]\ttrain-auc:0.96554\tval-auc:0.81334',
 '[20]\ttrain-auc:0.97464\tval-auc:0.81729']

In [30]:
# one list entry per log line; each line is tab-separated into
# iteration tag, train metric and validation metric
lines = s.split('\n')
lines[0].split('\t')

['[0]', 'train-auc:0.86730', 'val-auc:0.77938']

In [31]:
# unpack the three tab-separated fields of the first log line into variables
# (all three are still raw strings at this point)
num_iterations, train_score, val_score = lines[0].split('\t')


In [32]:
# inspect the raw iteration tag: a string with the round index in square brackets
num_iterations

'[0]'

In [34]:
# get the integer out of the string: strip the surrounding brackets, then convert
int(num_iterations.strip('[]'))

0

In [35]:
# inspect the raw train_score string: metric name and value joined by a colon
train_score

'train-auc:0.86730'

In [41]:
# turn train_score into a number: split "name:value" once and reuse the pieces
score_parts = train_score.split(':')
print(score_parts)
score_text = score_parts[1]
print(score_text)
float(score_text)

['train-auc', '0.86730']
0.86730


0.8673

In [42]:
# inspect the raw val_score string (same "name:value" layout as train_score)
val_score

'val-auc:0.77938'

In [43]:
# turn val_score into a number: take the part after the colon and convert it
float(val_score.split(':')[1])

0.77938

Put everything into a function

In [None]:
def parse_xgb_output(output):
    ''' 
    Returns 3 lists: number of iterations, scores for train, scores for validation
    '''
    iterations = []
    train_scores = []
    validations_scores = []

    # access every line of the code
    for line in output.stdout.strip().split('\n'):
        # split the line by tabulation
        num_iterations, train_score, val_score = lines[0].split('\t')
        # save numeric values from the strings