# End to End XGBoost

https://github.com/mattharrison/talks

©2023 MetaSnake

`@__mharrison__`

## Libraries
We will also use SHAP, xgbfir, openpyxl, hyperopt

In [None]:
# for colab
!pip install dtreeviz feature_engine pybaobabdt xgbfir shap

In [None]:
import feature_engine
from feature_engine import encoding, imputation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import sklearn
from sklearn import base, compose, datasets, ensemble, \
    metrics, model_selection, pipeline, preprocessing, tree
import xgboost as xgb
import yellowbrick
import yellowbrick.model_selection as ms
from yellowbrick import classifier

import urllib
import zipfile

In [None]:
# Report the version of each key library so results can be reproduced.
for mod in [xgb, sklearn, yellowbrick, feature_engine]:
    # mod.__name__ is the full module name; slicing str(mod) cut names off
    # (and leaked part of the repr) for anything not exactly 11 chars long.
    print(f'{mod.__name__} {mod.__version__}')

## Datasets

I'll be demoing with Kaggle 2018 survey data


In [None]:
import os
# `import urllib` alone does not guarantee the `request` submodule is loaded;
# import it explicitly so this cell does not depend on other libraries' imports.
import urllib.request

# Download the Kaggle 2018 survey archive once and cache it locally.
local = 'kaggle-survey-2018.zip'
if not os.path.exists(local):
    url = 'https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip'
    # Context managers close the HTTP response and the output file.
    with urllib.request.urlopen(url) as fin:
        data = fin.read()
    with open(local, mode='wb') as fout:
        fout.write(data)
with zipfile.ZipFile(local) as z:
    print(z.namelist())
    with z.open('multipleChoiceResponses.csv') as csv_file:
        kag = pd.read_csv(csv_file)
# Row 0 is kept separately as `kag_questions` (presumably the question text
# for each column); every later row is treated as a survey response.
kag_questions = kag.iloc[0]
raw = kag.iloc[1:]

In [None]:
def topn(ser, n=5, default='other'):
    """Keep the ``n`` most frequent values of ``ser``; replace the rest with ``default``."""
    most_common = ser.value_counts().index[:n]
    keep = ser.isin(most_common)
    return ser.where(keep, default)

def tweak_kag(df):
    """Turn raw survey responses into a modelling frame.

    Steps, in order:

    * keep rows whose ``Q3`` is USA, China or India and whose ``Q6`` is
      "Data Scientist" or "Software Engineer";
    * add engineered columns (dummies for ``Q1``/``Q3``/``Q5``, numeric
      ``age``, ``education``, ``years_exp``, ``compensation``, 0/1 flags for
      Python/R/SQL, and ``title`` copied from ``Q6``);
    * replace spaces in column names with underscores;
    * keep only the columns from ``gender_Male`` onward and drop rows with
      any missing value.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses with (at least) columns ``Q1``-``Q6``, ``Q8``,
        ``Q9`` and ``Q16_Part_1``-``Q16_Part_3``.

    Returns
    -------
    pandas.DataFrame
        The filtered, engineered frame described above.
    """
    return (df
        #.query('Q3.isin(["United States of America", "China", "India"]) '\
        #       'and Q6.isin(["Data Scientist", "Software Engineer"])')
        # Row filter: three countries, two job titles.
        .loc[df.Q3.isin(["United States of America", "China", "India"]) &
             df.Q6.isin(["Data Scientist", "Software Engineer"])]
        # The lambda receives the *filtered* frame, so every derived column
        # below is computed from the surviving rows only.
        .pipe(lambda df_:
            # Q1 -> gender_* dummy columns (first category dropped).
            df_.assign(**(df_.Q1.pipe(pd.get_dummies, drop_first=True, prefix='gender')),
                       # First two characters of Q2, as an integer.
                       age=df_.Q2.str.slice(0,2).astype(int),
                       # Q3 -> country_* dummy columns (first category dropped).
                       **(df_.Q3.pipe(pd.get_dummies, drop_first=True, prefix='country')),
                       # Q4 text -> number (looks like approximate years of
                       # schooling - TODO confirm); "prefer not to answer"
                       # becomes missing and is removed by the final dropna().
                       education=df_.Q4.replace({'Master’s degree': 18,
                         'Bachelor’s degree': 16,
                         'Doctoral degree': 20,
                         'Some college/university study without earning a bachelor’s degree': 13,
                         'Professional degree': 19,
                         'I prefer not to answer': None,
                         'No formal education past high school': 12}),
                       # Q5: keep the 3 most common values (rest -> 'other'),
                       # shorten the labels, then expand to major_* dummies.
                       **(df_.Q5
                              .pipe(topn, n=3)
                              .replace({
                        'Computer science (software engineering, etc.)': 'cs',
                        'Engineering (non-computer focused)': 'eng',
                        'Mathematics or statistics': 'stat'})
                              .pipe(pd.get_dummies, drop_first=True, prefix='major')),
                       title=df_.Q6,
                       # Q8: strip '+', split on '-', keep the first piece as float.
                       years_exp=(df_.Q8.str.replace('+','', regex=False)
                           .str.split('-', expand=True)
                           .iloc[:,0]
                           .astype(float)),
                       # Q9: strip '+' and ',', map '500000' to '500' and the
                       # non-disclosure answer to '0', keep the piece before
                       # '-', fill missing with 0, then multiply by 1,000.
                       # (Presumably '500000' is rewritten so the top bracket
                       # uses the same thousands unit as the others - confirm.)
                       compensation=(df_.Q9.str.replace('+','', regex=False)
                           .str.replace(',','', regex=False)
                           .str.replace('500000', '500', regex=False)
                           .str.replace('I do not wish to disclose my approximate yearly compensation', '0', regex=False)
                           .str.split('-', expand=True)
                           .iloc[:,0]
                           .fillna(0)
                           .astype(int)
                           .mul(1_000)
                                    ),
                       # Q16 parts: missing -> 0, the language name -> 1.
                       python=df_.Q16_Part_1.fillna(0).replace('Python', 1),
                       r=df_.Q16_Part_2.fillna(0).replace('R', 1),
                       sql=df_.Q16_Part_3.fillna(0).replace('SQL', 1)
               )#assign
              
        )#pipe
        # Dummy columns inherit spaces from the answer text; normalise them.
        .rename(columns=lambda col:col.replace(' ', '_'))
        # Label slice: keep 'gender_Male' and every column after it
        # (presumably the first column added by assign - verify on new data).
        .loc[:, 'gender_Male':]   
        .dropna()
       )
kag = tweak_kag(raw)
kag_X = kag.drop(columns='title')
kag_y = (kag.title == 'Data Scientist')
kag_X_train, kag_X_test, kag_y_train, kag_y_test = model_selection.train_test_split(
    kag_X, kag_y, stratify=kag_y, random_state=42)

In [None]:
raw

In [None]:
kag

In [None]:
kag_y

## Stumps, Trees, and Forests

Decision trees use a greedy algorithm to split on a feature (column) that results in the most "pure" split.

In [None]:
# True - DS
kag_y.value_counts()

In [None]:
stump = tree.DecisionTreeClassifier(max_depth=1)
stump.fit(kag_X_train, kag_y_train)
stump.score(kag_X_test, kag_y_test)

In [None]:
# False - SE, True - DS
stump.classes_

In [None]:
# Column names label the split features; class_names follow stump.classes_
# order (False -> SE, True -> DS).
features = list(kag_X_train.columns)
_ = tree.plot_tree(stump, feature_names=features, filled=True, 
                   class_names=['SE', 'DS'], fontsize=10)

## Underfit
A stump is too simple. It has too much *bias*.

Solutions:

* Add more features
* Use a more complex model

For a tree we can let it grow deeper which should do both.

## Overfitting

A model is too complicated. It has too much variance.

Solutions:

* Simplify or constrain (*regularize*)
* Add more samples

For a tree we can prune back the growth so that the leaf nodes are not overly specific.

In [None]:
hi_variance = tree.DecisionTreeClassifier(max_depth=None)
hi_variance.fit(kag_X_train, kag_y_train)
hi_variance.score(kag_X_test, kag_y_test)

In [None]:
# Plot the full unconstrained tree; class_names are ordered False -> SE, True -> DS.
features = list(kag_X_train.columns)
_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, 
                   class_names=['SE', 'DS'])

In [None]:
# limit view to first 2
# max_depth here only limits what is drawn; the fitted tree is unchanged.
features = list(kag_X_train.columns)
_ = tree.plot_tree(hi_variance, feature_names=features, filled=True, 
                   class_names=['SE', 'DS'], max_depth=2, fontsize=10)


## Tree Hyperparameters

*max_\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)

*min_\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)

* 'max_depth=None' - Tree depth
* 'max_features=None' - Amount of features to examine for split
* 'max_leaf_nodes=None' - Number of leafs
* 'min_impurity_decrease=0' - Split only when the split lowers *impurity* by at least this value. (*Impurity* : 0 - 100% accurate, .3 - 70%. Going from 70% to 100% accurate is a decrease of .3) 
* 'min_samples_leaf=1', - Minimum samples at each leaf.
* 'min_samples_split=2' - Minimum samples required to split a node.
* 'min_weight_fraction_leaf=0' - The fraction of the total weights required to be a leaf.


In [None]:
stump.get_params()

## Random Forest

Uses *bagging* to ensemble many trees in an attempt to lower variance.

In [None]:
rf = ensemble.RandomForestClassifier(random_state=42)
rf.fit(kag_X_train, kag_y_train)
rf.score(kag_X_test, kag_y_test)

In [None]:
len(rf.estimators_)

In [None]:
# Inspect the first of the forest's individual trees.
features = list(kag_X_train.columns)
_ = tree.plot_tree(rf.estimators_[0], feature_names=features, filled=True, 
                   class_names=['SE', 'DS'])

## Random Forest Hyperparameters

*max_\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)

*min_\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)

* 'n_estimators=100' - Number of trees - should be *max_estimators*
* 'oob_score=False' - Can estimate score when training (by using rows that weren't randomly selected). No need to hold out data
* 'warm_start=False' - Can add more trees w/o starting over

From tree:

* 'max_depth=None' - Tree depth (1 to Infinity (`None`))
* 'max_features="sqrt"' - Number of features to examine for each split: an int (1 to number of features), a float fraction (0.0 to 1.0), "log2" for log2(n_features), or "sqrt" for sqrt(n_features). (Default is the square root of the number of features.)
* 'max_leaf_nodes=None' - Number of leafs. Default (`None`) is unlimited.
* 'min_impurity_decrease=0' - Split only when the split lowers *impurity* by at least this value. (0.0 to 1.0) (*Impurity* : 0 - 100% accurate, .3 - 70%) 
* 'min_samples_leaf=1', - Minimum samples at each leaf. (1 to n_samples).
* 'min_samples_split=2' - Minimum samples required to split a node. (1 to n_samples)
* 'min_weight_fraction_leaf=0' - The fraction (0.0 to 1.0) of the total weights required to be a leaf.

In [None]:
rf.get_params()

In [None]:
# visualize how changing n_estimators affects score
results = []
rf_ws = ensemble.RandomForestClassifier(random_state=42, warm_start=True, n_estimators=1)
rf_ws.fit(kag_X_train, kag_y_train)
for i in range(2,100):
    rf_ws.set_params(n_estimators=i)
    rf_ws.fit(kag_X_train, kag_y_train)
    # see other metrics
    results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))
pd.Series(results, index=range(2, 100)).plot(figsize=(8,4))    

In [None]:
# visualize how changing max_depth affects score
results = []
train_results = []
vals = list(range(1,20))
for i in vals:
    rf_ws = ensemble.RandomForestClassifier(random_state=42, 
                                            max_depth=i)
    rf_ws.fit(kag_X_train, kag_y_train)
    results.append(metrics.f1_score(kag_y_test, rf_ws.predict(kag_X_test)))
    train_results.append(metrics.f1_score(kag_y_train, rf_ws.predict(kag_X_train)))
ax = pd.Series(results, index=vals, name='test').plot(figsize=(8,4))    
pd.Series(train_results, index=vals, name='train').plot(ax=ax)
ax.legend()

## XGBoost

Uses *boosting* to train a series of (weak) trees that try to correct the error of the previous output. (For classification this is mapped to a probability)

Like golfing (you continue to putt or use a different club depending on first error). Decision tree would be a single tee off. Random forest would be averaging the tee offs. 

* Regularization
* Parallel Processing
* Missing Number Support
* Category Support

In [None]:
xg = xgb.XGBClassifier()
xg.fit(kag_X_train, kag_y_train)
xg.score(kag_X_test, kag_y_test)

In [None]:
# Let's try w/ depth of 2 and 2 trees
xg = xgb.XGBClassifier(max_depth=2, n_estimators=2)
xg.fit(kag_X_train, kag_y_train)
xg.score(kag_X_test, kag_y_test)

In [None]:
# first tree
# leaf values are log-odds (*logit*), not probabilities
xgb.to_graphviz(xg, size='1,1', num_trees=0, fontsize='1')

In [None]:
# second tree
xgb.to_graphviz(xg, size='1,1', num_trees=1, fontsize='1')

In [None]:
# let's go down the left path with
# this data
row = pd.Series({'gender_Male': 0.0, 'gender_Prefer_not_to_say': 0.0, 
    'gender_Prefer_to_self-describe': 0.0, 'age': 30.0, 'country_India': 0.0, 
    'country_United_States_of_America': 1.0, 'education': 16.0, 'major_eng': 0.0, 
    'major_other': 0.0, 'major_stat': 0.0, 'years_exp': 0.0, 'compensation': 0.0, 
    'python': 0.0, 'r': 0.0, 'sql': 0.0}).to_frame().T
row

In [None]:
# result for DS = .4522
# < .5 ... so Software Engineer!
# this is [prob SE, prob DS]
xg.predict_proba(row)

In [None]:
xg.predict(row)

In [None]:
# sum up the leaf values and pass the total through the inverse logit
# Example: no r, low ed, low exp
# -.251 + 0.0602

vals = np.linspace(-10, 10)
def inv_logit(p):
    """Map log-odds ``p`` to a probability with the logistic function.

    Mathematically ``exp(p) / (1 + exp(p))``, but evaluated as
    ``exp(-log(1 + exp(-p)))`` so that large ``|p|`` saturates at 0 or 1
    instead of overflowing to ``inf / inf`` (``nan``).

    Parameters
    ----------
    p : float or array-like
        Log-odds (the summed leaf values of the boosted trees).

    Returns
    -------
    float or numpy.ndarray
        Probability in ``[0, 1]``, same shape as ``p``.
    """
    # logaddexp(0, -p) == log(1 + exp(-p)) without forming exp(-p) directly.
    return np.exp(-np.logaddexp(0, -p))

x = -.251 + 0.0602
y = inv_logit(-.251 + 0.0602)
print(f'({x:.2}, {y:.2})')
fig, ax = plt.subplots(figsize=(6,4))
ax.plot(vals, inv_logit(vals))
ax.plot([x], [y], marker='o')
ax.set_xlim([-5, 5])
_ = ax.set_xticks([-3, -2, -1, 0, 1, 2, 3])
_ = ax.set_yticks([0,.4, .5, .6, 1])

## Early Stopping
Because you can keep "putting" you can keep track of how far away you are from the hole and stop when you are closest.

In [None]:
# defaults
# 100 putts
xg = xgb.XGBClassifier()
xg.fit(kag_X_train, kag_y_train)
xg.score(kag_X_test, kag_y_test)

In [None]:
# Early stopping
# Go up to 100 but stop after you haven't improved for 20 hits
# Min value at round 9

xg = xgb.XGBClassifier(early_stopping_rounds=20)
xg.fit(kag_X_train, kag_y_train,
       eval_set=[(kag_X_train, kag_y_train),
                 (kag_X_test, kag_y_test)], verbose=10)
xg.score(kag_X_test, kag_y_test)

In [None]:
xg.best_iteration

In [None]:
# we can get the evaluation metrics
# validation_0 is for training data
# validation_1 is for testing data
results = xg.evals_result()
results

In [None]:
# Testing score is best at 11 trees
results = xg.evals_result()
# Row i holds the loss after i + 1 trees. Relabel the index rather than
# calling .shift(), which pushed the final boosting round off the end of
# the frame (and left a NaN first row).
ax = (pd.DataFrame({'training': results['validation_0']['logloss'],
                    'testing': results['validation_1']['logloss'],
                   })
      .rename(index=lambda i: i + 1)
      .plot(figsize=(5,4)))
ax.set_xlabel('ntrees')

## XGBoost Hyperparameters

*max_\** parameters - Raise to make more complex (overfit|more variance), lower to simplify (underfit|more bias)

*min_\** parameters - Lower to make more complex (overfit|more variance), raise to simplify (underfit|more bias)

* Boosting

  * ``n_estimators=100`` - number of trees (or boosting rounds). Larger is more complex. Default 100. Use ``early_stopping_rounds`` with ``.fit`` to prevent overfitting.

  * ``learning_rate=.3`` (called ``eta`` too) - after each boosting step, shrink feature weights. Smaller is more conservative (but needs more trees). Can be traded off against n_estimators to adjust time to convergence. [0,1], default .3

  * ``gamma=0`` / ``min_split_loss`` - L0 regularization. Global regularization. Minimum loss required for split. Larger is more conservative. [0, ∞], default 0 - No regularization.


* Regularization

  * ``reg_lambda=1`` - L2 regularization (penalizes the sum of squared leaf weights). Increase to be more conservative. Default 1
  * ``reg_alpha=0`` - L1 regularization (penalizes the sum of absolute leaf weights). Increase to be more conservative. Default 0

* Sampling - Use different rows

  * ``subsample=1`` - Use % of samples (this is rows!) for next boosting round. Lower to be more conservative. [0, 1], default 1. (When not equal to 1.0, model does *stochastic gradient descent*, i.e. there is some randomness in the model.)


New tree (sampling) parameters - Use different columns (not rows!):

  * ``colsample_bytree=1`` - Fraction of columns for each boosting round.
  
  * ``colsample_bylevel=1`` - Fraction of columns for each depth level.
  
  * ``colsample_bynode=1`` - Fraction of columns for each node.
  

From tree:

  * ``max_depth=6`` - depth of tree. Larger is more complex (more likely to overfit). How many feature interactions you can have. Each level doubles time. [0, ∞], default 6
  * ``min_child_weight=1`` - Stop splitting after certain amount of purity. Larger will be more conservative.


Imbalanced data:

* ``scale_pos_weight=1`` -  ratio negative/positive. Default 1
* Use ``'auc'`` or ``'aucpr'`` for ``eval_metric`` metric (rather than classification default ``'logloss'``)
* ``max_delta_step=0`` - try values from 1-10. Default 0





In [None]:
# try gamma on xgb
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X, kag_y,
                    param_name='gamma', param_range=[0, .5, 1,2,5,10, 20])

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X, kag_y,
                    param_name='max_depth', param_range=[1,2,3,4,5,10])

In [None]:
# note this depends on n_estimators
# should really use early stopping but yellowbrick doesn't support this 😢
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X, kag_y,
                    param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1])

In [None]:
params = {'learning_rate': 0.1,
 'max_depth': 3,
 'n_estimators': 200,
 'n_jobs': -1,
 'random_state': 42,
 'reg_lambda': 0,
 'subsample': 1}

In [None]:
# this takes a while to run (about 2 minutes)
# can set scoring in GridSearchCV to 
# recall, precision, f1, accuracy
params = {'reg_lambda': [0],  # No effect
          'learning_rate': [.1, .3], # smaller makes each boost more conservative (1 - no shrinkage)
          #'colsample_bylevel': [.3, 1], # use 0, 50%, or 100% of columns in boost step
          'subsample': [.7, 1],
          #'gamma': [0, 1],
          'max_depth': [1, 2, 3],
          'random_state': [42],
          'n_jobs': [-1],
          #'early_stopping_rounds':[10],
          'n_estimators': [200]}
# early_stopping_rounds is an estimator setting (same form as the
# early-stopping cell earlier in this notebook); current XGBoost no longer
# accepts it as a .fit() argument. GridSearchCV clones keep it.
kag_xgb2 = xgb.XGBClassifier(early_stopping_rounds=5)
# NOTE: early stopping is monitored on the test split, so scores later
# reported on that same split are optimistic.
cv = (model_selection.GridSearchCV(kag_xgb2, params, cv=3)#, n_jobs=-1)
    .fit(kag_X_train, kag_y_train,
         eval_set=[(kag_X_test, kag_y_test)],
         verbose=10) 
     )

In [None]:
cv.best_params_

In [None]:
# vs default
params = {'learning_rate': 0.3,
 'max_depth': 2,
 'n_estimators': 200,
 'n_jobs': -1,
 'random_state': 42,
 'reg_lambda': 0,
 'subsample': 0.7}
xgb_def2 = xgb.XGBClassifier()
xgb_def2.fit(kag_X_train, kag_y_train)

xgb_grid2 = xgb.XGBClassifier(**params)
xgb_grid2.fit(kag_X_train, kag_y_train)
xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)

## Bonus: Tuning with Hyperopt


In [None]:
!pip install hyperopt

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import accuracy_score  
#https://bradleyboehmke.github.io/xgboost_databricks_tuning/index.html#slide21
space = {
    'learning_rate': hp.loguniform('learning_rate', -7, 0),
    'max_depth': hp.quniform('max_depth', 1, 12, 1),
    'min_child_weight': hp.loguniform('min_child_weight', -2, 3),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma', -10, 10),
    'reg_alpha': hp.loguniform('alpha', -10, 10),
    'reg_lambda': hp.loguniform('lambda', -10, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'seed': 123,
}

In [None]:
def hyperparameter_tuning(space):
    """Hyperopt objective: fit one XGBoost classifier and score it.

    Parameters
    ----------
    space : dict
        One point sampled from the search space. ``max_depth``, ``gamma``,
        ``reg_alpha``, ``min_child_weight`` and ``colsample_bytree`` are
        required. ``learning_rate``, ``subsample``, ``reg_lambda``,
        ``objective``, ``eval_metric`` and ``seed`` are applied when present
        (a missing key leaves XGBoost's default in place).

    Returns
    -------
    dict
        ``loss`` (negative test accuracy, since ``fmin`` minimizes),
        ``status`` and the fitted ``model``.
    """
    model = xgb.XGBClassifier(
        # hp.quniform samples floats; tree depth must be an integer.
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        # Keep the sampled float: int() truncated every value below 1 to 0,
        # which silently switched L1 regularization off.
        reg_alpha=space['reg_alpha'],
        min_child_weight=space['min_child_weight'],
        colsample_bytree=space['colsample_bytree'],
        # These were sampled by the search but never handed to the model.
        learning_rate=space.get('learning_rate'),
        subsample=space.get('subsample'),
        reg_lambda=space.get('reg_lambda'),
        objective=space.get('objective'),
        # Use the metric from the space (classification metric) instead of
        # the regression metric "rmse".
        eval_metric=space.get('eval_metric'),
        random_state=space.get('seed'),
        # Estimator-level setting; current XGBoost rejects it in .fit().
        early_stopping_rounds=10)
    evaluation = [(kag_X_train, kag_y_train),
            (kag_X_test, kag_y_test)]
    # NOTE: the last eval_set entry (the test split) drives early stopping,
    # so the accuracy below is measured on data the model was tuned against.
    model.fit(kag_X_train, kag_y_train,
              eval_set=evaluation, verbose=False)

    pred = model.predict(kag_X_test)
    # Compare as booleans so the predictions match kag_y_test's boolean labels.
    accuracy = accuracy_score(kag_y_test, pred>0.5)
    print ("SCORE:", accuracy)
    #change the metric if you like
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

In [None]:
trials = Trials()
best = fmin(fn=hyperparameter_tuning,            
    space=space,           
    algo=tpe.suggest,            
    max_evals=1000,            
    trials=trials,
    #timeout=60*5 # 5 minutes
           )
print (best)

In [None]:
best # new

In [None]:
# Best parameters reported by the hyperopt run ('alpha'/'lambda' are
# XGBoost's own names for reg_alpha/reg_lambda).
hyper_params ={'alpha': 0.19514909424102928,
 'colsample_bytree': 0.8227256149391048,
 'gamma': 0.010701959121627006,
 'lambda': 0.010955985134796302,
 'learning_rate': 0.004570442245136879,
 'max_depth': 3, 
 'min_child_weight': 0.2497193683952876,
 'subsample': 0.6416201529297743}
# early_stopping_rounds belongs on the estimator (same form as the
# early-stopping cell earlier in this notebook); the obsolete
# use_label_encoder flag is dropped.
xgb_hyp = xgb.XGBClassifier(**hyper_params, eval_metric='logloss', 
                            early_stopping_rounds=10,
                            n_estimators=2_000)
evaluation = [(kag_X_train, kag_y_train),
            (kag_X_test, kag_y_test)]
# Early stopping watches the last eval_set entry (the test split).
xgb_hyp.fit(kag_X_train, kag_y_train,
           eval_set=evaluation)
xgb_hyp.score(kag_X_test, kag_y_test)

[573]	validation_0-logloss:0.52103	validation_1-logloss:0.56339
[574]	validation_0-logloss:0.52094	validation_1-logloss:0.56329
[575]	validation_0-logloss:0.52085	validation_1-logloss:0.56328
[576]	validation_0-logloss:0.52075	validation_1-logloss:0.56323
[577]	validation_0-logloss:0.52067	validation_1-logloss:0.56315
[578]	validation_0-logloss:0.52060	validation_1-logloss:0.56308
[579]	validation_0-logloss:0.52052	validation_1-logloss:0.56303
[580]	validation_0-logloss:0.52046	validation_1-logloss:0.56300
[581]	validation_0-logloss:0.52036	validation_1-logloss:0.56295
[582]	validation_0-logloss:0.52026	validation_1-logloss:0.56293
[583]	validation_0-logloss:0.52017	validation_1-logloss:0.56284
[584]	validation_0-logloss:0.52009	validation_1-logloss:0.56277
[585]	validation_0-logloss:0.51998	validation_1-logloss:0.56269
[586]	validation_0-logloss:0.51991	validation_1-logloss:0.56270
[587]	validation_0-logloss:0.51986	validation_1-logloss:0.56272
[588]	validation_0-logloss:0.51976	valid

[702]	validation_0-logloss:0.51111	validation_1-logloss:0.55851
[703]	validation_0-logloss:0.51105	validation_1-logloss:0.55845
[704]	validation_0-logloss:0.51098	validation_1-logloss:0.55843
[705]	validation_0-logloss:0.51089	validation_1-logloss:0.55836
[706]	validation_0-logloss:0.51082	validation_1-logloss:0.55832
[707]	validation_0-logloss:0.51074	validation_1-logloss:0.55829
[708]	validation_0-logloss:0.51068	validation_1-logloss:0.55824
[709]	validation_0-logloss:0.51060	validation_1-logloss:0.55825
[710]	validation_0-logloss:0.51054	validation_1-logloss:0.55823
[711]	validation_0-logloss:0.51048	validation_1-logloss:0.55817
[712]	validation_0-logloss:0.51043	validation_1-logloss:0.55818
[713]	validation_0-logloss:0.51037	validation_1-logloss:0.55814
[714]	validation_0-logloss:0.51031	validation_1-logloss:0.55815
[715]	validation_0-logloss:0.51024	validation_1-logloss:0.55806
[716]	validation_0-logloss:0.51016	validation_1-logloss:0.55805
[717]	validation_0-logloss:0.51007	valid

0.7253814147018031

In [None]:
xgb_hyp.score(kag_X_test, kag_y_test)

In [None]:
# vs default and grid
xgb_def2.score(kag_X_test, kag_y_test), xgb_grid2.score(kag_X_test, kag_y_test)

In [None]:
# Side-by-side view of the grid-search and hyperopt model parameters.
grid = xgb_grid2.get_params()
hyp = xgb_hyp.get_params()
for k in grid:
    # Blank out only unset (None) parameters. `value or ""` also hid real
    # settings such as reg_lambda=0 and False flags. str() keeps the width
    # spec safe for values that do not implement format specs.
    grid_val = '' if grid[k] is None else str(grid[k])
    hyp_val = '' if hyp.get(k) is None else str(hyp.get(k))
    print(f'{k=:20} grid:{grid_val:20} hyp:{hyp_val}')

## Model Evaluation
Now that we've tuned our model, let's look at how it performs

In [None]:
# Refit the hyperopt-tuned model for the evaluation plots below.
hyper_params ={'alpha': 0.19514909424102928,
 'colsample_bytree': 0.8227256149391048,
 'gamma': 0.010701959121627006,
 'lambda': 0.010955985134796302,
 'learning_rate': 0.004570442245136879,
 'max_depth': 3, 
 'min_child_weight': 0.2497193683952876,
 'subsample': 0.6416201529297743}
# early_stopping_rounds is set on the estimator (same form as the
# early-stopping cell earlier in this notebook) rather than passed to .fit().
xgb_hyp = xgb.XGBClassifier(**hyper_params,
                           early_stopping_rounds=10,
                           n_estimators=2_000)
evaluation = [(kag_X_train, kag_y_train),
            (kag_X_test, kag_y_test)]
# Early stopping watches the last eval_set entry (the test split).
xgb_hyp.fit(kag_X_train, kag_y_train,
           eval_set=evaluation, verbose=100)

In [None]:
metrics.accuracy_score(kag_y_test, xgb_hyp.predict(kag_X_test))

In [None]:
metrics.precision_score(kag_y_test, xgb_hyp.predict(kag_X_test))

In [None]:
metrics.recall_score(kag_y_test, xgb_hyp.predict(kag_X_test))

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
classifier.confusion_matrix(xgb_hyp, kag_X_train, kag_y_train,
                            kag_X_test, kag_y_test,
                            classes=['SE', 'DS']
                           )

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
metrics.RocCurveDisplay.from_estimator(xgb_hyp,
                       kag_X_test, kag_y_test,ax=ax)

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
classifier.precision_recall_curve(xgb_hyp, kag_X_train, kag_y_train,
                   kag_X_test, kag_y_test,
                   classes=['SE', 'DS'],
                   micro=False, macro=False
                   )

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
classifier.classification_report(xgb_hyp, kag_X_train, kag_y_train,
                   kag_X_test, kag_y_test,
                   classes=['SE', 'DS'],
                   micro=False, macro=False
                   )

## Training For Different Metrics

We tuned our model. But we tuned it against accuracy. What if we want to optimize for recall?

In [None]:
# accuracy tuning
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X_train, kag_y_train,
    #                param_name='max_depth', param_range=[1,2,5,10]
                    param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]
    )

In [None]:
# precision tuning - see scoring param
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X_train, kag_y_train,
                    scoring='precision',
                    #param_name='max_depth', param_range=[1,2,5,10]
                    param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]
                   )

In [None]:
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(xgb.XGBClassifier(),
                    kag_X_train, kag_y_train,
                    scoring='f1',
                    #param_name='max_depth', param_range=[1,2,5,10]
                    param_name='learning_rate', param_range=[0.001, .01, .1, .2, .5, .9, 1]
                   )

## Model Interpretation

In [None]:
# Trees are great when they overfit... They can explain what they overfit
# (You can use these for "surrogate models")
hi_variance = tree.DecisionTreeClassifier(max_depth=None)
hi_variance.fit(kag_X_train, kag_y_train)
hi_variance.score(kag_X_test, kag_y_test)

In [None]:
# Feature importance shows the magnitude (not direction) of impact
(pd.Series(hi_variance.feature_importances_, index=kag_X_train.columns)
 .sort_values()
 .plot.barh()
)

In [None]:
# XGBoost also supports feature importance
xgb_def = xgb.XGBClassifier()
xgb_def.fit(kag_X_train, kag_y_train)

In [None]:
(pd.Series(xgb_def.feature_importances_, index=kag_X_train.columns)
 .sort_values()
 .plot.barh()
)

In [None]:
# Feature importance is specific to model/hyperparameters
(pd.Series(xgb_hyp.feature_importances_, index=kag_X_train.columns)
 .sort_values()
 .plot.barh()
)

In [None]:
# * "weight" is the number of times a feature appears in a tree
# * "gain" is the average gain of splits which use the feature
# * "cover" is the average coverage of splits which use the feature
xgb.plot_importance(xgb_def, importance_type='cover')

## Bonus: xgbfir (Feature Interactions Reshaped)
 *Gain*: Total gain of each feature or feature interaction
 
 *FScore*: Amount of possible splits taken on a feature or feature Interaction
 
 *wFScore*: Amount of possible splits taken on a feature or feature interaction weighted by the probability of the splits to take place
 
 *Average wFScore*: wFScore divided by FScore
 
 *Average Gain*: Gain divided by FScore
 
 *Expected Gain*: Total gain of each feature or feature interaction weighted by the probability to gather the gain


In [None]:
!pip install openpyxl

In [None]:
import xgbfir
xgbfir.saveXgbFI(xgb_def, feature_names=kag_X_train.columns, OutputXlsxFile='fir.xlsx')
pd.read_excel('fir.xlsx')

In [None]:
pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 1')

In [None]:
pd.read_excel('fir.xlsx', sheet_name='Interaction Depth 2')

# SHAP (SHapley Additive exPlanations)
Should be *globally* consistent and accurate

 Shapley value (SHAP).
 
 From game theory, indicates how to distribute attribution of label



In [None]:
import shap
shap.initjs()

# make sure you initialize the js side
shap_ex = shap.TreeExplainer(xgb_def)
vals = shap_ex(kag_X_test)

In [None]:
vals

In [None]:
# Let's explain an individual
kag_X_test.iloc[0]

In [None]:
xgb_def.predict(kag_X_test.iloc[[0]])  # predicts SE... why?

In [None]:
# label is also SE
kag_y_test.iloc[0]

In [None]:
# the base value. We sum up the scores.
# > 0 Positive Case
shap_ex.expected_value

In [None]:
# < 0 therefore ... SE
shap_ex.expected_value + vals.values[0].sum()

In [None]:
# blue - SE
# red - DS

shap.initjs()
shap.plots.waterfall(vals[0])

In [None]:
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'years_exp'], ax=ax)

In [None]:
# with jitter/alpha
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'years_exp'], ax=ax, x_jitter=.5, alpha=.5)

In [None]:
# with jitter/alpha
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'years_exp'], ax=ax, x_jitter=.5, alpha=.5)

In [None]:
# add interaction (color)
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'r'], color=vals, ax=ax, x_jitter=.5, alpha=.5)

In [None]:
# specify interaction
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'years_exp'], color=vals[:, 'education'], ax=ax, x_jitter=.5, alpha=.5)

In [None]:
shap.plots.beeswarm(vals, alpha=.5)

# Summary

XGBoost is very powerful. Combining with other tools will take you a long way.

Explore your data and your results.

Lots of libraries. Some are better integrated.

Suggestions:

* Pandas skills come in useful for manipulating data
* Make sure you discuss business value with stakeholders


Questions?


Connect on LinkedIn or Twitter `@__mharrison__`

In [None]:
import random
random.randrange(6)

In [None]:
random.randrange(1,5)