# XGBoost Classification Template

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn import datasets
import xgboost as xgb

In [None]:

# Load the scikit-learn breast-cancer dataset as pandas objects.
dbunch = datasets.load_breast_cancer(as_frame=True)
df = dbunch.frame
features, target_names = dbunch.feature_names, dbunch.target_names
target = 'target'  # label column of `df`, used as df[target] by the cells below
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# Size of the hold-out set, given as an absolute number of rows.
n_valid = 50

train_df, valid_df = train_test_split(
    df,
    test_size=n_valid,
    random_state=42,  # fixed seed so the split is reproducible
)
tuple(part.shape for part in (train_df, valid_df))

### Training with the `train` function

In [None]:
# Booster settings: exact tree construction with a binary logistic objective.
params = dict(tree_method='exact', objective='binary:logistic')
num_boost_round = 50

# Wrap both splits in DMatrix objects, the input type expected by xgb.train.
dtrain = xgb.DMatrix(data=train_df[features], label=train_df[target])
dvalid = xgb.DMatrix(data=valid_df[features], label=valid_df[target])
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    # Both sets are evaluated during training and reported under these names.
    evals=[
        (dtrain, 'train'),
        (dvalid, 'valid'),
    ],
    verbose_eval=10,
)

### Training with `XGBClassifier`

In [None]:
# Same booster settings as the `train` example, passed through the sklearn wrapper.
params = dict(tree_method='exact', objective='binary:logistic')
num_boost_round = 50

clf = xgb.XGBClassifier(n_estimators=num_boost_round, **params)
clf.fit(
    train_df[features],
    train_df[target],
    # Evaluate on the training split first, then on the validation split.
    eval_set=[(part[features], part[target]) for part in (train_df, valid_df)],
    verbose=10,
);

### Evaluating the Model

In [None]:
# Hold-out labels, hard predictions and scores reused by the metric cells below.
y_true = valid_df[target]
y_pred = clf.predict(valid_df[features])
# Keep only column 1 of the (n_samples, 2) probability matrix.
y_score = clf.predict_proba(valid_df[features])[:, 1]

In [None]:
from sklearn import metrics

# Share of validation rows whose predicted label equals the true label.
metrics.accuracy_score(y_true=y_true, y_pred=y_pred)

In [None]:
# Per-class precision / recall / F1, labelled with the dataset's class names.
print(
    metrics.classification_report(
        y_true,
        y_pred,
        target_names=target_names,
    )
)

In [None]:
# Area under the ROC curve, computed from the predicted probabilities in y_score.
metrics.roc_auc_score(y_true=y_true, y_score=y_score)

### Feature Importance

In [None]:
from sklearn.inspection import permutation_importance
from sklearn.metrics import make_scorer  # not needed in this cell; kept so any cell relying on this import still runs

# Score with negative log loss computed from predict_proba.
# The built-in 'neg_log_loss' scorer replaces
# make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True):
# `needs_proba` was deprecated in scikit-learn 1.4 and later removed, whereas
# the named scorer exists in old and new releases and has the same sign
# convention (higher is better, i.e. the log loss is negated).
scorer = metrics.get_scorer('neg_log_loss')

# Permute each feature 30 times on the validation rows and record the drop in
# score; random_state fixes the shuffles so the result is reproducible.
permu_imp = permutation_importance(clf, valid_df[features], valid_df[target],
                                   n_repeats=30, random_state=0, scoring=scorer)

In [None]:
# Mean importance per feature, indexed by feature name.
importances_permutation = pd.Series(permu_imp['importances_mean'], index=features)

# Plot the ten largest values; ascending order puts the largest bar at the top.
(
    importances_permutation
    .sort_values(ascending=True)
    .iloc[-10:]
    .plot.barh()
)
plt.title('Permutation Importance on Out-of-Sample Set')
plt.xlabel('change in log likelihood');


## Multi-Class Classification Example

In [None]:
# Fetch the forest cover-type dataset as pandas objects; this rebinds the
# dbunch / df / features names used by the binary example.
dbunch = datasets.fetch_covtype(as_frame=True)
df, features = dbunch.frame, dbunch.feature_names
df.info()

In [None]:
# Bar chart of how many rows carry each raw cover-type label, ordered by label.
(
    df['Cover_Type']
    .value_counts()
    .sort_index()
    .plot.bar()
)
plt.xlabel('cover type')
plt.ylabel('count');

In [None]:
from sklearn.preprocessing import LabelEncoder

# Store integer-encoded class labels in a new column that serves as the target.
target = 'encoded'
enc = LabelEncoder().fit(df['Cover_Type'])
df[target] = enc.transform(df['Cover_Type'])

# Show the distinct encoded labels in sorted order.
print(np.sort(df[target].unique()))

In [None]:
# Size of the hold-out set, given as an absolute number of rows.
n_valid = 20000

train_df, valid_df = train_test_split(
    df,
    test_size=n_valid,
    random_state=42,  # fixed seed so the split is reproducible
)
tuple(part.shape for part in (train_df, valid_df))

### Training with the `train` function

In [None]:
# Booster settings: approximate tree construction, per-class probability output.
params = dict(
    tree_method='approx',
    objective='multi:softprob',
    num_class=df[target].nunique(),  # number of distinct encoded labels
)
num_boost_round = 10

# Wrap both splits in DMatrix objects, the input type expected by xgb.train.
dtrain = xgb.DMatrix(data=train_df[features], label=train_df[target])
dvalid = xgb.DMatrix(data=valid_df[features], label=valid_df[target])
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    # Both sets are evaluated during training and reported under these names.
    evals=[
        (dtrain, 'train'),
        (dvalid, 'valid'),
    ],
    verbose_eval=2,
)

### Training with `XGBClassifier`

In [None]:
# Same settings as the `train` example, minus num_class, which is not passed here.
params = dict(tree_method='approx', objective='multi:softprob')
num_boost_round = 10

clf = xgb.XGBClassifier(n_estimators=num_boost_round, **params)
clf.fit(
    train_df[features],
    train_df[target],
    # Evaluate on the training split first, then on the validation split.
    eval_set=[(part[features], part[target]) for part in (train_df, valid_df)],
    verbose=2,
);

### Evaluating the Model

In [None]:
# Hold-out labels, hard predictions and the full class-probability matrix.
y_true = valid_df[target]
y_pred = clf.predict(valid_df[features])
y_score = clf.predict_proba(valid_df[features])

# Display the three shapes as a quick sanity check.
tuple(arr.shape for arr in (y_true, y_pred, y_score))

In [None]:
# Share of validation rows whose predicted class equals the true class.
metrics.accuracy_score(y_true=y_true, y_pred=y_pred)

In [None]:
# Per-class precision / recall / F1, keyed by the encoded class labels.
print(
    metrics.classification_report(
        y_true,
        y_pred,
    )
)

In [None]:
# One-vs-rest ROC AUC with weighted averaging over the per-class scores.
metrics.roc_auc_score(
    y_true,
    y_score,
    average='weighted',
    multi_class='ovr',
)

### Feature Importance

In [None]:
# Score with negative multi-class log loss computed from predict_proba.
# The built-in 'neg_log_loss' scorer replaces
# make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True):
# `needs_proba` was deprecated in scikit-learn 1.4 and later removed, whereas
# the named scorer exists in old and new releases and has the same sign
# convention (higher is better, i.e. the log loss is negated).
scorer = metrics.get_scorer('neg_log_loss')

# Permute each feature 30 times on the validation rows and record the drop in
# score; random_state fixes the shuffles so the result is reproducible.
permu_imp = permutation_importance(clf, valid_df[features], valid_df[target],
                                   n_repeats=30, random_state=0, scoring=scorer)

In [None]:
# Mean importance per feature, indexed by feature name.
importances_permutation = pd.Series(permu_imp['importances_mean'], index=features)

# Plot the ten largest values; ascending order puts the largest bar at the top.
(
    importances_permutation
    .sort_values(ascending=True)
    .iloc[-10:]
    .plot.barh()
)
plt.title('Permutation Importance on Out-of-Sample Set')
plt.xlabel('change in multivariate log likelihood');
