### XGBoost with ADSTuner for HPO

* Imblearn for undersampling of negative class
* ADSTuner for HPO

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import ads

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# for undersampling the negative class
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

# my utils.py
from utils import train_encoders, apply_encoders

import pickle

%matplotlib inline

In [2]:
# check the ADS version (printed so that the run can be tied to a specific ADS release)
print(ads.__version__)

2.5.4


In [3]:
# global constants
# seed passed as random_state to the undersampler and to the tuner
SEED = 4321

# number of features (with the two indicator cols)
N_FEATURES = 12

# name of col with label
TARGET = 'SeriousDlqin2yrs'

# cols with missing values
COL1_MISSING = 'MonthlyIncome'
COL2_MISSING = 'NumberOfDependents'

# names of the two indicator columns (they are set to 1 where the value was imputed)
IND1 = 'isna_mi'
IND2 = 'isna_nod'

ind_col = [IND1, IND2]

# columns removed from the training DataFrame right after loading
COLS_TO_DROP = ['id']

# for undersampling to make the dataset more balanced
# ratio minority samples/majority
RATIO = 1./5.

In [4]:
# load the full (not undersampled) training data and discard
# the columns that are not needed
data_full = pd.read_csv('cs-training-nonull.csv').drop(columns=COLS_TO_DROP)

In [5]:
# columns grouped as categorical
cat_cols = [
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# columns grouped as numerical
num_cols = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]

# the indicator columns belong to neither group: they are left untouched

In [6]:
# If scaling or label encoding were needed, they would be fitted on data_full, before resampling.
# In this way encoding and scaling would cover the entire range of values, not only the resampled data.

# Here we don't need any scaling (the model is an ensemble of trees)

In [7]:
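# cat cols treatment
# Encode categorical columns (the original note listed only season, weather, year, which are not among the columns in cat_cols - presumably left over from another notebook)

# we don't need any pre-processing for cat columns

# so for XGBoost, after the NaN treatment, no other pre-processing is needed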

In [8]:
# split the full DataFrame into the label vector y and the feature matrix X
y_train_full = data_full[TARGET].to_numpy()
x_train_full = data_full.drop(columns=TARGET).to_numpy()

# sanity check: the feature matrix must have the expected number of columns
assert x_train_full.shape[1] == N_FEATURES

In [9]:
# report how many rows the full (not yet resampled) training set has
print(f'# of samples in full dataset: {x_train_full.shape[0]}')

# of samples in full dataset: 150000


In [10]:
# undersample the negative class with imblearn, using RATIO as the
# requested minority/majority ratio
rus = RandomUnderSampler(sampling_strategy=RATIO, random_state=SEED)
x_train, y_train = rus.fit_resample(x_train_full, y_train_full)

n_samples = x_train.shape[0]
n_positive = np.sum(y_train)

print(f'# of samples in resampled dataset: {n_samples}')

# check ratio of classes
print(f'# of positive samples: {n_positive}')
print(f'# of negative samples: {n_samples - n_positive}')

# of samples in resampled dataset: 60156
# of positive samples: 10026
# of negative samples: 50130


The resampled dataset (x_train, y_train) will be used for training

### Train the XGBoost Classifier

In [11]:
# parameters for the HPO session with Optuna
# number of cross-validation folds (passed as cv to ADSTuner)
FOLDS = 5
# same value as the SEED defined among the global constants
SEED = 4321

# NOTE(review): N_TRIALS is not referenced by the tune() call in this notebook,
# which only passes a TimeBudget exit criterion - confirm whether it is still needed
N_TRIALS = 100
# time budget for the tuning session (presumably seconds - confirm against TimeBudget)
TIME_BUDGET = 7200
STUDY_NAME = "xgb01"

# ranges
# bounds for the learning rate
LR_LOW = 1e-3
LR_HIGH = 1e-2
# bounds for the max tree depth
DEPTH_LOW = 4
DEPTH_HIGH = 8
# candidate values for n_estimators
N_ITER_LIST = [600, 700, 800, 900, 1000, 1100, 1200, 1300]

In [12]:
#
# Search space (the "strategy"): the hyper-parameters we want to explore
# and, for each of them, the distribution to sample from
#
params = dict(
    n_estimators=CategoricalDistribution(N_ITER_LIST),
    learning_rate=LogUniformDistribution(low=LR_LOW, high=LR_HIGH),
    max_depth=IntUniformDistribution(DEPTH_LOW, DEPTH_HIGH),
    tree_method="gpu_hist",
)

clf = xgb.XGBClassifier()

# for the list of available scorers: sorted(sklearn.metrics.SCORERS.keys())
tuner = ADSTuner(
    clf,
    cv=FOLDS,
    strategy=params,
    scoring="roc_auc",
    study_name=STUDY_NAME,
    n_jobs=6,
    random_state=SEED,
)

# start the search; the only exit criterion is the time budget
tuner.tune(x_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

[32m[I 2022-03-15 12:09:50,940][0m A new study created in RDB with name: xgb01[0m


In [74]:
# get the status to see if completed
print(f"The tuner status is: {tuner.get_status()}")

# time left for the session (presumably measured against the TimeBudget exit criterion - confirm)
print(f"Remaining time is: {round(tuner.time_remaining, 1)} sec.")

The tuner status is: State.COMPLETED
Remaining time is: 0 sec.


In [75]:
# keep only the completed trials and rank them, best first;
# the optimized metric is stored in the "value" column
result_df = (
    tuner.trials
    .loc[lambda trials: trials["state"] == "COMPLETE"]
    .sort_values("value", ascending=False)
)

result_df.head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_n_estimators,params_tree_method,user_attrs_mean_fit_time,...,user_attrs_metric,user_attrs_split0_test_score,user_attrs_split1_test_score,user_attrs_split2_test_score,user_attrs_split3_test_score,user_attrs_split4_test_score,user_attrs_std_fit_time,user_attrs_std_score_time,user_attrs_std_test_score,state
404,404,0.865759,2022-03-15 13:34:06.595550,2022-03-15 13:34:55.437010,0 days 00:00:48.841460,0.009964,4,1300,gpu_hist,9.69079,...,roc_auc,0.8649,0.860076,0.866373,0.864824,0.87262,0.098515,0.002096,0.004034,COMPLETE
578,578,0.865752,2022-03-15 13:58:00.248313,2022-03-15 13:58:49.396363,0 days 00:00:49.148050,0.00999,4,1300,gpu_hist,9.755813,...,roc_auc,0.865032,0.859949,0.866293,0.86498,0.872507,0.193885,0.003171,0.004018,COMPLETE
574,574,0.865735,2022-03-15 13:57:20.466841,2022-03-15 13:58:09.326594,0 days 00:00:48.859753,0.009956,4,1300,gpu_hist,9.692873,...,roc_auc,0.864853,0.859996,0.866346,0.864905,0.872575,0.135009,0.002081,0.004039,COMPLETE
403,403,0.865731,2022-03-15 13:34:04.196504,2022-03-15 13:34:53.199014,0 days 00:00:49.002510,0.009916,4,1300,gpu_hist,9.717765,...,roc_auc,0.864999,0.86018,0.866344,0.864703,0.872429,0.163836,0.00689,0.003941,COMPLETE
524,524,0.86573,2022-03-15 13:50:25.102396,2022-03-15 13:51:10.875932,0 days 00:00:45.773536,0.009976,4,1300,gpu_hist,9.083231,...,roc_auc,0.86488,0.859974,0.866202,0.864921,0.872672,0.126517,0.000805,0.004073,COMPLETE
336,336,0.865729,2022-03-15 13:25:27.591121,2022-03-15 13:26:16.281232,0 days 00:00:48.690111,0.00992,4,1300,gpu_hist,9.666801,...,roc_auc,0.864897,0.860182,0.866123,0.864978,0.872466,0.191272,0.002686,0.003939,COMPLETE
475,475,0.865725,2022-03-15 13:43:02.881622,2022-03-15 13:43:51.909344,0 days 00:00:49.027722,0.009647,4,1300,gpu_hist,9.72986,...,roc_auc,0.864855,0.859978,0.866301,0.864899,0.872591,0.059059,0.003169,0.004048,COMPLETE
341,341,0.865724,2022-03-15 13:26:03.975765,2022-03-15 13:26:52.963595,0 days 00:00:48.987830,0.009964,4,1300,gpu_hist,9.722157,...,roc_auc,0.864969,0.860094,0.866172,0.864862,0.872525,0.188338,0.002073,0.003987,COMPLETE
601,601,0.865723,2022-03-15 14:00:46.810750,2022-03-15 14:01:35.835118,0 days 00:00:49.024368,0.009972,4,1300,gpu_hist,9.7267,...,roc_auc,0.865046,0.859878,0.866196,0.864974,0.872521,0.191255,0.000829,0.00404,COMPLETE
347,347,0.865723,2022-03-15 13:26:52.981248,2022-03-15 13:27:42.218921,0 days 00:00:49.237673,0.009767,4,1300,gpu_hist,9.770642,...,roc_auc,0.864811,0.860103,0.866229,0.864785,0.872686,0.175391,0.005705,0.004051,COMPLETE


In [76]:
def show_tuner_results(tuner):
    """Print a short summary of a tuning session.

    Reads everything from the given tuner object: the number of launched and
    completed trials (from its `trials` table), then the index, parameters,
    metric name and score (rounded to 4 decimals) of the best trial.
    Returns nothing; the summary is written to stdout.
    """
    all_trials = tuner.trials

    # completed trials only, ordered by score (best first)
    is_complete = all_trials["state"] == "COMPLETE"
    completed = all_trials[is_complete].sort_values(by=["value"], ascending=False)

    n_launched = all_trials.shape[0]
    n_completed = completed.shape[0]

    print("ADSTuner session results:")
    print(f"ADSTuner has launched {n_launched} trials")
    print(f"ADSTuner has completed {n_completed} trials")
    print()
    print(f"The best trial is the #: {tuner.best_index}")
    print(f"Parameters for the best trial are: {tuner.best_params}")
    print(f"The metric used to optimize is: {tuner.scoring_name}")
    print(f"The best score is: {round(tuner.best_score, 4)}")
    
# print the summary for the tuning session run above
show_tuner_results(tuner)

ADSTuner session results:
ADSTuner has launched 667 trials
ADSTuner has completed 667 trials

The best trial is the #: 404
Parameters for the best trial are: {'learning_rate': 0.009964407355424202, 'max_depth': 4, 'n_estimators': 1300, 'tree_method': 'gpu_hist'}
The metric used to optimize is: roc_auc
The best score is: 0.8658


### Train with best params

In [77]:
%%time

# new classifier configured with the best hyper-parameters found by the tuner
clf = xgb.XGBClassifier(**tuner.best_params)

# train; the only eval_set entry is the training data itself, so the AUC logged
# during fitting is a training AUC (no separate validation set is passed here)
# NOTE(review): recent XGBoost releases expect eval_metric on the estimator rather
# than as a fit() argument - confirm against the installed version
clf.fit(x_train, y_train,
        eval_set=[(x_train, y_train)],
        eval_metric='auc', verbose=100)

print()

# metric history recorded during fit, keyed by eval_set entry and metric name
evals_result = clf.evals_result()

[0]	validation_0-auc:0.83253
[100]	validation_0-auc:0.84957
[200]	validation_0-auc:0.85838
[300]	validation_0-auc:0.86237
[400]	validation_0-auc:0.86619
[500]	validation_0-auc:0.86889
[600]	validation_0-auc:0.87078
[700]	validation_0-auc:0.87206
[800]	validation_0-auc:0.87311
[900]	validation_0-auc:0.87406
[1000]	validation_0-auc:0.87480
[1100]	validation_0-auc:0.87550
[1200]	validation_0-auc:0.87612
[1299]	validation_0-auc:0.87684

CPU times: user 2.6 s, sys: 582 ms, total: 3.18 s
Wall time: 2.98 s


#### Note: the AUC here is slightly higher than the cross-validated score from tuning because we are evaluating on the training data itself

In [None]:
def plot_auc(train_hist):
    """Draw the given AUC history as a line plot and show it.

    train_hist: sequence of AUC values, plotted against their position
    in the sequence.
    """
    _, ax = plt.subplots(figsize=(9, 6))

    ax.plot(train_hist, label='Training AUC')
    ax.set_title('AUC')
    ax.set_xlabel('n_estimator')
    ax.set_ylabel('auc')
    ax.legend(loc='lower right')
    ax.grid(True)

    plt.show()

In [None]:
# 'validation_0' is the first (and only) eval_set entry passed to fit, i.e. the training data
train_hist = evals_result['validation_0']['auc']

plot_auc(train_hist)

In [79]:
# compute accuracy on the full (not undersampled) training set
y_pred = clf.predict(x_train_full)

# Round to integer class labels in one vectorized step instead of a Python-level
# loop over every sample (np.rint rounds halves to even, like the built-in round).
predictions = np.rint(y_pred).astype(int)

accuracy = accuracy_score(y_train_full, predictions)

print("Accuracy on train set: %.2f%%" % (accuracy * 100.0))

Accuracy on train set: 92.75%


In [80]:
# compute confusion matrix on full dataset
# for binary labels ravel() flattens the 2x2 matrix row by row: tn, fp, fn, tp
tn, fp, fn, tp = confusion_matrix(y_train_full, predictions).ravel()

# display the four counts
(tn, fp, fn, tp)

(134568, 5406, 5465, 4561)

### Prediction on the TEST set (for submission to Kaggle)

In [None]:
# predictions on test set
orig_test = pd.read_csv('cs-test.csv')

# Impute missing values and add the two indicator columns (1 where the value was
# missing and has been imputed, 0 otherwise). The indicator columns are named through
# IND1 / IND2, so they always match ind_col, which is used to select them afterwards.
# NOTE(review): MONTHLY_INC_MEDIAN and N_OF_DEP_MODE are not defined in the visible
# part of this notebook - they must be defined before this cell can run.
mi_missing = orig_test[COL1_MISSING].isna()
orig_test[IND1] = mi_missing.astype(int)
orig_test.loc[mi_missing, COL1_MISSING] = MONTHLY_INC_MEDIAN

nod_missing = orig_test[COL2_MISSING].isna()
orig_test[IND2] = nod_missing.astype(int)
orig_test.loc[nod_missing, COL2_MISSING] = N_OF_DEP_MODE

In [None]:
# keep the two indicator columns aside as a numpy array
ind_test = orig_test[ind_col].values

In [None]:
# remove the indicator columns from the DataFrame
# NOTE: this cell reassigns orig_test, so running it a second time fails (the columns are already gone)
orig_test = orig_test.drop(ind_col, axis = 1)

In [None]:
# name of the column holding the row id in the test DataFrame
ID_COL_NAME = 'Unnamed: 0'

# feature matrix: every column except the id and the label column
xorig_test = orig_test.drop(columns=[ID_COL_NAME, TARGET])
x_test = xorig_test.to_numpy()

In [None]:
# No scaling is applied: the classifier is fitted on the raw (unscaled) feature
# values and no scaler object is created anywhere in this notebook, so the former
# scaler.transform(x_test) call could only raise a NameError.
# The name x_test_scaled is kept because the following cells read it.
# Re-append the indicator columns.
# NOTE(review): assumes the indicator columns are the last two feature columns of the training data - confirm
x_test_scaled = np.c_[x_test, ind_test]

assert x_test_scaled.shape[1] == N_FEATURES

In [None]:
# score the test set in its original row order (no shuffle !)
# predict_proba returns the probabilities; column 1 is kept as the prediction
y_pred = clf.predict_proba(x_test_scaled)[:, 1]

In [None]:
# prepare the csv for the submission
FILE_SUB = 'submission25.csv'

# one row per test sample: its id and the predicted probability
result_dict = {
    "Id": orig_test[ID_COL_NAME].values,
    'Probability': y_pred,
}

# build a dataframe and save to csv
result_df = pd.DataFrame(result_dict)
result_df.to_csv(FILE_SUB, index=False, float_format='%.5f')

In [None]:
### Save model and scaler

In [None]:
# save model, using a simple format: pickle
# the with-block guarantees the file is flushed and closed once the dump is done
with open("credit-scoring.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# also save the scaler
# NOTE(review): no `scaler` object is created in the visible part of this notebook -
# confirm it exists before running this cell
# the with-block guarantees the file is flushed and closed once the dump is done
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

### Online predictions

In [None]:
# reload the model
# (pickle files must come from a trusted source: unpickling can execute arbitrary code)
# the with-block closes the file as soon as the object has been loaded
with open("credit-scoring.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# reload the scaler
# (pickle files must come from a trusted source: unpickling can execute arbitrary code)
# the with-block closes the file as soon as the object has been loaded
with open("scaler.pkl", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [None]:
# prepare for online predictions
# input are given as a numpy array, with no missing fields, but we need to add the two indicator columns
# sample batch: three identical rows holding the values 1..10
x_input = np.tile(np.arange(1, 11), (3, 1))

In [None]:
# sanity checks on the input batch
# ten columns are expected (the two indicator columns are not included yet)
assert x_input.shape[1] == 10
# check there are no null
assert not np.isnan(x_input).any()

In [None]:
# normalize
# NOTE(review): the classifier trained in this notebook is fitted on unscaled features -
# confirm that scaling the input here is really intended
x_input_scaled = loaded_scaler.transform(x_input)

# add two columns with 0
# (presumably the indicator columns: 0 because the input has no missing values)
x_add = np.zeros((x_input.shape[0], 2))
x_input_scaled = np.c_[x_input_scaled, x_add]

In [None]:
# run the reloaded model on the prepared input rows
y_pred = loaded_model.predict(x_input_scaled)

In [None]:
# display the predictions
y_pred

In [None]:
# distribution of the target in the full training data
# (data_full is the training DataFrame loaded at the top; no train_df is defined in this notebook)
data_full[TARGET].hist();