### XGBoost with ADSTuner for HPO

* Imblearn for undersampling of negative class
* ADSTuner for HPO

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import ads

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# for undersampling the negative class
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

# my utils.py
from utils import train_encoders, apply_encoders

import pickle

%matplotlib inline

In [2]:
# Print the version of the installed `ads` package, so the environment
# used for this run is recorded next to the results.
print(ads.__version__)

2.5.8


In [3]:
# ---- global constants ----
# seed passed as random_state to the resampler and to the tuner
SEED = 4321

# expected width of the feature matrix (the two indicator cols included)
N_FEATURES = 12

# name of the label column
TARGET = "SeriousDlqin2yrs"

# the two columns that have missing values
COL1_MISSING = "MonthlyIncome"
COL2_MISSING = "NumberOfDependents"

# names of the two indicator columns (they are 1 where the value was imputed)
IND1 = "isna_mi"
IND2 = "isna_nod"

ind_col = [IND1, IND2]

COLS_TO_DROP = ["id"]

# undersampling, to make the dataset more balanced:
# ratio of minority samples to majority samples
RATIO = 1 / 5

In [4]:
# Load the full (not undersampled) dataset and discard, in the same step,
# the columns that are not needed.
data_full = pd.read_csv('cs-training-nonull.csv').drop(columns=COLS_TO_DROP)

In [5]:
# columns handled as categorical
cat_cols = [
    'age',
    'NumberOfTime30-59DaysPastDueNotWorse',
    'NumberOfOpenCreditLinesAndLoans',
    'NumberOfTimes90DaysLate',
    'NumberRealEstateLoansOrLines',
    'NumberOfTime60-89DaysPastDueNotWorse',
    'NumberOfDependents',
]

# columns handled as numerical
num_cols = [
    'RevolvingUtilizationOfUnsecuredLines',
    'DebtRatio',
    'MonthlyIncome',
]

# the indicator columns belong to neither group: they are not touched

In [6]:
# Scaling and label encoding, when needed, are done on data_full; resampling comes afterwards.
# In this way coding and scaling cover the entire range of values, not only the resampled data.

# Here we don't need any scaling (the model is an ensemble of trees)

In [7]:
# cat cols treatment
# Code categorical columns (not needed here, see below)

# we don't need any pre-processing for cat columns

# so for XGBoost, after the NaN treatment, no other pre-processing is needed

In [8]:
# extract X (the feature matrix) and y (the labels)
x_train_full = data_full.drop(columns=[TARGET]).values
y_train_full = data_full[TARGET].values

# the feature matrix must have exactly the expected number of columns
assert x_train_full.shape[1] == N_FEATURES

In [10]:
# report the number of rows in the full (not yet undersampled) feature matrix
print(f'# of samples in full dataset: {x_train_full.shape[0]}')

# of samples in full dataset: 150000


In [12]:
# Undersample the negative class with imblearn; RATIO is the requested
# minority/majority ratio and SEED makes the sampling repeatable.
rus = RandomUnderSampler(sampling_strategy=RATIO, random_state=SEED)
x_train, y_train = rus.fit_resample(x_train_full, y_train_full)

n_resampled = x_train.shape[0]
# labels are 0/1, so their sum is the number of positive samples
n_positive = np.sum(y_train)

print(f'# of samples in resampled dataset: {n_resampled}')

# check ratio of classes
print(f'# of positive samples: {n_positive}')
print(f'# of negative samples: {n_resampled - n_positive}')

# of samples in resampled dataset: 60156
# of positive samples: 10026
# of negative samples: 50130


The resampled dataset (x_train, y_train) will be used for training

### Train the XGBoost Classifier

In [14]:
# ---- settings for the HPO session ----
# cross-validation folds and random seed
FOLDS = 5
SEED = 4321

# budget and name of the study
N_TRIALS = 100
TIME_BUDGET = 7200
STUDY_NAME = "xgb01"

# bounds of the search space
LR_LOW, LR_HIGH = 1e-3, 1e-2
DEPTH_LOW, DEPTH_HIGH = 4, 8
# candidate numbers of estimators: 600, 700, ..., 1300
N_ITER_LIST = list(range(600, 1301, 100))

In [15]:
#
# Search strategy: the hyper-parameters we want to explore and, for each
# of them, the distribution the values are drawn from
#
params = dict(
    n_estimators=CategoricalDistribution(N_ITER_LIST),
    learning_rate=LogUniformDistribution(low=LR_LOW, high=LR_HIGH),
    max_depth=IntUniformDistribution(DEPTH_LOW, DEPTH_HIGH),
)

clf = XGBClassifier()

# for the list of available scorers: sorted(sklearn.metrics.SCORERS.keys())
tuner = ADSTuner(
    clf,
    cv=FOLDS,
    strategy=params,
    scoring="roc_auc",
    study_name=STUDY_NAME,
    n_jobs=6,
    random_state=SEED,
)

# the only exit criterion is the time budget
tuner.tune(x_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

[32m[I 2022-03-15 10:44:55,611][0m A new study created in RDB with name: xgb01[0m


In [16]:
# get the status to see if the tuning session has completed
print(f"The tuner status is: {tuner.get_status()}")

# time left before the time budget is exhausted, rounded to one decimal
print(f"Remaining time is: {round(tuner.time_remaining, 1)} sec.")

The tuner status is: State.RUNNING
Remaining time is: 7171.0 sec.


In [31]:
# Keep only the completed trials and put the best one on top.
# The value of the chosen metric is in the "value" column.
completed_mask = tuner.trials["state"] == "COMPLETE"
result_df = tuner.trials.loc[completed_mask].sort_values(by="value", ascending=False)

result_df.head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_n_estimators,user_attrs_mean_fit_time,user_attrs_mean_score_time,...,user_attrs_metric,user_attrs_split0_test_score,user_attrs_split1_test_score,user_attrs_split2_test_score,user_attrs_split3_test_score,user_attrs_split4_test_score,user_attrs_std_fit_time,user_attrs_std_score_time,user_attrs_std_test_score,state
43,43,0.865601,2022-03-15 11:12:44.580822,2022-03-15 11:16:46.669010,0 days 00:04:02.088188,0.009475,4,1300,48.338119,0.034586,...,roc_auc,0.864633,0.860387,0.865874,0.864797,0.872316,0.864,0.003266,0.003845,COMPLETE
42,42,0.865601,2022-03-15 11:12:39.520932,2022-03-15 11:16:40.831658,0 days 00:04:01.310726,0.009506,4,1300,48.185437,0.032267,...,roc_auc,0.864676,0.860434,0.865796,0.864819,0.872279,0.786084,0.001985,0.003816,COMPLETE
78,78,0.865589,2022-03-15 11:36:10.410147,2022-03-15 11:40:00.604142,0 days 00:03:50.193995,0.009314,4,1300,45.963471,0.031837,...,roc_auc,0.864575,0.860453,0.865857,0.864771,0.872287,0.410735,0.002261,0.003821,COMPLETE
48,48,0.865524,2022-03-15 11:16:46.689930,2022-03-15 11:20:58.118984,0 days 00:04:11.429054,0.008123,4,1300,50.209382,0.032501,...,roc_auc,0.864349,0.860473,0.865899,0.864719,0.87218,0.664897,0.002582,0.003794,COMPLETE
76,76,0.865502,2022-03-15 11:35:43.825740,2022-03-15 11:39:35.125163,0 days 00:03:51.299423,0.009237,4,1300,46.180514,0.033492,...,roc_auc,0.864466,0.860408,0.865857,0.864519,0.87226,0.147837,0.003163,0.003841,COMPLETE
45,45,0.865497,2022-03-15 11:15:14.817528,2022-03-15 11:19:24.053789,0 days 00:04:09.236261,0.008142,4,1300,49.768803,0.032747,...,roc_auc,0.864445,0.86046,0.86594,0.864491,0.87215,0.466231,0.003442,0.003793,COMPLETE
50,50,0.865486,2022-03-15 11:18:47.910652,2022-03-15 11:23:01.668267,0 days 00:04:13.757615,0.00808,4,1300,50.675535,0.031981,...,roc_auc,0.864399,0.860471,0.8659,0.864601,0.872061,0.365501,0.001805,0.003755,COMPLETE
47,47,0.865485,2022-03-15 11:16:40.849067,2022-03-15 11:20:51.730777,0 days 00:04:10.881710,0.00834,4,1300,50.100374,0.033129,...,roc_auc,0.864542,0.860462,0.865902,0.864498,0.872021,0.659717,0.002016,0.003741,COMPLETE
49,49,0.865477,2022-03-15 11:18:28.099829,2022-03-15 11:22:41.582734,0 days 00:04:13.482905,0.008278,4,1300,50.623851,0.030466,...,roc_auc,0.864397,0.860458,0.865879,0.864638,0.872015,0.470939,0.001787,0.003741,COMPLETE
60,60,0.865474,2022-03-15 11:25:13.707733,2022-03-15 11:29:27.283489,0 days 00:04:13.575756,0.009057,4,1300,50.638212,0.033464,...,roc_auc,0.864513,0.860347,0.865848,0.864599,0.872065,0.094071,0.002859,0.003783,COMPLETE


In [28]:
def show_tuner_results(tuner):
    """Print a short, human-readable summary of a tuning session.

    Parameters
    ----------
    tuner
        Object exposing ``trials`` (a DataFrame with ``state`` and ``value``
        columns), ``best_index``, ``best_params``, ``scoring_name`` and
        ``best_score``.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # select the trials that ran to completion, best value first
    is_complete = tuner.trials["state"] == "COMPLETE"
    completed = tuner.trials[is_complete].sort_values(by=["value"], ascending=False)

    n_launched = tuner.trials.shape[0]
    n_completed = completed.shape[0]

    print("ADSTuner session results:")
    print(f"ADSTuner has launched {n_launched} trials")
    print(f"ADSTuner has completed {n_completed} trials")
    print()
    print(f"The best trial is the #: {tuner.best_index}")
    print(f"Parameters for the best trial are: {tuner.best_params}")
    print(f"The metric used to optimize is: {tuner.scoring_name}")
    print(f"The best score is: {round(tuner.best_score, 4)}")
    
# print the summary for the tuning session in its current state
show_tuner_results(tuner)

ADSTuner session results:
ADSTuner has launched 17 trials
ADSTuner has completed 11 trials

The best trial is the #: 3
Parameters for the best trial are: {'learning_rate': 0.009782305710376605, 'max_depth': 6, 'n_estimators': 1200}
The metric used to optimize is: roc_auc
The best score is: 0.8646


### Train with best params

In [None]:
%%time

# build the final classifier from the best hyper-parameters found by the tuner
clf = xgb.XGBClassifier(**tuner.best_params)

# train the model; the only entry in eval_set is the training data itself,
# so the AUC logged during fit is a training AUC (there is no validation set here)
clf.fit(x_train, y_train,
        eval_set=[(x_train, y_train)],
        eval_metric='auc', verbose=100)

print()

# metric history recorded during fit, plotted in the cells below
evals_result = clf.evals_result()

#### Note: the AUC is slightly higher here because we are evaluating on the training data itself

In [None]:
def plot_auc(train_hist):
    """Draw the AUC history as a single line chart.

    Parameters
    ----------
    train_hist
        Sequence of AUC values; it is plotted against its own index.
    """
    fig, ax = plt.subplots(figsize=(9, 6))

    ax.plot(train_hist, label='Training AUC')
    ax.set_title('AUC')
    ax.set_xlabel('n_estimator')
    ax.set_ylabel('auc')
    ax.legend(loc='lower right')
    ax.grid(True)
    plt.show();

In [None]:
# AUC history for the first eval_set entry passed to fit
# (presumably keyed 'validation_0' by XGBoost; in this notebook that entry is the training data)
train_hist = evals_result['validation_0']['auc']

plot_auc(train_hist)

In [None]:
# compute accuracy on full dataset
# The features are passed unscaled, exactly as the model was trained
# (no scaled copy of x_train_full exists in this notebook).
y_pred = clf.predict(x_train_full)

# make sure the predicted labels are plain integers
predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_train_full, predictions)

print("Accuracy on train set: %.2f%%" % (accuracy * 100.0))

In [None]:
# compute confusion matrix on full dataset
# ravel() flattens the matrix so the four counts can be unpacked
tn, fp, fn, tp = confusion_matrix(y_train_full, predictions).ravel()

# display the four counts
(tn, fp, fn, tp)

### Prediction on the TEST set (for submission to Kaggle)

In [None]:
# predictions on test set
orig_test = pd.read_csv('cs-test.csv')

# Values used to fill the missing entries. They are derived from the training
# data because no other definition exists in this notebook.
# NOTE(review): this assumes 'cs-training-nonull.csv' was imputed with its own
# median / mode (which leaves both statistics unchanged) -- confirm against the
# notebook that produced that file.
MONTHLY_INC_MEDIAN = data_full[COL1_MISSING].median()
N_OF_DEP_MODE = data_full[COL2_MISSING].mode().iloc[0]

# impute missing values, add the two indicator columns
# (each indicator is 1 where the original value was missing, 0 elsewhere)
mi_missing = orig_test[COL1_MISSING].isna()
orig_test[IND1] = mi_missing.astype(int)
orig_test.loc[mi_missing, COL1_MISSING] = MONTHLY_INC_MEDIAN

nod_missing = orig_test[COL2_MISSING].isna()
orig_test[IND2] = nod_missing.astype(int)
orig_test.loc[nod_missing, COL2_MISSING] = N_OF_DEP_MODE

In [None]:
# keep the values of the two indicator columns aside as an array
ind_test = orig_test[ind_col].values

In [None]:
# remove the indicator columns from the frame
# NOTE(review): orig_test is overwritten, so re-running this cell (or the
# previous one) presumably fails because the columns are already gone
orig_test = orig_test.drop(ind_col, axis = 1)

In [None]:
# name of the id column in the test file
ID_COL_NAME = 'Unnamed: 0'

# the features are what remains after removing the id and the label columns
xorig_test = orig_test.drop(columns=[ID_COL_NAME, TARGET])

x_test = xorig_test.values

In [None]:
# No scaling is applied: the classifier was trained on unscaled features and
# no scaler object is created anywhere in this notebook. The `_scaled` suffix
# is kept only because the following cells read this name.
# Put the indicator columns back, to the right of the other features.
# NOTE(review): confirm the resulting column order matches the training file.
x_test_scaled = np.c_[x_test, ind_test]

assert x_test_scaled.shape[1] == N_FEATURES

In [None]:
# do predictions on test set (no shuffle !)
# predict_proba returns one column per class: keep only column 1
y_pred = clf.predict_proba(x_test_scaled)[:, 1]

In [None]:
# prepare the csv for the submission
FILE_SUB = 'submission25.csv'

result_dict = dict(Id=orig_test[ID_COL_NAME].values, Probability=y_pred)

# build a dataframe and save it to csv (no index, five decimals)
result_df = pd.DataFrame(result_dict)
result_df.to_csv(FILE_SUB, index=False, float_format='%.5f')

In [None]:
### Save model and scaler

In [None]:
# save model: a simple format is used, pkl
# the context manager closes the file even if pickling fails
with open("credit-scoring.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# also save the scaler
# NOTE(review): `scaler` is not defined in any cell of this notebook, so on a
# fresh kernel this line raises NameError; presumably a leftover from a
# variant of the notebook that scaled the features -- confirm and remove
pickle.dump(scaler, open("scaler.pkl", "wb"))

### Online predictions

In [None]:
# reload the model
# (only unpickle files you trust: pickle can execute arbitrary code on load)
with open("credit-scoring.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# reload the scaler
# (only unpickle files you trust: pickle can execute arbitrary code on load)
# NOTE(review): the model in this notebook is trained on unscaled features;
# confirm that a scaler is still needed at all
with open("scaler.pkl", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [None]:
# prepare for online predictions
# inputs are given as a numpy array with no missing fields; the two indicator
# columns are not part of it and have to be added afterwards
# (three identical example rows, each holding the values 1..10)
x_input = np.tile(np.arange(1, 11), (3, 1))

In [None]:
# sanity checks
# every row must carry exactly ten values
assert x_input.shape[1] == 10
# check there are no null
assert not np.isnan(x_input).any()

In [None]:
# No normalization: the model saved by this notebook was fitted on unscaled
# features, so the raw inputs are passed through as they are. The `_scaled`
# suffix is kept only because the next cell reads this name.

# add two columns with 0 (the indicator columns: no value is imputed here)
x_add = np.zeros((x_input.shape[0], 2))
x_input_scaled = np.c_[x_input, x_add]

In [None]:
# predict with the reloaded model on the prepared inputs
y_pred = loaded_model.predict(x_input_scaled)

In [None]:
# display the predictions
y_pred

In [None]:
# distribution of the label in the full training data
# (data_full is the frame loaded in this notebook)
data_full[TARGET].hist();