In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import ads

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# for undersampling the negative class
from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt

import pickle

%matplotlib inline

In [2]:
# Print the installed version of the `ads` package so the run can be reproduced
print(ads.__version__)

2.5.8


In [3]:
# global constants
N_ESTIMATORS = 1000
# number of features, including the two added indicator columns
N_FEATURES = 12

# name of the column holding the labels
TARGET = 'SeriousDlqin2yrs'

# columns with missing values
COL1_MISSING = 'MonthlyIncome'
COL2_MISSING = 'NumberOfDependents'

# names of the two indicator columns (set to 1 where the value was imputed)
IND1 = 'isna_mi'
IND2 = 'isna_nod'

ind_col = [IND1, IND2]

# taken from statistics on the training data; used to impute the missing
# values of COL1_MISSING and COL2_MISSING
MONTHLY_INC_MEDIAN = 5400.0
N_OF_DEP_MODE = 0

# ratio of minority samples to majority samples (used for undersampling)
RATIO = 1./5.

In [4]:
# Load the full (not undersampled) training data and discard the
# unneeded 'id' column in a single step.
data_full = pd.read_csv('cs-training-nonull.csv').drop(columns='id')

In [5]:
# Split the frame into indicator columns, labels and features.
# data_full is deliberately NOT reassigned: the previous version dropped the
# indicator columns in place, so re-running the cell raised a KeyError.

# set the two indicator columns aside (they must not be scaled)
ind_train = data_full[ind_col].values

# y: labels
y_train_full = data_full[TARGET].values

# X: feature matrix = every column except the indicators and the label
x_train_full = data_full.drop(columns=ind_col + [TARGET]).values

In [6]:
# Fit the scaler on the training features only (fit() returns the scaler itself)
scaler = StandardScaler().fit(x_train_full)

# Scale the features, then append the two indicator columns unscaled
x_train_full_scaled = np.column_stack(
    (scaler.transform(x_train_full), ind_train)
)

# sanity check on the final number of columns
assert x_train_full_scaled.shape[1] == N_FEATURES

In [7]:
# Row count before undersampling, for comparison with the resampled set
print(f'# of samples in full dataset: {x_train_full_scaled.shape[0]}')

# of samples in full dataset: 150000


In [8]:
# Configure random undersampling of the negative (majority) class.
# RATIO is the minority/majority ratio; random_state is fixed so the
# sampling is repeatable.
rus = RandomUnderSampler(sampling_strategy=RATIO, random_state=4321)

In [9]:
# Apply the undersampling to the scaled features and their labels
x_train, y_train = rus.fit_resample(x_train_full_scaled, y_train_full)

In [10]:
# Row count after undersampling
print(f'# of samples in resampled dataset: {x_train.shape[0]}')

# of samples in resampled dataset: 60156


In [11]:
# Class balance after undersampling.
# Assumes labels are 0/1, so their sum is the number of positives.
n_positive = np.sum(y_train)
n_negative = x_train.shape[0] - n_positive

print(f'# of positive samples: {n_positive}')
print(f'# of negative samples: {n_negative}')

# of positive samples: 10026
# of negative samples: 50130


### Train the XGBoost Classifier

In [12]:
# parameters for the HPO session with Optuna
FOLDS = 5           # number of cross-validation folds passed to the tuner
SEED = 4321

# NOTE(review): N_TRIALS is not referenced by the tuning cell shown in this
# notebook; the search is stopped by TIME_BUDGET instead.
N_TRIALS = 100
TIME_BUDGET = 7200  # passed to the TimeBudget exit criterion
STUDY_NAME = "xgb01"

# ranges of the search space
LR_LOW = 1e-3
LR_HIGH = 1e-2
DEPTH_LOW = 4
DEPTH_HIGH = 8
# candidate values for n_estimators
N_ITER_LIST = [600, 700, 800, 900, 1000, 1100, 1200, 1300]

In [13]:
#
# Define the strategy: the hyper-parameter space we want to explore
#
params = {
    "n_estimators": CategoricalDistribution(N_ITER_LIST),
    "learning_rate": LogUniformDistribution(low=LR_LOW, high=LR_HIGH),
    "max_depth": IntUniformDistribution(DEPTH_LOW, DEPTH_HIGH),
}

clf = xgb.XGBClassifier()

# For the list of available scorers: sorted(sklearn.metrics.SCORERS.keys())
# random_state=SEED seeds the search so that sessions are more repeatable
# (SEED was previously defined but never passed to the tuner).
tuner = ADSTuner(
    clf,
    cv=FOLDS,
    strategy=params,
    scoring="roc_auc",
    study_name=STUDY_NAME,
    random_state=SEED,
    n_jobs=6,
)

# The search stops when the time budget is exhausted.
# NOTE: tune() returns while the study is still running (the status cell
# below reports State.RUNNING), so check the status before using the results.
tuner.tune(x_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

[32m[I 2022-03-14 18:42:26,390][0m A new study created in RDB with name: xgb01[0m


In [21]:
# Report the tuner state, to see whether the session has completed
print(f"The tuner status is: {tuner.get_status()}")

# Time left before the time budget expires, rounded to 0.1 s
print(f"Remaining time is: {round(tuner.time_remaining, 1)} sec.")

The tuner status is: State.RUNNING
Remaining time is: 6684.0 sec.


In [39]:
# Keep only the completed trials and rank them best-first.
# The optimized metric is stored in the "value" column.
is_complete = tuner.trials["state"] == "COMPLETE"
result_df = tuner.trials[is_complete].sort_values(by=["value"], ascending=False)

# show the ten best trials
result_df.head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_n_estimators,user_attrs_mean_fit_time,user_attrs_mean_score_time,...,user_attrs_metric,user_attrs_split0_test_score,user_attrs_split1_test_score,user_attrs_split2_test_score,user_attrs_split3_test_score,user_attrs_split4_test_score,user_attrs_std_fit_time,user_attrs_std_score_time,user_attrs_std_test_score,state
50,50,0.865477,2022-03-14 19:15:34.166132,2022-03-14 19:20:15.688278,0 days 00:04:41.522146,0.008492,4,1200,56.213223,0.040291,...,roc_auc,0.864384,0.860433,0.866017,0.864576,0.871974,14.187219,0.020774,0.00374,COMPLETE
49,49,0.865447,2022-03-14 19:15:08.777743,2022-03-14 19:20:01.041820,0 days 00:04:52.264077,0.008394,4,1200,58.361284,0.040683,...,roc_auc,0.864346,0.86049,0.865864,0.864479,0.872055,14.993363,0.014331,0.003757,COMPLETE
35,35,0.86542,2022-03-14 19:05:21.732425,2022-03-14 19:09:05.619339,0 days 00:03:43.886914,0.008013,4,1200,44.700664,0.032901,...,roc_auc,0.864327,0.860497,0.865925,0.864451,0.8719,0.408484,0.002534,0.003704,COMPLETE
34,34,0.86538,2022-03-14 19:04:46.994972,2022-03-14 19:08:31.409141,0 days 00:03:44.414169,0.007625,4,1200,44.808593,0.030716,...,roc_auc,0.864163,0.860499,0.865923,0.864508,0.871805,0.415131,0.002147,0.003678,COMPLETE
51,51,0.865374,2022-03-14 19:17:50.359384,2022-03-14 19:21:16.233045,0 days 00:03:25.873661,0.008329,4,1100,41.097793,0.027783,...,roc_auc,0.864168,0.86048,0.865927,0.864373,0.871922,0.802906,0.001606,0.00373,COMPLETE
36,36,0.86537,2022-03-14 19:05:47.642608,2022-03-14 19:09:33.205618,0 days 00:03:45.563010,0.007832,4,1200,45.037852,0.031325,...,roc_auc,0.864243,0.860521,0.8658,0.864424,0.871862,0.27547,0.001943,0.003688,COMPLETE
28,28,0.865361,2022-03-14 19:00:47.969233,2022-03-14 19:05:47.842584,0 days 00:04:59.873351,0.008288,5,1300,59.895556,0.036156,...,roc_auc,0.864396,0.860141,0.865449,0.864629,0.872188,0.473733,0.001226,0.003882,COMPLETE
25,25,0.865352,2022-03-14 18:57:11.598903,2022-03-14 19:01:49.973163,0 days 00:04:38.374260,0.009434,5,1200,55.595457,0.036831,...,roc_auc,0.864473,0.860012,0.865259,0.864683,0.872332,0.131069,0.003997,0.003961,COMPLETE
27,27,0.865342,2022-03-14 18:57:44.381019,2022-03-14 19:01:15.252882,0 days 00:03:30.871863,0.009654,5,900,42.097937,0.03253,...,roc_auc,0.864211,0.860217,0.865536,0.864603,0.872145,0.155584,0.001305,0.003858,COMPLETE
29,29,0.865333,2022-03-14 19:00:49.808633,2022-03-14 19:05:47.625723,0 days 00:04:57.817090,0.009802,5,1300,59.483752,0.035667,...,roc_auc,0.864674,0.859673,0.865336,0.864686,0.872294,0.350686,0.001221,0.004034,COMPLETE


In [38]:
def show_tuner_results(tuner):
    """Print a short summary of an ADSTuner session.

    Reports how many trials were launched and how many completed, then the
    best trial index, its parameters, the optimized metric and the best
    score rounded to 4 decimals.

    tuner: object exposing `trials` (a DataFrame with a "state" column),
        `best_index`, `best_params`, `scoring_name` and `best_score`.
    """
    # Read the trials table once so that both counts come from the same
    # snapshot, even if the tuner is still running.
    trials = tuner.trials

    # Counting needs no sorting: just count the rows marked COMPLETE.
    n_completed = int((trials["state"] == "COMPLETE").sum())

    print("ADSTuner session results:")
    print(f"ADSTuner has launched {trials.shape[0]} trials")
    print(f"ADSTuner has completed {n_completed} trials")
    print()
    print(f"The best trial is the #: {tuner.best_index}")
    print(f"Parameters for the best trial are: {tuner.best_params}")
    print(f"The metric used to optimize is: {tuner.scoring_name}")
    print(f"The best score is: {round(tuner.best_score, 4)}")
    
# Summarize the current state of the tuning session
show_tuner_results(tuner)

ADSTuner session results:
ADSTuner has launched 58 trials
ADSTuner has completed 52 trials

The best trial is the #: 50
Parameters for the best trial are: {'learning_rate': 0.008491587447830361, 'max_depth': 4, 'n_estimators': 1200}
The metric used to optimize is: roc_auc
The best score is: 0.8655


### Train with best params

In [32]:
%%time

# eval_metric is set on the estimator instead of being passed to fit():
# the fit() keyword is deprecated in recent XGBoost releases and removed
# in later ones, while the estimator parameter works in both.
clf = xgb.XGBClassifier(eval_metric='auc', **tuner.best_params)

# Train and track the AUC on the training data itself: eval_set contains
# only (x_train, y_train), there is no separate validation split here.
clf.fit(x_train, y_train,
        eval_set=[(x_train, y_train)],
        verbose=100)

print()

# per-round metric history, keyed by eval set name ('validation_0')
evals_result = clf.evals_result()

[0]	validation_0-auc:0.83249
[100]	validation_0-auc:0.84925
[200]	validation_0-auc:0.85583
[300]	validation_0-auc:0.86053
[400]	validation_0-auc:0.86350
[500]	validation_0-auc:0.86660
[600]	validation_0-auc:0.86891
[700]	validation_0-auc:0.87065
[800]	validation_0-auc:0.87195
[900]	validation_0-auc:0.87289
[1000]	validation_0-auc:0.87373
[1100]	validation_0-auc:0.87443
[1199]	validation_0-auc:0.87509

CPU times: user 2h 12min 5s, sys: 1min 30s, total: 2h 13min 36s
Wall time: 5min


#### Note: the slightly higher AUC here is expected, because the model is being evaluated on the same data it was trained on

In [None]:
def plot_auc(train_hist):
    """Plot the AUC history as a single line labelled 'Training AUC'.

    train_hist: sequence of AUC values; the x axis is the position in the
        sequence (labelled 'n_estimator').
    """
    fig, ax = plt.subplots(figsize=(9, 6))

    ax.plot(train_hist, label='Training AUC')
    ax.set_title('AUC')
    ax.set_xlabel('n_estimator')
    ax.set_ylabel('auc')
    ax.legend(loc='lower right')
    ax.grid(True)

    plt.show()

In [None]:
# AUC history recorded for the single eval_set entry passed to fit()
train_hist = evals_result['validation_0']['auc']

plot_auc(train_hist)

In [33]:
# Compute accuracy on the full (not undersampled) training data.
# predict() already returns class labels, so the former per-element
# round() loop was redundant and has been removed.
y_pred = clf.predict(x_train_full_scaled)
predictions = y_pred

accuracy = accuracy_score(y_train_full, predictions)

print("Accuracy on train set: %.2f%%" % (accuracy * 100.0))

Accuracy on train set: 92.75%


In [34]:
# Compute the confusion matrix on the full dataset.
# ravel() flattens the matrix so it can be unpacked as (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_train_full, predictions).ravel()

# display the four counts
(tn, fp, fn, tp)

(134608, 5366, 5503, 4523)

### Prediction on the TEST set (for submission to Kaggle)

In [None]:
# Load the test set used for the Kaggle submission
orig_test = pd.read_csv('cs-test.csv')

# Impute missing values and add the two indicator columns.
# The indicator names come from IND1/IND2 (not repeated string literals) so
# they always match ind_col, and each missing-value mask is computed once,
# before the column is filled.
mi_missing = orig_test[COL1_MISSING].isna()
orig_test[IND1] = mi_missing.astype(int)
orig_test[COL1_MISSING] = orig_test[COL1_MISSING].fillna(MONTHLY_INC_MEDIAN)

nod_missing = orig_test[COL2_MISSING].isna()
orig_test[IND2] = nod_missing.astype(int)
orig_test[COL2_MISSING] = orig_test[COL2_MISSING].fillna(N_OF_DEP_MODE)

In [None]:
# Set the indicator columns aside; they are appended again after scaling
ind_test = orig_test[ind_col].values

In [None]:
# Remove the indicator columns from the frame before building the features.
# NOTE: this reassigns orig_test, so re-running this cell on its own raises
# a KeyError (the columns are already gone).
orig_test = orig_test.drop(ind_col, axis = 1)

In [None]:
# Name of the id column in the test file
ID_COL_NAME = 'Unnamed: 0'

# Features only: drop the id and the label column in one call
xorig_test = orig_test.drop(columns=[ID_COL_NAME, TARGET])

x_test = xorig_test.values

In [None]:
# Scale the test features with the scaler fitted on the training data,
# then append the indicator columns unscaled.
x_test_scaled = np.column_stack((scaler.transform(x_test), ind_test))

# same number of columns as the training matrix
assert x_test_scaled.shape[1] == N_FEATURES

In [None]:
# Predict on the test set (rows kept in file order, no shuffling) and keep
# only column 1 of the probability matrix.
y_pred = clf.predict_proba(x_test_scaled)[:, 1]

In [None]:
# Prepare the CSV for the submission
FILE_SUB = 'submission25.csv'

# one row per test sample: its id and the predicted probability
result_dict = dict(Id=orig_test[ID_COL_NAME].values, Probability=y_pred)

# build a dataframe and save it to csv (no index, 5 decimals)
result_df = pd.DataFrame(result_dict)
result_df.to_csv(FILE_SUB, index=False, float_format='%.5f')

In [None]:
### Save model and scaler

In [None]:
# Save the model in a simple format: pickle.
# The with-block closes the file handle once the dump has finished.
with open("credit-scoring.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Save the scaler too, so the same scaling can be applied at prediction time.
# The with-block closes the file handle once the dump has finished.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

### Online predictions

In [None]:
# Reload the model; the with-block closes the file handle after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("credit-scoring.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Reload the scaler; the with-block closes the file handle after loading.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("scaler.pkl", "rb") as scaler_file:
    loaded_scaler = pickle.load(scaler_file)

In [None]:
# prepare for online predictions
# input are given as a numpy array, with no missing fields, but we need to add the two indicator columns
x_input = np.array([[1,2,3,4,5,6,7,8,9,10],
                   [1,2,3,4,5,6,7,8,9,10],
                   [1,2,3,4,5,6,7,8,9,10]])

In [None]:
# Input checks
# exactly 10 columns are expected (the indicator columns are added later)
n_input_cols = x_input.shape[1]
assert n_input_cols == 10

# no NaN values are allowed in the input
n_missing = np.isnan(x_input).sum()
assert n_missing == 0

In [None]:
# Two zero-filled indicator columns, one row per input sample
x_add = np.zeros((x_input.shape[0], 2))

# Normalize with the reloaded scaler, then append the indicator columns
x_input_scaled = np.column_stack((loaded_scaler.transform(x_input), x_add))

In [None]:
# Predict with the reloaded model (predict(), not predict_proba())
y_pred = loaded_model.predict(x_input_scaled)

In [None]:
# display the online predictions
y_pred

In [None]:
# Distribution of the label in the full training data.
# The previous version used `train_df`, which is never defined in this
# notebook and raised a NameError on a fresh run; data_full is the loaded
# training frame and still contains the TARGET column.
data_full[TARGET].hist();