In [8]:
# load libraries
# standard library
import os
import pickle
import time

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from imbens.ensemble import SelfPacedEnsembleClassifier as SPE
from pandas import DataFrame as df
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier

# local module with helper utils
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model

# index of the pre-processed k-fold split this notebook works on
current_k_fold = 2

# pandas display options: never truncate rows or columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# seed is derived from the fold index so every fold gets its own fixed seed
SEED = current_k_fold**3
np.random.seed(SEED)

# location of the pickled, pre-processed split for the selected fold
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

# deserialize the split; the file handle is only needed for the load itself
with open(path_to_pickle, 'rb') as f:
    data = pickle.load(f)

# unpack features / labels of the three partitions
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']

print('Data loaded successfully')

# imbalance ratio of each partition, computed from its label counts
IR_train, IR_val, IR_test = (
    mutils.imb_ratio(labels.value_counts())
    for labels in (y_train, y_val, y_test)
)

# the ratios should be (nearly) the same across partitions. pct = 0.172 such as in the paper!
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# partition sizes
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 602.68
Imbalance ratio in test data: 590.1

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373


# Hyperparameter search and tuning

## Grid Tuning

In [None]:
# xgb hyperparams grid search
#
# Defines the grid that was searched exhaustively. The search itself is kept
# commented out below (per the note there it ran on Google Colab); only the
# search space is defined when this cell runs.

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
search_space_grid_tuned_1 = {
    # general params
    'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability
    # XGBoost's logging parameter is called `verbosity`; `verbose` is not a
    # booster parameter, so the 0-3 scale below only applies under this key.
    'verbosity': [3], # 0 (silent), 1 (warning), 2 (info), 3 (debug)

    # tree booster params
    'max_leaves': [2**8],
    'learning_rate': [0.1, 0.3], # alias: eta, step size
    'gamma': [0.1, 0.3], # min loss reduction to create new tree split
    'max_depth': [7, 8], # max depth of tree
    'subsample': [0.7, 1],
    # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [0.9, 1], # l1 regularization term on weights
    'reg_lambda': [0.3, 1], # l2 regularization term on weights
    'scale_pos_weight': [1, 2, 4, 580], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
}

# NOTE(review): the commented-out search below refers to `param_grid` and a bare
# `get_device()`, neither of which is defined in the visible notebook. Before
# re-enabling it, pass `search_space_grid_tuned_1` and `mutils.get_device()`.

# clf = XGBClassifier(random_state=SEED, verbose=1, device=get_device())

# search  = GridSearchCV(clf, param_grid, scoring='roc_auc', verbose=1)
# search.fit(X_train, y_train)

# -> executed on Google Colab on A100 GPU, took ~ 37 minutes
# load clf from pickle
# clf_pkl = 'clf_xgb_cc.pkl'

# with open(clf_pkl, 'rb') as f:
#     data = pickle.load(f)
#     search = data['search']
#     results_df = data['results']

# results_df.head()

## Random Search Tuning

In [None]:
# search space
#
# Candidate values for the randomized hyperparameter search. Single-element
# lists pin a setting to one value for every sampled candidate.

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
search_space_random_tuned_1 = {
    # general params
    'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability

    # tree booster params
    'max_leaves': [0, 2**6, 2**8], # default 0
    'learning_rate': [0.002, 0.1, 0.3], # alias: eta, step size
    'gamma': [0.1, 0.3], # min loss reduction to create new tree split
    'max_depth': [6, 12, 20, 25], # max depth of tree
    'subsample': [0.4, 0.7, 1],
    'colsample_bytree': [0.4, 0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.9, 1], # l1 regularization term on weights, default = 0
    'reg_lambda': [0.3, 1], # l2 regularization term on weights, default = 1
    'scale_pos_weight': [1, 2, 4, 580], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    'eval_metric': ['auc'],
    'n_estimators': [100, 170, 500, 700, 1000, 2000, 5000],
    'booster': ['gbtree'],
    'tree_method': ['hist'],
}

clf = XGBClassifier(random_state=SEED, device=mutils.get_device())

# random_state pins which candidates get sampled. Without it the draw depends
# on the global NumPy RNG state, i.e. on which cells ran before this one.
random_search = RandomizedSearchCV(
    clf,
    search_space_random_tuned_1,
    scoring='roc_auc',
    verbose=2,
    cv=5,
    random_state=SEED,
)
random_search.fit(X_train, y_train)

In [None]:
# Take the best estimator found by the randomized search and display its
# parameters (the bare last expression is the cell output).
rs_search_winner = random_search.best_estimator_
rs_search_winner.get_params()

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding cell (same
# assignment, same displayed parameters); it can be removed from the notebook.
rs_search_winner = random_search.best_estimator_
rs_search_winner.get_params()

# Train the model

In [2]:
# Hand-written parameter sets for the XGBoost models trained below.
# The settings shared by all three sets are defined once and merged into each.

# weight of the positive class; control balance of positive and negative weights,
# useful for unbalanced classes. recommended to start with
# sum(negative instances) / sum(positive instances)
SCALE_POS_WEIGHT = 580

_common_xgb_params = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability
    "scale_pos_weight": SCALE_POS_WEIGHT,
    "eval_metric": "auc",
    "booster": "gbtree",
    "tree_method": "hist",
}

param_grid_search_tuned_1 = {
    **_common_xgb_params,
    # tree booster params
    "max_leaves": 2**8,
    "learning_rate": 0.3,  # alias: eta, step size
    "gamma": 0.3,  # min loss reduction to create new tree split
    "max_depth": 6,  # max depth of tree
    "subsample": 0.7,
    # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    "reg_alpha": 1,  # l1 regularization term on weights
    "reg_lambda": 0.3,  # l2 regularization term on weights
    "n_estimators": 170,
}

param_random_searched_tuned = {
    **_common_xgb_params,
    "colsample_bytree": 0.7,
    "gamma": 0.1,
    "learning_rate": 0.002,
    "max_depth": 12,
    "max_leaves": 0,
    "n_estimators": 5000,
    "reg_alpha": 0.9,
    "reg_lambda": 0.3,
    "subsample": 0.7,
}

# "Adjusted vanilla": only the shared settings, every other parameter is left unset.
param_adjusted_vanilla_xgb = dict(_common_xgb_params)

In [9]:
# Fit every XGBoost variant (plain, and wrapped in a Self-Paced Ensemble where
# enabled) on the training split, then evaluate the selected models on the
# test split.

# pandas.core.series.Series to DataFrame
# (re-running this cell is harmless: wrapping a DataFrame gives a DataFrame again)
y_train = df(y_train)
y_val = df(y_val)
y_test = df(y_test)

eval_set = [(X_val, y_val)]


def _timed_fit(label, model, **fit_kwargs):
    """Fit ``model`` on the training split and report how long it took.

    Parameters
    ----------
    label : str
        Prefix of the printed "<label> Execution time: HH:MM:SS" line.
    model : estimator
        Object exposing ``fit(X, y, **fit_kwargs)``.
    **fit_kwargs
        Forwarded unchanged to ``model.fit``.

    Returns
    -------
    float
        Elapsed seconds.
    """
    # perf_counter is monotonic, so the measurement is unaffected by system
    # clock adjustments during a long fit.
    start = time.perf_counter()
    model.fit(X_train, y_train, **fit_kwargs)
    elapsed = time.perf_counter() - start
    print(f'{label} Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed)))
    return elapsed


# --- Grid Tuned 1 ---
grid_tuned_1 = XGBClassifier(random_state=SEED, **param_grid_search_tuned_1)
grid_tuned_1_spe = SPE(estimator=grid_tuned_1, random_state=SEED)

grid_tuned_1.fit(X_train, y_train, eval_set=eval_set)
grid_tuned_1_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

# --- Random Searched ---
random_search_tuned_3 = XGBClassifier(random_state=SEED, **param_random_searched_tuned)
# random_search_tuned_3_spe = SPE(estimator=random_search_tuned_3, random_state=SEED)

elapsed_time = _timed_fit('RStuned', random_search_tuned_3, eval_set=eval_set)
# random_search_tuned_3_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

# --- Vanilla XGB ---
# using default params: https://xgboost.readthedocs.io/en/stable/parameter.html#
vanilla_xgb = XGBClassifier(random_state=SEED)
vanilla_xgb_spe = SPE(estimator=vanilla_xgb, random_state=SEED)

vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)
vanilla_xgb_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

# --- Slightly Adjusted Vanilla XGB ---
adjusted_vanilla_xgb = XGBClassifier(random_state=SEED, **param_adjusted_vanilla_xgb)
adjusted_vanilla_xgb_spe = SPE(estimator=adjusted_vanilla_xgb, random_state=SEED)

adjusted_vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)
elapsed_time = _timed_fit(
    'SPE',
    adjusted_vanilla_xgb_spe,
    eval_datasets={"valid": (X_val, y_val)},
)


# evaluate
print(f"result for k-fold {current_k_fold}:")
evaluate_model(
    [
        adjusted_vanilla_xgb_spe,
        random_search_tuned_3,
    ],
    X_test,
    y_test,
    names=[
        "Adjusted Vanilla XGB SPE",
        "Random Search Tuned 3 ",
    ],
    as_table=True
)

SPE Execution time: 00:00:08
result for k-fold 2:


Unnamed: 0,Model Name,AUCPRC,F1,G-Mean,MCC,Precision,Recall,ROCAUC,ACCURACY,TP,FP,TN,FN
0,Adjusted Vanilla XGB SPE,0.797,0.7872,0.8778,0.7871,0.8043,0.7708,0.9437,0.9993,37.0,9.0,28316.0,11.0


In [9]:
# Minimal cell: fit only the Self-Paced Ensemble built on the adjusted vanilla XGB.

# wrap the three label sets in DataFrames
y_train, y_val, y_test = (df(labels) for labels in (y_train, y_val, y_test))

eval_set = [(X_val, y_val)]

# base learner and the ensemble that uses it as its estimator
adjusted_vanilla_xgb = XGBClassifier(**param_adjusted_vanilla_xgb, random_state=SEED)
adjusted_vanilla_xgb_spe = SPE(random_state=SEED, estimator=adjusted_vanilla_xgb)
adjusted_vanilla_xgb_spe.fit(
    X_train,
    y_train,
    eval_datasets={"valid": (X_val, y_val)},
)

In [9]:
# Persist the two selected models: the random-search-tuned XGB as JSON and the
# Self-Paced Ensemble as a pickle.
path_to_models = './saved_models/XGB/'

# create the target directory first; open(..., 'wb') below raises
# FileNotFoundError when the directory is missing
os.makedirs(path_to_models, exist_ok=True)

# save best model
random_search_tuned_3.save_model(path_to_models + 'XGB_CC_rs_tuned_3.json')

# save best spe model - save_model() not working for SelfPacedEnsembleClassifier
path_to_spe_model = path_to_models + 'XGB_SPE_CC_adjusted_vanilla.pkl'

# save best spe model using pickle
with open(path_to_spe_model, 'wb') as file:
    pickle.dump(adjusted_vanilla_xgb_spe, file)

print('Models saved successfully')


Models saved successfully


In [4]:
# restore
# Reload both saved models from disk so they can be used without re-running
# the (long) training cell.
path_to_models = './saved_models/XGB/'
path_to_spe_model = path_to_models + 'XGB_SPE_CC_adjusted_vanilla.pkl'

# Build the estimator here rather than relying on an instance left behind by
# the training cell; otherwise this cell raises NameError on a fresh kernel.
# Same constructor arguments as used for training.
random_search_tuned_3 = XGBClassifier(random_state=SEED, **param_random_searched_tuned)
random_search_tuned_3.load_model(path_to_models + 'XGB_CC_rs_tuned_3.json')

# load best spe model using pickle
# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles written by this notebook / from a trusted location.
with open(path_to_spe_model, 'rb') as file:
    adjusted_vanilla_xgb_spe = pickle.load(file)

print('Models loaded successfully')

Models loaded successfully
