In [50]:
# load libraries
import pandas as pd
from pandas import DataFrame as df
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
import time

# local module with helper utils
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model

from imblearn.over_sampling import RandomOverSampler

current_k_fold = 10
# set seed
SEED=current_k_fold**3
np.random.seed(SEED)


# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# deserialize pre-processed data
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

with open(path_to_pickle, 'rb') as f:
    data = pickle.load(f)
    X_train = data['X_train']
    y_train = data['y_train']

    X_val = data['X_val']
    y_val = data['y_val']

    X_test = data['X_test']
    y_test = data['y_test']

print('Data loaded successfully')

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(y_train.value_counts())
IR_val = mutils.imb_ratio(y_val.value_counts())
IR_test = mutils.imb_ratio(y_test.value_counts())

# print imbalance ratios. They should be (nearly) the same. pct = 0.172 such as in the paper!
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

"""
## Oversample minority class for training only
"""

ros = RandomOverSampler(random_state=SEED, sampling_strategy=1)
X_train, y_train = ros.fit_resample(X_train.to_numpy(), y_train.to_numpy())

res_value_counts = df(y_train).value_counts()

print("New Imbalance ratio:", mutils.imb_ratio(res_value_counts))

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 590.1
Imbalance ratio in test data: 602.68

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373
New Imbalance ratio: 1.0


# Hyperparameter finding and tuning

## Grid Tuning

In [None]:
# xbg hyperparams grid search

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
search_space_grid_tuned_1= {
     # general params
    'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability
    'verbose': [3], # 0 (silent), 1 (warning), 2 (info), 3 (debug)

    # tree booster params
    'max_leaves': [2**8],
    'learning_rate': [0.1, 0.3], # alias: eta, step size
    'gamma': [0.1, 0.3], # min loss reduction to create new tree split
    'max_depth': [7, 8], # max depth of tree
    'subsample': [0.7, 1],
   # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [0.9, 1], # l1 regularization term on weights
    'reg_lambda': [0.3, 1], # l2 regularization term on weights
    'scale_pos_weight': [1, 2, 4, 580], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
}

# clf = XGBClassifier(random_state=SEED, verbose=1, device=mutils.get_device())

# search  = GridSearchCV(clf, param_grid, scoring='roc_auc', verbose=1)
# search.fit(X_train, y_train)

# -> executed on Google Colab on A100 GPU, took ~ 37 minutes
# load clf from pickle
# clf_pkl = 'clf_xgb_cc.pkl'

# with open(clf_pkl, 'rb') as f:
#     data = pickle.load(f)
#     search = data['search']
#     results_df = data['results']

# results_df.head()

## RandomSearch Tuning

In [None]:
# search space

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
search_space_random_tuned_1= {
     # general params
    'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability

    # tree booster params
    'max_leaves': [0, 2**6, 2**8], # default 0
    'learning_rate': [0.002, 0.1, 0.3], # alias: eta, step size
    'gamma': [0.1, 0.3], # min loss reduction to create new tree split
    'max_depth': [6, 12, 20, 25], # max depth of tree
    'subsample': [0.4, 0.7, 1],
    'colsample_bytree': [0.4, 0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.9, 1], # l1 regularization term on weights, default = 0
    'reg_lambda': [0.3, 1], # l2 regularization term on weights, default = 1
    'scale_pos_weight': [1, 2, 4, 580], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    'eval_metric': ['auc'],
    'n_estimators': [100, 170, 500, 700, 1000, 2000, 5000],
    'booster': ['gbtree'],
    'tree_method': ['hist'],
}

clf = XGBClassifier(random_state=SEED, device=mutils.get_device())

random_search  = RandomizedSearchCV(clf, search_space_random_tuned_1, scoring='roc_auc', verbose=2, cv=5)
# random_search.fit(X_train, y_train)

In [None]:
rs_search_winner = random_search.best_estimator_
rs_search_winner.get_params()

In [None]:
rs_search_winner = random_search.best_estimator_
rs_search_winner.get_params()

# Train the model

In [19]:
param_grid_search_tuned_1 = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability,
    # tree booster params
    "max_leaves": 2**8,
    "learning_rate": 0.3,  # alias: learning_rate, step size
    "gamma": 0.3,  # min loss reduction to create new tree split
    "max_depth": 6,  # max depth of tree
    "subsample": 0.7,
    # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    "reg_alpha": 1,  # l1 regularization term on weights
    "reg_lambda": 0.3,  # l2 regularization term on weights
    "scale_pos_weight": 580,  # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances),
    "eval_metric": "auc",
    "n_estimators": 170,
    "booster": "gbtree",
    "tree_method": "hist",
}

param_random_searched_tuned = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "colsample_bytree": 0.7,
    "device": mutils.get_device(),
    "gamma": 0.1,
    "learning_rate": 0.1,
    "max_depth": 20,
    "max_leaves": 256,
    "n_estimators": 2000,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "scale_pos_weight": 2,
    "subsample": 0.4,
    "tree_method": "hist",
	'eval_metric': 'auc',
}

param_adjusted_vanilla_xgb = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability
    "scale_pos_weight": 580,  # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    "eval_metric": "auc",
    "booster": "gbtree",
    "tree_method": "hist",
}

In [51]:
# pandas.core.series.Series/Numpy to DataFrame
y_train = df(y_train)
y_val = df(y_val)
y_test = df(y_test)

# RandomOversampling removes column and returns numpy, reverse it
X_train = df(X_train)
X_train.columns = X_val.columns

eval_set = [(X_val, y_val)]

"""
Grid Tuned 1
"""
grid_tuned_1 = XGBClassifier(random_state=SEED, **param_grid_search_tuned_1)

grid_tuned_1.fit(X_train, y_train, eval_set=eval_set)

"""
Random Searched
"""
st = time.time()
random_search_tuned = XGBClassifier(random_state=SEED, **param_random_searched_tuned)

random_search_tuned.fit(X_train, y_train, eval_set=eval_set)
elapsed = time.time() - st
print('RStuned Execution time:', time.strftime("%H:%M:%S", time.gmtime(elapsed)))

"""
Vanilla XGB

- using default params: https://xgboost.readthedocs.io/en/stable/parameter.html#
"""
vanilla_xgb = XGBClassifier(random_state=SEED)

vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)

"""
Slightly Adjusted Vanilla XGB
"""
adjusted_vanilla_xgb = XGBClassifier(random_state=SEED, **param_adjusted_vanilla_xgb)

adjusted_vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)

[0]	validation_0-auc:0.69883
[1]	validation_0-auc:0.91949
[2]	validation_0-auc:0.92790
[3]	validation_0-auc:0.92186
[4]	validation_0-auc:0.93331
[5]	validation_0-auc:0.92481
[6]	validation_0-auc:0.92319
[7]	validation_0-auc:0.91307
[8]	validation_0-auc:0.91324
[9]	validation_0-auc:0.88812
[10]	validation_0-auc:0.89063
[11]	validation_0-auc:0.89146
[12]	validation_0-auc:0.89852
[13]	validation_0-auc:0.90275
[14]	validation_0-auc:0.88876
[15]	validation_0-auc:0.87526
[16]	validation_0-auc:0.87767
[17]	validation_0-auc:0.86637
[18]	validation_0-auc:0.86662
[19]	validation_0-auc:0.86505
[20]	validation_0-auc:0.86341
[21]	validation_0-auc:0.86315
[22]	validation_0-auc:0.85979
[23]	validation_0-auc:0.86032
[24]	validation_0-auc:0.86123
[25]	validation_0-auc:0.86174
[26]	validation_0-auc:0.86214
[27]	validation_0-auc:0.86152
[28]	validation_0-auc:0.86079
[29]	validation_0-auc:0.85938
[30]	validation_0-auc:0.86033
[31]	validation_0-auc:0.85968
[32]	validation_0-auc:0.86011
[33]	validation_0-au

In [52]:
print("current k fold: ", current_k_fold)
evaluate_model(
    [
        vanilla_xgb,
        adjusted_vanilla_xgb,
		random_search_tuned,
        grid_tuned_1,

    ],
    X_test,
    y_test,
    names=[
        "Vanilla XGB",
		"adjusted_vanilla_xgb",
        "Random Search Tuned",
        "GridTuned 1",
    ],
	as_table=True
)

current k fold:  10


Unnamed: 0,Model Name,AUCPRC,F1,G-Mean,MCC,Precision,Recall,ROCAUC,ACCURACY,TP,FP,TN,FN
2,Random Search Tuned,0.9335,0.9231,0.9453,0.9235,0.9545,0.8936,0.9901,0.9998,42.0,2.0,28324.0,5.0
1,adjusted_vanilla_xgb,0.9271,0.8958,0.9564,0.8959,0.8776,0.9149,0.993,0.9996,43.0,6.0,28320.0,4.0
0,Vanilla XGB,0.9108,0.9231,0.9453,0.9235,0.9545,0.8936,0.9892,0.9998,42.0,2.0,28324.0,5.0
3,GridTuned 1,0.9091,0.8431,0.9563,0.8455,0.7818,0.9149,0.9789,0.9994,43.0,12.0,28314.0,4.0
