In [4]:
# load libraries
import pandas as pd
import time
from pandas import DataFrame as df
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np
from imbens.ensemble import SelfPacedEnsembleClassifier as SPE

# local module with helper utils
import model_utils as mutils
from model_utils.evaluation import evaluate_model, get_metrics

current_k_fold=10
# set seed
SEED=current_k_fold**3
np.random.seed(SEED)

# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# deserialize pre-processed data
path_to_pickle = f'../data/kddcup/kdd_preprocessed_k{current_k_fold}.pkl'


with open(path_to_pickle, 'rb') as f:
    data = pickle.load(f)
    X_train = data['X_train']
    y_train = data['y_train']

    X_val = data['X_val']
    y_val = data['y_val']

    X_test = data['X_test']
    y_test = data['y_test']

    le = data['label_enc']
    oh_enc = data['oh_enc']


print('Data loaded successfully')

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(y_train.value_counts())
IR_val = mutils.imb_ratio(y_val.value_counts())
IR_test = mutils.imb_ratio(y_test.value_counts())

# print imbalance ratios. They should be (nearly) the same. 
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

Data loaded successfully
Imbalance ratio in training data: 4.13
Imbalance ratio in validation data: 4.13
Imbalance ratio in test data: 4.13


In [7]:
y_train.value_counts()

label
0    200349
1     48474
Name: count, dtype: int64

# Hyperparameter finding and tuning

## Grid Tuning

In [2]:
# xbg hyperparams grid search

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
param_grid = {
       'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability
    'verbose': [3], # 0 (silent), 1 (warning), 2 (info), 3 (debug)

    'n_estimators': [150, 170, 200, 250],
    'booster': ['gbtree'],
    'tree_method': ['hist'],

    # tree booster params
    'max_leaves': [2**6, 2**8],
    'learning_rate': [0.3], # alias: eta, step size
    'gamma': [0.3], # min loss reduction to create new tree split
    'max_depth': [5,6, 7], # max depth of tree
    'subsample': [0.7],
   # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [1], # l1 regularization term on weights
    'reg_lambda': [0.3], # l2 regularization term on weights
    'scale_pos_weight': [2,4], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
   
   }

clf = XGBClassifier(random_state=SEED, verbose=1, device=mutils.get_device())

# search  = GridSearchCV(clf, param_grid, scoring='roc_auc', verbose=1)
# search.fit(X_train, y_train)

# -> executed on Google Colab on A100 GPU, took ~ 37 minutes
# load clf from pickle
# clf_pkl = 'clf_xgb_cc.pkl'

# with open(clf_pkl, 'rb') as f:
#     data = pickle.load(f)
#     search = data['search']
#     results_df = data['results']

# results_df.head()

## RandomSearch Tuning

In [11]:
# search space

# hyperparams grid; params from https://xgboost.readthedocs.io/en/latest/parameter.html & https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/#:~:text=to%20Booster%20parameters.-,Booster%20Parameters,-Though%C2%A0there%20are
search_space_random_tuned_1= {
     # general params
    'device': [mutils.get_device()], # specify the device to be used for XGBoost
    'objective': ['binary:logistic'], # logistic regression for binary classification, output probability

    # tree booster params
    'max_leaves': [0, 2**6, 2**8], # default 0
    'learning_rate': [0.002, 0.1, 0.3], # alias: eta, step size
    'gamma': [0.1, 0.3], # min loss reduction to create new tree split
    'max_depth': [6, 12, 20, 25], # max depth of tree
    'subsample': [0.4, 0.7, 1],
    'colsample_bytree': [0.4, 0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    'reg_alpha': [0, 0.9, 1], # l1 regularization term on weights, default = 0
    'reg_lambda': [0.3, 1], # l2 regularization term on weights, default = 1
    'scale_pos_weight': [1, 2, 4, 580], # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    'eval_metric': ['auc'],
    'n_estimators': [100, 170, 500, 700, 1000, 2000, 5000],
    'booster': ['gbtree'],
    'tree_method': ['hist'],
}

clf = XGBClassifier(random_state=SEED, device=mutils.get_device())

random_search  = RandomizedSearchCV(clf, search_space_random_tuned_1, scoring='roc_auc', verbose=3, cv=5)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END booster=gbtree, colsample_bytree=1, device=cpu, eval_metric=auc, gamma=0.3, learning_rate=0.002, max_depth=20, max_leaves=0, n_estimators=700, objective=binary:logistic, reg_alpha=0, reg_lambda=0.3, scale_pos_weight=4, subsample=1, tree_method=hist;, score=0.999 total time=  16.2s
[CV 2/5] END booster=gbtree, colsample_bytree=1, device=cpu, eval_metric=auc, gamma=0.3, learning_rate=0.002, max_depth=20, max_leaves=0, n_estimators=700, objective=binary:logistic, reg_alpha=0, reg_lambda=0.3, scale_pos_weight=4, subsample=1, tree_method=hist;, score=0.999 total time=  15.9s
[CV 3/5] END booster=gbtree, colsample_bytree=1, device=cpu, eval_metric=auc, gamma=0.3, learning_rate=0.002, max_depth=20, max_leaves=0, n_estimators=700, objective=binary:logistic, reg_alpha=0, reg_lambda=0.3, scale_pos_weight=4, subsample=1, tree_method=hist;, score=0.999 total time=  16.6s
[CV 4/5] END booster=gbtree, colsample_bytree=1, devic

In [12]:
xgb_best = random_search.best_estimator_
xgb_best.get_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': 'gbtree',
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 0.7,
 'device': device(type='cpu'),
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': 'auc',
 'feature_types': None,
 'gamma': 0.1,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.1,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': 20,
 'max_leaves': 256,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': 1000,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': 123,
 'reg_alpha': 1,
 'reg_lambda': 1,
 'sampling_method': None,
 'scale_pos_weight': 4,
 'subsample': 1,
 'tree_method': 'hist',
 'validate_parameters': None,
 'verbosity': None}

# Train the model

In [5]:
param_grid_search_tuned_1 = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability,
    # tree booster params
    "max_leaves": 2**6,
    "eta": 0.1,  # alias: learning_rate, step size
    "gamma": 0.3,  # min loss reduction to create new tree split
    "max_depth": 5,  # max depth of tree
    "subsample": 0.7,
    # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    "reg_alpha": 1,  # l1 regularization term on weights
    "reg_lambda": 0.3,  # l2 regularization term on weights
    "scale_pos_weight": 2,  # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances),
    "eval_metric": "auc",
    "n_estimators": 350,
    "booster": "gbtree",
    "tree_method": "hist",
}

param_grid_search_tuned_2 = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability
    # tree booster params
    "max_leaves": 2**6,
    "learning_rate": 0.3,  # alias: eta, step size
    "gamma": 0.3,  # min loss reduction to create new tree split
    "max_depth": 5,  # max depth of tree
    "subsample": 0.7,
    # 'colsample_bytree': [0.5, 0.7, 1], # subsample ratio of columns when constructing each tree
    "reg_alpha": 1,  # l1 regularization term on weights
    "reg_lambda": 0.3,  # l2 regularization term on weights
    "scale_pos_weight": 2,  # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    "eval_metric": "auc",
    "n_estimators": 250,
    "booster": "gbtree",
    "tree_method": "hist",
}

param_random_search_tuned_1 = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "colsample_bytree": 0.4,
    "device": mutils.get_device(),
    "eval_metric": "auc",
    "gamma": 0.1,
    "learning_rate": 0.3,
    "max_depth": 20,
    "max_leaves": 64,
    "n_estimators": 170,
    "reg_alpha": 0.9,
    "reg_lambda": 0.3,
    "scale_pos_weight": 2,
    "subsample": 1,
    "tree_method": "hist",
}

param_random_search_tuned_2 = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "colsample_bytree": 0.7,
    "device": mutils.get_device(),
    "eval_metric": "auc",
    "gamma": 0.1,
    "learning_rate": 0.1,
    "max_depth": 20,
    "max_leaves": 256,
    "n_estimators": 1000,
    "reg_alpha": 1,
    "reg_lambda": 1,
    "scale_pos_weight": 4,
    "subsample": 1,
    "tree_method": "hist",
}


param_adjusted_vanilla_xgb = {
    # general params
    "device": mutils.get_device(),  # specify the device to be used for XGBoost
    "objective": "binary:logistic",  # logistic regression for binary classification, output probability
    "scale_pos_weight": 4,  # control balance of positive and negative weights, useful for unbalanced classes. recommended to start with sum(negative instances) / sum(positive instances)
    "eval_metric": "auc",
    "booster": "gbtree",
    "tree_method": "hist",
}

In [7]:
# pandas.core.series.Series to DataFrame
y_train = df(y_train)
y_val = df(y_val)
y_test = df(y_test)

eval_set = [(X_val, y_val)]

"""
Grid Tuned 1
"""
grid_tuned_1 = XGBClassifier(random_state=SEED, **param_grid_search_tuned_1)
grid_tuned_1_spe = SPE(estimator=grid_tuned_1, random_state=SEED)

grid_tuned_1.fit(X_train, y_train, eval_set=eval_set)
grid_tuned_1_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

"""
Grid Tuned 2
"""
grid_tuned_2 = XGBClassifier(random_state=SEED, **param_grid_search_tuned_2)
grid_tuned_2_spe = SPE(estimator=grid_tuned_1, random_state=SEED)

grid_tuned_2.fit(X_train, y_train, eval_set=eval_set)
grid_tuned_2_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})


"""
Random Searched 1
"""
random_search_tuned_1 = XGBClassifier(random_state=SEED, **param_random_search_tuned_1)
random_search_tuned_1_spe = SPE(estimator=random_search_tuned_1, random_state=SEED)

random_search_tuned_1.fit(X_train, y_train, eval_set=eval_set)
random_search_tuned_1_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

"""
Random Searched 2
"""
random_search_tuned_2 = XGBClassifier(random_state=SEED, **param_random_search_tuned_2)
random_search_tuned_2_spe = SPE(estimator=random_search_tuned_2, random_state=SEED)

start_spe = time.time()
random_search_tuned_2.fit(X_train, y_train, eval_set=eval_set)
random_search_tuned_2_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})
print(f"xgb+spe: {time.time()-start_spe}s")
"""
Vanilla XGB

- using default params: https://xgboost.readthedocs.io/en/stable/parameter.html#
"""
vanilla_xgb = XGBClassifier(random_state=SEED)
vanilla_xgb_spe = SPE(estimator=vanilla_xgb, random_state=SEED)
start_normal = time.time()
vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)
vanilla_xgb_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})
print(f"xgb+normal: {time.time()-start_spe}s")
"""
Slightly Adjusted Vanilla XGB
"""
adjusted_vanilla_xgb = XGBClassifier(random_state=SEED, **param_adjusted_vanilla_xgb)
adjusted_vanilla_xgb_spe = SPE(estimator=adjusted_vanilla_xgb, random_state=SEED)

adjusted_vanilla_xgb.fit(X_train, y_train, eval_set=eval_set)
adjusted_vanilla_xgb_spe.fit(X_train, y_train, eval_datasets={"valid": (X_val, y_val)})

xgb+spe: 175.08385610580444s
[0]	validation_0-logloss:0.30010
[1]	validation_0-logloss:0.21863
[2]	validation_0-logloss:0.16767
[3]	validation_0-logloss:0.13203
[4]	validation_0-logloss:0.10739
[5]	validation_0-logloss:0.09045
[6]	validation_0-logloss:0.07840
[7]	validation_0-logloss:0.06882
[8]	validation_0-logloss:0.06087
[9]	validation_0-logloss:0.05514
[10]	validation_0-logloss:0.05149
[11]	validation_0-logloss:0.04863
[12]	validation_0-logloss:0.04573
[13]	validation_0-logloss:0.04422
[14]	validation_0-logloss:0.04268
[15]	validation_0-logloss:0.04164
[16]	validation_0-logloss:0.04095
[17]	validation_0-logloss:0.04023
[18]	validation_0-logloss:0.03925
[19]	validation_0-logloss:0.03881
[20]	validation_0-logloss:0.03857
[21]	validation_0-logloss:0.03828
[22]	validation_0-logloss:0.03760
[23]	validation_0-logloss:0.03688
[24]	validation_0-logloss:0.03659
[25]	validation_0-logloss:0.03647
[26]	validation_0-logloss:0.03631
[27]	validation_0-logloss:0.03589
[28]	validation_0-logloss:0.0

'\nSlightly Adjusted Vanilla XGB\n'

In [6]:
mutils.evaluate_model(
    [
        vanilla_xgb,
        vanilla_xgb_spe,
		
        adjusted_vanilla_xgb,
        adjusted_vanilla_xgb_spe,
		
        random_search_tuned_1,
        random_search_tuned_1_spe,
		
		random_search_tuned_2,
		random_search_tuned_2_spe,
		
        grid_tuned_1,
        grid_tuned_1_spe,
		
        grid_tuned_2,
        grid_tuned_2_spe,
    ],
    X_test,
    y_test,
    names=[
        "Vanilla XGB",
        "Vanilla XGB SPE",
		
        "Adjusted Vanilla XGB",
        "Adjusted Vanilla XGB SPE",
		
        "Random Searched Tuned 1",
        "Random Searched Tuned 1 SPE",
		
		"Random Searched Tuned 2 SPE",
		"Random Searched Tuned 2",
		
        "Grid Tuned Model 1",
        "Grid Tuned Model 1 SPE",
		
        "Grid Tuned Model 2",
        "Grid Tuned Model 2 SPE",
    ],
)

Unnamed: 0,Model Name,AUC PRC,ROC AUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Vanilla XGB,0.955,0.9687,0.95,0.95,0.938,0.9805,5752.0,297.0,24746.0,308.0,0.9509,0.9492
5,Random Searched Tuned 1 SPE,0.9527,0.9419,0.9379,0.9397,0.9266,0.9772,5357.0,6.0,25037.0,703.0,0.9989,0.884
7,Random Searched Tuned 2,0.9507,0.9822,0.948,0.9487,0.9359,0.9789,5985.0,582.0,24461.0,75.0,0.9114,0.9876
9,Grid Tuned Model 1 SPE,0.9506,0.9397,0.9352,0.937,0.9235,0.9762,5331.0,10.0,25033.0,729.0,0.9981,0.8797
11,Grid Tuned Model 2 SPE,0.9506,0.9397,0.9352,0.937,0.9235,0.9762,5331.0,10.0,25033.0,729.0,0.9981,0.8797
1,Vanilla XGB SPE,0.949,0.9372,0.9327,0.9348,0.9208,0.9754,5300.0,5.0,25038.0,760.0,0.9991,0.8746
4,Random Searched Tuned 1,0.9464,0.9835,0.9434,0.9447,0.9308,0.9768,6027.0,690.0,24353.0,33.0,0.8973,0.9946
8,Grid Tuned Model 1,0.945,0.9837,0.9418,0.9432,0.929,0.976,6038.0,724.0,24319.0,22.0,0.8929,0.9964
10,Grid Tuned Model 2,0.9445,0.9833,0.9413,0.9427,0.9283,0.9758,6034.0,727.0,24316.0,26.0,0.8925,0.9957
6,Random Searched Tuned 2 SPE,0.9435,0.9838,0.9401,0.9417,0.9271,0.9752,6047.0,757.0,24286.0,13.0,0.8887,0.9979


In [7]:
# path_to_models = './saved_models/XGB/'

# # save best model
# vanilla_xgb.save_model(path_to_models + 'XGB_KDD_vanilla_xgb.json')

# # save best spe model - save_model() not working for SelfPacedEnsembleClassifier
# path_to_spe_model = path_to_models + 'XGB_SPE_KDD_random_search_1.pkl'

# # save best spe model using pickle
# # with open(path_to_spe_model, 'wb') as file:
# #     pickle.dump(random_search_tuned_1_spe, file)

# # print('Models saved successfully')

Models saved successfully


In [11]:
# load models
path_to_models = './saved_models/XGB/'

vanilla_xgb.load_model(path_to_models + 'XGB_KDD_vanilla_xgb.json')

# unpickle spe model
with open(path_to_models + 'XGB_SPE_KDD_random_search_1.pkl', 'rb') as file:
	random_search_tuned_1_spe = pickle.load(file)
