In [3]:
# load libraries
import pandas as pd
from pandas import DataFrame as df
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

# local module with helper utils
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model

# Fold of the pre-processed data used in this notebook; the seed is derived from it.
current_k_fold = 10
SEED = current_k_fold ** 3
np.random.seed(SEED)

# Lift the pandas display limits for columns and rows.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the pickled train / validation / test splits belonging to the chosen fold.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

with open(path_to_pickle, 'rb') as f:
    data = pickle.load(f)

# The file handle is no longer needed once the dict has been deserialized.
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']

print('Data loaded successfully')

# Imbalance ratio of every split, computed by the project helper from the
# per-class counts of the label series.
IR_train, IR_val, IR_test = (
    mutils.imb_ratio(labels.value_counts())
    for labels in (y_train, y_val, y_test)
)

for split_name, ratio in (
    ("training", IR_train),
    ("validation", IR_val),
    ("test", IR_test),
):
    print(f"Imbalance ratio in {split_name} data: {ratio}")

# Size of every split (the leading newline separates it from the ratios above).
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 590.1
Imbalance ratio in test data: 602.68

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373


## Create NaN Values

In [12]:
def cutout_data(X, pct, filler=np.nan, col_pct=1.0):
    """Return a copy of ``X`` in which a random block of cells is set to ``filler``.

    A ``pct`` share of the rows and a ``col_pct`` share of the columns are drawn
    without replacement through ``np.random.choice``, so the result depends on
    the global NumPy random state. Every cell where a drawn row meets a drawn
    column is overwritten. ``X`` itself is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to copy and corrupt.
    pct : float
        Share of rows to affect, between 0 and 1. The row count is truncated
        with ``int()``.
    filler : scalar, default ``np.nan``
        Value written into the selected cells.
    col_pct : float, default 1.0
        Share of columns to affect, between 0 and 1 (truncated with ``int()``).
        With the default every column is selected, i.e. the drawn rows are
        blanked completely.

    Returns
    -------
    pandas.DataFrame
        The modified copy of ``X``.

    Raises
    ------
    ValueError
        If ``pct`` or ``col_pct`` lies outside ``[0, 1]``.
    """
    for label, share in (("pct", pct), ("col_pct", col_pct)):
        if not 0 <= share <= 1:
            raise ValueError(f"{label} must be between 0 and 1, got {share!r}")

    X_missing = X.copy()

    n_rows = int(len(X_missing) * pct)
    n_cols = int(len(X_missing.columns) * col_pct)

    # Rows are drawn before columns so that the default call consumes the
    # global random stream in the same order as before col_pct existed.
    pct_idx = np.random.choice(X_missing.index, n_rows, replace=False)
    pct_cols = np.random.choice(X_missing.columns, n_cols, replace=False)

    X_missing.loc[pct_idx, pct_cols] = filler
    return X_missing

# Two corrupted copies of the training features: 10 % and 20 % of the rows.
# The order of the two calls matters because both draw from the global NumPy RNG.
X_missing10 = cutout_data(X_train, 0.1)
X_missing20 = cutout_data(X_train, 0.2)

# Overall share of NaN cells, in percent (mean over rows, then over columns).
for frame_label, frame in (("X_missing10", X_missing10), ("X_missing20", X_missing20)):
    nan_share = frame.isna().mean().mean().round(4) * 100
    print(f"Missing Data in {frame_label}: ", nan_share)

Missing Data in X_missing10:  10.0
Missing Data in X_missing20:  20.0


## Feature imputation with scikit-learn

In [7]:
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# enable experimental features
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

# clone() hands every pipeline its own unfitted copy of an imputer (see loop below)
from sklearn.base import clone

# XGB params
param_random_searched_tuned_2 = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "device": mutils.get_device(),
    "scale_pos_weight": 580,
    "tree_method": "hist",
    "eval_metric": "auc",
}

# datasets: name -> (features with injected NaNs, labels).
# cutout_data only overwrites cell values on a copy of X_train, so the rows of
# both frames still line up with y_train.
datasets = {
    '10pct': (X_missing10, y_train),
    '20pct': (X_missing20, y_train),
}

# define imputers (used as templates; each pipeline gets its own clone)
imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    'iterative': IterativeImputer(random_state=SEED),
}

models = []
names = []

# for each dataset: impute with each imputer & fit the classifier on the result
for ds_name, (x_ds, y_ds) in datasets.items():
    for imp_name, imputer in imputers.items():

        # Without clone() the 10pct and 20pct pipelines would hold the very same
        # imputer object, and fitting the later one would overwrite the state
        # the earlier pipeline relies on.
        pipeline = Pipeline(steps=[
            ('imputer', clone(imputer)),
            ('clf', XGBClassifier(random_state=SEED, **param_random_searched_tuned_2)),
        ])

        models.append(pipeline.fit(x_ds, y_ds))
        names.append(ds_name + '_' + imp_name)

# evaluate all fitted pipelines on the held-out test split
evaluate_model(
    model=models,
    X=X_test,
    y_true=df(y_test),
    names=names,
    as_table=True
)

Unnamed: 0,Model Name,AUCPRC,F1,G-Mean,MCC,Precision,Recall,ROCAUC,ACCURACY,TP,FP,TN,FN
3,20pct_mean,0.9334,0.9032,0.9452,0.9031,0.913,0.8936,0.9937,0.9997,42.0,4.0,28322.0,5.0
5,20pct_iterative,0.9334,0.9032,0.9452,0.9031,0.913,0.8936,0.9937,0.9997,42.0,4.0,28322.0,5.0
4,20pct_median,0.93,0.9032,0.9452,0.9031,0.913,0.8936,0.994,0.9997,42.0,4.0,28322.0,5.0
0,10pct_mean,0.9287,0.8936,0.9452,0.8934,0.8936,0.8936,0.991,0.9996,42.0,5.0,28321.0,5.0
2,10pct_iterative,0.9287,0.8936,0.9452,0.8934,0.8936,0.8936,0.991,0.9996,42.0,5.0,28321.0,5.0
1,10pct_median,0.9225,0.9149,0.9564,0.9148,0.9149,0.9149,0.9895,0.9997,43.0,4.0,28322.0,4.0
