In [2]:
# load libraries
import pandas as pd
from pandas import DataFrame as df
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

# local module with helper utils
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model

# fold number of the pre-processed split used throughout this notebook
current_k_fold = 10

# seed NumPy's global RNG with a value derived from the fold number
SEED = current_k_fold ** 3
np.random.seed(SEED)

# pandas display: never truncate columns or rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# location of the pickled, pre-processed splits for this fold
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself
with open(path_to_pickle, 'rb') as f:
    data = pickle.load(f)

# the pickle holds a mapping with one features/labels pair per split
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']

print('Data loaded successfully')

# imbalance ratio of every split, computed from its label counts
IR_train, IR_val, IR_test = (
    mutils.imb_ratio(labels.value_counts())
    for labels in (y_train, y_val, y_test)
)

print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# size of every split
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 590.1
Imbalance ratio in test data: 602.68

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373


## Create NaN Values

In [3]:
def cutout_data(X, pct, filler=np.nan, col_pct=1.0, random_state=None):
    """Return a copy of ``X`` with a random block of cells replaced by ``filler``.

    A random subset of rows and a random subset of columns are drawn without
    replacement; every cell at their intersection is overwritten. ``X`` itself
    is not modified.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to copy and mask.
    pct : float
        Fraction of rows to mask, in ``[0, 1]``. The number of rows is
        ``int(len(X) * pct)``, i.e. rounded down.
    filler : scalar, default ``np.nan``
        Value written into the masked cells.
    col_pct : float, default ``1.0``
        Fraction of columns to mask inside the selected rows, in ``[0, 1]``
        (rounded down). The default masks every column, so the selected rows
        are blanked out completely.
    random_state : int or None, default ``None``
        Seed for a dedicated ``numpy.random.RandomState``. With ``None`` the
        draws come from NumPy's global RNG (the one ``np.random.seed`` seeds).

    Returns
    -------
    pandas.DataFrame
        The masked copy of ``X``.

    Raises
    ------
    ValueError
        If ``pct`` or ``col_pct`` lies outside ``[0, 1]``.
    """
    for arg_name, arg_value in (("pct", pct), ("col_pct", col_pct)):
        if not 0 <= arg_value <= 1:
            raise ValueError(f"{arg_name} must be in [0, 1], got {arg_value!r}")

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    X_missing = X.copy()

    n_rows = int(len(X_missing) * pct)
    n_cols = int(len(X_missing.columns) * col_pct)

    # Select by position, not by label: with a non-unique index (or duplicated
    # column names) label-based selection would overwrite every row/column
    # sharing a drawn label and mask more cells than requested.
    row_pos = rng.choice(len(X_missing), n_rows, replace=False)
    col_pos = rng.choice(len(X_missing.columns), n_cols, replace=False)

    # nothing to overwrite when either selection is empty
    if n_rows and n_cols:
        X_missing.iloc[row_pos, col_pos] = filler
    return X_missing

# One masked copy of the training features per missingness level. The generator
# is consumed left to right, so cutout_data is called in the order 0.1, 0.2,
# 0.3, 0.5.
X_missing10, X_missing20, X_missing30, X_missing50 = (
    cutout_data(X_train, share) for share in (0.1, 0.2, 0.3, 0.5)
)

# Sanity check: overall missing rate (mean of the per-column NaN rates) in percent.
print("Missing Data in X_missing10: ", X_missing10.isna().mean().mean().round(4) * 100)
print("Missing Data in X_missing20: ", X_missing20.isna().mean().mean().round(4) * 100)
print("Missing Data in X_missing30: ", X_missing30.isna().mean().mean().round(4) * 100)
print("Missing Data in X_missing50: ", X_missing50.isna().mean().mean().round(4) * 100)

Missing Data in X_missing10:  10.0
Missing Data in X_missing20:  20.0
Missing Data in X_missing30:  30.0
Missing Data in X_missing50:  50.0


## Feature imputation with scikit-learn

In [None]:
# from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

# enable experimental features
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer

# needed to give every pipeline its own, independent imputer (see loop below)
from sklearn.base import clone

# datasets: one (features, labels) pair per missingness level; labels are shared
datasets = {
    '10pct': (X_missing10, y_train),
    '20pct': (X_missing20, y_train),
    '30pct': (X_missing30, y_train),
    '50pct': (X_missing50, y_train)
}


# configure IterativeImputer
# Per-feature bounds: 34 features limited to [-1, 1] followed by 87 limited to
# [0, 1]. Only referenced by the commented-out iterative imputer below.
# NOTE(review): the 34/87 split presumably mirrors the column layout produced by
# the preprocessing step -- confirm before enabling the iterative imputer.
min_values = np.concatenate([np.full(34, -1.), np.zeros(87)])
max_values = np.concatenate([np.full(34, 1.), np.ones(87)])

# define imputers (used as unfitted templates; each pipeline gets a clone)
imputers = {
    'mean': SimpleImputer(strategy='mean'),
    'median': SimpleImputer(strategy='median'),
    #'iterative': IterativeImputer(random_state=SEED, missing_values=np.nan, min_value=min_values, max_value=max_values)
}

models = []
names = []

# for each dataset
for ds_name, ds in datasets.items():
    x_ds, y_ds = ds

    # impute with each imputer & fit on imputed data
    for imp_name, imputer in imputers.items():

        # create pipeline
        pipeline = Pipeline(steps=[
            # Clone the template: a pipeline fits the step objects it is handed,
            # so passing the same instance to every pipeline would leave all
            # stored models sharing one imputer fitted on the last dataset.
            ('imputer', clone(imputer)),
            ('ros', RandomOverSampler(random_state=SEED, sampling_strategy=1)),
            ('clf', XGBClassifier(random_state=SEED)) # vanilla XGB was the best for KDD dataset
        ])

        models.append(pipeline.fit(x_ds, y_ds))
        names.append(ds_name + '_' + imp_name)

# evaluate every fitted pipeline on the held-out test split
evaluate_model(
    model=models,
    X=X_test,
    y_true=df(y_test),
    names=names,
    as_table=True # 80m 40s
)