Model Architecture from https://keras.io/examples/structured_data/imbalanced_classification/

In [1]:
import pickle
import time

# imoprt data science libraries
import pandas as pd
from pandas import DataFrame as df
import matplotlib.pyplot as plt
import numpy as np

# Import ML libraries
import keras
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model, table
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# For SelfPacedEnsemble
from imbens.ensemble import SelfPacedEnsembleClassifier as SPE
from scikeras.wrappers import KerasClassifier

current_k_fold = 3
# set seed
SEED=current_k_fold**3
np.random.seed(SEED)

# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# deserialize pre-processed data
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

with open(path_to_pickle, "rb") as f:
    data = pickle.load(f)
    X_train = data["X_train"].to_numpy()
    y_train = data["y_train"].to_numpy()

    X_val = data["X_val"].to_numpy()
    y_val = data["y_val"].to_numpy()

    X_test = data["X_test"].to_numpy()
    y_test = data["y_test"].to_numpy()

    col_names = data["col_names"]

print("Data loaded successfully")

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(data["y_train"].value_counts())
IR_val = mutils.imb_ratio(data["y_val"].value_counts())
IR_test = mutils.imb_ratio(data["y_test"].value_counts())

# print imbalance ratios. They should be (nearly) the same. pct = 0.172 such as in the paper!
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

# Reshape
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

# Set Weight
res_value_counts = df(y_train).value_counts()
weight_for_0 = 1.0 / res_value_counts[0]
weight_for_1 = 1.0 / res_value_counts[1]

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

# Normalize Data
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 602.68
Imbalance ratio in test data: 590.1

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373


## Build the Model

In [3]:
np.shape(X_train)[1]

29

In [4]:
{0: weight_for_0, 1: weight_for_1}

{0: 4.413023715589447e-06, 1: 0.0026455026455026454}

In [52]:
"""
## Credits for Model: https://keras.io/examples/structured_data/imbalanced_classification/
"""

# Hyperparams:
data_dim = np.shape(X_train)[1]
batch_size=2048
epochs=50
learning_rate=1e-2
Adam = keras.optimizers.Adam(learning_rate)
class_weight = {0: weight_for_0, 1: weight_for_1}

model = keras.Sequential(
    [
        keras.Input(shape=(data_dim,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

metrics = [
    keras.metrics.AUC(name="roc_auc", curve='ROC'),
    keras.metrics.AUC(name='auc_prc', curve="PR"),
	keras.metrics.F1Score(name="f1"),
	
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=Adam, loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint(f"./ckp/MLP_Normal_CC/k{current_k_fold}"+"_epoch_{epoch}.keras")]


hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    class_weight=class_weight,
)

Epoch 1/50
111/111 - 2s - 20ms/step - auc_prc: 0.3840 - f1: 0.0033 - fn: 43.0000 - fp: 30708.0000 - loss: 2.3285e-06 - precision: 0.0108 - recall: 0.8862 - roc_auc: 0.9557 - tn: 195894.0000 - tp: 335.0000 - val_auc_prc: 0.5805 - val_f1: 0.0033 - val_fn: 5.0000 - val_fp: 1260.0000 - val_loss: 0.1435 - val_precision: 0.0323 - val_recall: 0.8936 - val_roc_auc: 0.9808 - val_tn: 27066.0000 - val_tp: 42.0000
Epoch 2/50
111/111 - 1s - 8ms/step - auc_prc: 0.4638 - f1: 0.0033 - fn: 29.0000 - fp: 9200.0000 - loss: 1.4274e-06 - precision: 0.0365 - recall: 0.9233 - roc_auc: 0.9816 - tn: 217402.0000 - tp: 349.0000 - val_auc_prc: 0.2677 - val_f1: 0.0033 - val_fn: 3.0000 - val_fp: 3190.0000 - val_loss: 0.3161 - val_precision: 0.0136 - val_recall: 0.9362 - val_roc_auc: 0.9797 - val_tn: 25136.0000 - val_tp: 44.0000
Epoch 3/50
111/111 - 1s - 8ms/step - auc_prc: 0.4467 - f1: 0.0033 - fn: 25.0000 - fp: 8861.0000 - loss: 1.1370e-06 - precision: 0.0383 - recall: 0.9339 - roc_auc: 0.9897 - tn: 217741.0000 - 

In [53]:
# getting best epoch 
metric_history = hist.history["val_auc_prc"]
best_epoch = metric_history.index(max(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))
# or 
# metric_history = hist.history["val_loss"]
# best_epoch = metric_history.index(min(metric_history)) + 1
# print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))

# best_epoch=42
model.load_weights(f'./ckp/MLP_Normal_CC/k{current_k_fold}_epoch_{best_epoch}.keras')

Best epoch:  11 Best Value:  0.7319148182868958


# Self Paced Ensemble for MLP

In [54]:
# wrap Keras MLP into a scikit-learn API-compatible Wrapper class with `scikeras.KerasClassifier`
model2 = model

mlp = KerasClassifier(model2,
					  epochs=epochs, 
					  optimizer=Adam, 
					  batch_size=batch_size,
					  metrics=metrics,
					  loss="binary_crossentropy",
					  random_state=SEED,
                      verbose=0,
					  class_weight=class_weight
					  )
# Initialize Instance. Note: Do not run mlp.fit(). This would retrain the model.
mlp.initialize(X_train, y_train)

# create SelfPacedEnsembleClassifier
mlp_spe = SPE(
	estimator=mlp,
    verbose=0,
	n_estimators=50, # number of mlp's in ensemble (default: 50)
	random_state=SEED)

# Fit SPE-boosted MLP
mlp_spe.fit(
	X_train,
	y_train,
	eval_datasets={"valid": (X_val, y_val)},
	)

# path_to_spe_model = './saved_models/MLP/MLP_SPE_CC.pkl'

# # load model
# with open(path_to_spe_model, 'rb') as f:
#     mlp_spe = pickle.load(f)

print("results for fold: ", current_k_fold)
evaluate_model([model, mlp_spe], X_test, y_test, as_table=True)

results for fold:  3


Unnamed: 0,Model Name,AUCPRC,F1,G-Mean,MCC,Precision,Recall,ROCAUC,ACCURACY,TP,FP,TN,FN
0,Sequential,0.8106,0.2275,0.9417,0.3395,0.1303,0.8958,0.963,0.9897,43.0,287.0,28038.0,5.0
1,SelfPacedEnsembleClassifier,0.7994,0.0513,0.9296,0.1502,0.0264,0.9167,0.9632,0.9427,44.0,1623.0,26702.0,4.0
