Model Architecture from https://keras.io/examples/structured_data/imbalanced_classification/

In [39]:
import pickle

# imoprt data science libraries
import pandas as pd
from pandas import DataFrame as df
import matplotlib.pyplot as plt
import numpy as np

# Import ML libraries
import keras
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model, table
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

current_k_fold = 9
# set seed
SEED=current_k_fold**3
np.random.seed(SEED)

# pandas options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# deserialize pre-processed data
path_to_pickle = f'../data/creditcard/cc13_preprocessed_k{current_k_fold}.pkl'

with open(path_to_pickle, "rb") as f:
    data = pickle.load(f)
    X_train = data["X_train"].to_numpy()
    y_train = data["y_train"].to_numpy()

    X_val = data["X_val"].to_numpy()
    y_val = data["y_val"].to_numpy()

    X_test = data["X_test"].to_numpy()
    y_test = data["y_test"].to_numpy()

    col_names = data["col_names"]

print("Data loaded successfully")

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(data["y_train"].value_counts())
IR_val = mutils.imb_ratio(data["y_val"].value_counts())
IR_test = mutils.imb_ratio(data["y_test"].value_counts())

# print imbalance ratios. They should be (nearly) the same. pct = 0.172 such as in the paper!
print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

"""
## Oversample minority class for training only
"""

ros = RandomOverSampler(random_state=SEED, sampling_strategy=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

res_value_counts = df(y_train).value_counts()

print("New Imbalance ratio:", mutils.imb_ratio(res_value_counts))

# Reshape
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

# Set Weight
weight_for_0 = 1.0 / res_value_counts[0]
weight_for_1 = 1.0 / res_value_counts[1]

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

# Normalize Data
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

"""
## Build a binary classification model. Credits: https://keras.io/examples/structured_data/imbalanced_classification/
"""
data_dim = np.shape(X_train)[1]
model = keras.Sequential(
    [
        keras.Input(shape=(data_dim,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

Data loaded successfully
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 602.68
Imbalance ratio in test data: 590.1

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373
New Imbalance ratio: 1.0

Number of samples in training data: 453204
Number of samples in validation data: 28373
Number of samples in test data: 28373


In [40]:
metrics = [
    keras.metrics.AUC(name="roc_auc", curve='ROC'),
    keras.metrics.AUC(name='auc_prc', curve="PR"),
	keras.metrics.F1Score(name="f1"),
	
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint(f"./ckp/MLP_RandOver_CC/k{current_k_fold}"+"_epoch_{epoch}.keras")]
class_weight = {0: weight_for_0, 1: weight_for_1}
print("Weights: ", class_weight)

# TODO: check if you need callback! not that we overwrite good ckp's.
hist = model.fit(
    X_train,
    y_train,
    batch_size=2048,
    epochs=50,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    class_weight=class_weight,
)

Weights:  {0: 4.413023715589447e-06, 1: 4.413023715589447e-06}
Epoch 1/50
222/222 - 4s - 16ms/step - auc_prc: 0.9745 - f1: 0.6667 - fn: 28152.0000 - fp: 11212.0000 - loss: 9.8281e-07 - precision: 0.9465 - recall: 0.8758 - roc_auc: 0.9696 - tn: 215390.0000 - tp: 198450.0000 - val_auc_prc: 0.6050 - val_f1: 0.0033 - val_fn: 4.0000 - val_fp: 446.0000 - val_loss: 0.0700 - val_precision: 0.0879 - val_recall: 0.9149 - val_roc_auc: 0.9829 - val_tn: 27880.0000 - val_tp: 43.0000
Epoch 2/50
222/222 - 2s - 11ms/step - auc_prc: 0.9961 - f1: 0.6667 - fn: 9283.0000 - fp: 4328.0000 - loss: 3.5970e-07 - precision: 0.9805 - recall: 0.9590 - roc_auc: 0.9960 - tn: 222274.0000 - tp: 217319.0000 - val_auc_prc: 0.6306 - val_f1: 0.0033 - val_fn: 4.0000 - val_fp: 383.0000 - val_loss: 0.0493 - val_precision: 0.1009 - val_recall: 0.9149 - val_roc_auc: 0.9805 - val_tn: 27943.0000 - val_tp: 43.0000
Epoch 3/50
222/222 - 2s - 10ms/step - auc_prc: 0.9990 - f1: 0.6677 - fn: 2254.0000 - fp: 2415.0000 - loss: 1.5507e-07

In [41]:
# getting best epoch 
metric_history = hist.history["val_auc_prc"]
best_epoch = metric_history.index(max(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))

# best_epoch=50
print("current fold: ", current_k_fold)
model.load_weights(f'./ckp/MLP_RandOver_CC/k{current_k_fold}_epoch_{best_epoch}.keras')
evaluate_model(model, X_test, y_test, as_table=True)

Best epoch:  18 Best Value:  0.8537222743034363
current fold:  9


Unnamed: 0,Model Name,AUCPRC,F1,G-Mean,MCC,Precision,Recall,ROCAUC,ACCURACY,TP,FP,TN,FN
0,Sequential,0.7484,0.7556,0.8415,0.7569,0.8095,0.7083,0.9449,0.9992,34.0,8.0,28317.0,14.0


In [8]:
# getting best epoch 
metric_history = hist.history["val_auc_prc"]
best_epoch = metric_history.index(max(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))

# best_epoch=50
model.load_weights(f'./ckp/MLP_RandOver_CC/k{current_k_fold}_epoch_{best_epoch}.keras')
evaluate_model(model, X_test, y_test, as_table=True)

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Sequential,0.8998,0.9902,0.8842,0.9452,0.8841,0.9996,42.0,6.0,28320.0,5.0,0.875,0.893617
