In [24]:
import pickle

# imoprt data science libraries
import pandas as pd
from pandas import DataFrame as df
import matplotlib.pyplot as plt
import numpy as np

# Import ML libraries
import keras
import model_utils as mutils
from model_utils.evaluation import evaluate_model, get_metrics
from sklearn.model_selection import train_test_split

# For SelfPacedEnsemble
from imbens.ensemble import SelfPacedEnsembleClassifier as SPE
from scikeras.wrappers import KerasClassifier

current_k_fold=1
# set seed
SEED=current_k_fold**3
np.random.seed(SEED)

# pandas options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# deserialize pre-processed data
path_to_pickle = f'../data/kddcup/kdd_preprocessed_k{current_k_fold}.pkl'

with open(path_to_pickle, "rb") as f:
    data = pickle.load(f)
    X_train = data["X_train"].to_numpy()
    y_train = data["y_train"].to_numpy()

    X_val = data["X_val"].to_numpy()
    y_val = data["y_val"].to_numpy()

    X_test = data["X_test"].to_numpy()
    y_test = data["y_test"].to_numpy()

    col_names = data["col_names"]

print("Data loaded successfully")

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(data["y_train"].value_counts())
IR_val = mutils.imb_ratio(data["y_val"].value_counts())
IR_test = mutils.imb_ratio(data["y_test"].value_counts())

print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

value_counts = df(y_train).value_counts()

print("New Imbalance ratio:", mutils.imb_ratio(value_counts))

# Reshape
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

# Set Weight
class_weight = {label[0]: 1.0 / count for label, count in value_counts.items()}
print("Weights: ", class_weight)

data_dim = np.shape(X_train)[1]

Data loaded successfully
Imbalance ratio in training data: 4.13
Imbalance ratio in validation data: 4.13
Imbalance ratio in test data: 4.13

Number of samples in training data: 248823
Number of samples in validation data: 31103
Number of samples in test data: 31103
New Imbalance ratio: 4.13
Weights:  {0: 4.991290198603437e-06, 1: 2.062961587655238e-05}


# Test same architecture as for CC Dataset

Model Architecture from https://keras.io/examples/structured_data/imbalanced_classification/

In [34]:
"""
## Credits for Model: https://keras.io/examples/structured_data/imbalanced_classification/
"""

# Hyperparams:
data_dim = np.shape(X_train)[1]
batch_size=2048
epochs=50
learning_rate=1e-2
Adam = keras.optimizers.Adam(learning_rate)

model = keras.Sequential(
    [
        keras.Input(shape=(data_dim,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

metrics = [
    keras.metrics.AUC(name="roc_auc", curve='ROC'),
    keras.metrics.AUC(name='auc_prc', curve="PR"),
	keras.metrics.F1Score(name="f1"),
	
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=Adam, loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("./ckp/MLP_Normal_CC/epoch_{epoch}.keras")]


hist = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    class_weight=class_weight,
)

In [38]:
metric_history = hist.history["val_auc_prc"]
best_epoch = metric_history.index(max(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))

model.load_weights(f'./ckp/MLP_Normal_KDD/epoch_{best_epoch}.keras')
evaluate_model(model, X_test, y_test, as_table=True)

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Sequential,0.9844,0.9961,0.9173,0.9772,0.8999,0.9649,6047.0,1078.0,23965.0,13.0,0.848702,0.997855


# Hyperparameter Tuning

In [25]:
import keras_tuner
from keras import layers

def build_model(hp):
    """Generate the model for the keras tuner using hyperparameters.

    Args:
        hp (keras_tuner.HyperParameters): Hyperparameter object to define ranges.

    Returns:
        keras.models.Sequential: The configured Sequential model.
    """
    model = keras.models.Sequential()

    # Input Layer
    model.add(layers.Input(shape=(data_dim,)))
    model.add(
        layers.Dense(
            units=hp.Int("input_units", min_value=64, max_value=512, step=64),
            activation=hp.Choice("input_activation", ["relu", "tanh"]), name='input_layer'
        )
    )
    if hp.Boolean("dropout_input"):
        model.add(layers.Dropout(rate=hp.Float("dropout_input_rate", min_value=0.1, max_value=0.5, step=0.1, name='dropout_input')))

    # Optional Hidden Layer 1
    if hp.Boolean("h1"):
        model.add(
            layers.Dense(
                units=hp.Int("h1_units", min_value=64, max_value=512, step=64),
                activation=hp.Choice("h1_activation", ["relu", "tanh"]), name='h1'
            )
        )
        if hp.Boolean("dropout_h1"):
            model.add(layers.Dropout(rate=hp.Float("dropout_h1_rate", min_value=0.1, max_value=0.5, step=0.1), name="dropout_h1"))

    # Optional Hidden Layer 2
    if hp.Boolean("h2"):
        model.add(
            layers.Dense(
                units=hp.Int("h2_units", min_value=64, max_value=512, step=64),
                activation=hp.Choice("h2_activation", ["relu", "tanh"]), name="h2"
            )
        )
        if hp.Boolean("dropout_h2"):
            model.add(layers.Dropout(rate=hp.Float("dropout_h2_rate", min_value=0.1, max_value=0.5, step=0.1), name="dropout_h2"))

    # Output Layer for binary classification
    model.add(layers.Dense(1, activation="sigmoid", name="output"))

    # Learning rate selection
    learning_rate = hp.Float("lr", min_value=0.01, max_value=0.05, step=0.01)

    metrics = [
        keras.metrics.AUC(name="roc_auc", curve='ROC'),
        keras.metrics.AUC(name='auc_prc', curve="PR"),
        keras.metrics.F1Score(name="f1"),

        keras.metrics.FalseNegatives(name="fn"),
        keras.metrics.FalsePositives(name="fp"),
	]

    model.compile(
		optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
		loss=keras.losses.binary_crossentropy,
		metrics=metrics
	)

    return model

## Define & Start Search

In [None]:
random_tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective=keras_tuner.Objective("val_auc_prc", direction="max"),
    # max_trials=3,
    # executions_per_trial=2,
    overwrite=True,
    directory="./keras_tuner_results/",
    project_name="MLP_Normal_KDD"
)

# print summary of search space
random_tuner.search_space_summary()

In [None]:
random_tuner.search(
    X_train,
    y_train,
    epochs=2,
    validation_data=(X_val, y_val),
    class_weight=class_weight,
)

In [None]:
best_hps = random_tuner.get_best_hyperparameters(1)
best_hps[0].values

In [26]:
saved_hp = {'input_units': 256,
 'input_activation': 'tanh',
 'dropout_input': False,
 'h1': False,
 'h2': True,
 'lr': 0.01,
 'dropout_input_rate': 0.4,
 'h1_units': 384,
 'h1_activation': 'tanh',
 'dropout_h1': False,
 'h2_units': 192,
 'h2_activation': 'relu',
 'dropout_h2': True,
 'dropout_h1_rate': 0.30000000000000004,
 'dropout_h2_rate': 0.1}

In [29]:
if saved_hp is not None:
	 # Initialize HyperParameters from a dictionary
	hp = keras_tuner.HyperParameters()
	for key, value in saved_hp.items():
		hp.Fixed(key, value)

	model = build_model(hp=hp)
else:
	model = build_model(best_hps[0])

rs_history = model.fit(
    x=X_train,
    y=y_train,
    epochs=50,
	class_weight=class_weight,
    validation_data=(X_val, y_val),
    callbacks=[
        #keras.callbacks.ModelCheckpoint("./ckp/RS_Tuned_MLP_Normal_KDD/epoch_{epoch}.keras")
    ],
)


Epoch 1/50
[1m7776/7776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 1ms/step - auc_prc: 0.9718 - f1: 0.3282 - fn: 156.8328 - fp: 4868.3174 - loss: 6.8454e-07 - roc_auc: 0.9931 - val_auc_prc: 0.9800 - val_f1: 0.5111 - val_fn: 30.0000 - val_fp: 1085.0000 - val_loss: 0.0719 - val_roc_auc: 0.9950
Epoch 2/50
[1m7776/7776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/step - auc_prc: 0.9802 - f1: 0.4636 - fn: 69.1904 - fp: 4560.5127 - loss: 5.2600e-07 - roc_auc: 0.9951 - val_auc_prc: 0.9802 - val_f1: 0.5116 - val_fn: 10.0000 - val_fp: 1081.0000 - val_loss: 0.0744 - val_roc_auc: 0.9953
Epoch 3/50
[1m7776/7776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 2ms/step - auc_prc: 0.9810 - f1: 0.5253 - fn: 57.9068 - fp: 4489.1265 - loss: 5.0741e-07 - roc_auc: 0.9953 - val_auc_prc: 0.9807 - val_f1: 0.5915 - val_fn: 5.0000 - val_fp: 1101.0000 - val_loss: 0.0725 - val_roc_auc: 0.9952
Epoch 4/50
[1m7776/7776[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2ms/s

In [43]:
# getting best epoch 
rs_metric_history = rs_history.history["val_auc_prc"]
best_epoch = rs_metric_history.index(max(rs_metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(rs_metric_history))
# or 
rs_metric_history = rs_history.history["val_loss"]
best_epoch = rs_metric_history.index(min(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(rs_metric_history))


model.load_weights(f'./ckp/RS_Tuned_MLP_Normal_KDD/epoch_{best_epoch}.keras')
evaluate_model(model, X_test, y_test, as_table=True)

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Sequential,0.9848,0.9958,0.916,0.9772,0.8985,0.9643,6054.0,1105.0,23938.0,6.0,0.845649,0.99901


# Self Paced Ensemble for MLP

In [None]:
# wrap Keras MLP into a scikit-learn API-compatible Wrapper class with `scikeras.KerasClassifier`
mlp = KerasClassifier(model,
					  epochs=epochs, 
					  optimizer=Adam, 
					  batch_size=batch_size,
					  metrics=metrics,
					  loss="binary_crossentropy",
					  random_state=SEED,
					  class_weight=class_weight
					  )
# Initialize Instance. Note: Do not run mlp.fit(). This would retrain the model.
mlp.initialize(X_train, y_train)

# create SelfPacedEnsembleClassifier
mlp_spe = SPE(
	estimator=mlp,
	n_estimators=50, # number of mlp's in ensemble (default: 50)
	random_state=SEED)

# Fit SPE-boosted MLP
mlp_spe.fit(
	X_train,
	y_train,
	eval_datasets={"valid": (X_val, y_val)},
	)

path_to_spe_model = './saved_models/MLP/MLP_SPE_KDD.pkl'

# # read model 
# with open(path_to_spe_model, 'rb') as f:
#     mlp_spe = pickle.load(f)

In [47]:
evaluate_model([mlp_spe, mlp], X_test, y_test, as_table=True)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,SelfPacedEnsembleClassifier,0.9849,0.9962,0.9177,0.976,0.9001,0.9652,6024.0,1045.0,23998.0,36.0,0.852171,0.994059
1,KerasClassifier,0.9848,0.9958,0.916,0.9772,0.8985,0.9643,6054.0,1105.0,23938.0,6.0,0.845649,0.99901
