In [12]:
import pickle

# imoprt data science libraries
import pandas as pd
from pandas import DataFrame as df
import matplotlib.pyplot as plt
import numpy as np

# Import ML libraries
import keras
import model_utils as mutils
from model_utils.evaluation import get_metrics, evaluate_model
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

current_k_fold = 10
SEED=current_k_fold**3
np.random.seed(SEED)

# pandas options
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# deserialize pre-processed data
path_to_pickle = f"../data/kddcup/KDD_preprocessed_k{current_k_fold}.pkl"

with open(path_to_pickle, "rb") as f:
    data = pickle.load(f)
    X_train = data["X_train"].to_numpy()
    y_train = data["y_train"].to_numpy()

    X_val = data["X_val"].to_numpy()
    y_val = data["y_val"].to_numpy()

    X_test = data["X_test"].to_numpy()
    y_test = data["y_test"].to_numpy()

    col_names = data["col_names"]

print("Data loaded successfully")

# get imbalance ratio for each data set
IR_train = mutils.imb_ratio(data["y_train"].value_counts())
IR_val = mutils.imb_ratio(data["y_val"].value_counts())
IR_test = mutils.imb_ratio(data["y_test"].value_counts())

print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# print number of samples in each data set
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

"""
## Oversample minority class for training only
"""

ros = RandomOverSampler(random_state=SEED, sampling_strategy=1)
X_train, y_train = ros.fit_resample(X_train, y_train)

res_value_counts = df(y_train).value_counts()

print("New Imbalance ratio:", mutils.imb_ratio(res_value_counts))

# Reshape
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)
y_val = y_val.reshape(-1, 1)

# Set Weight
class_weight = {label[0]: 1.0 / count for label, count in res_value_counts.items()}
print("Weights: ", class_weight)

data_dim = np.shape(X_train)[1]

Data loaded successfully
Imbalance ratio in training data: 4.13
Imbalance ratio in validation data: 4.13
Imbalance ratio in test data: 4.13

Number of samples in training data: 248823
Number of samples in validation data: 31103
Number of samples in test data: 31103
New Imbalance ratio: 1.0
Weights:  {0: 4.991290198603437e-06, 1: 4.991290198603437e-06}


# Test same architecture as for CC Dataset

Model Architecture from https://keras.io/examples/structured_data/imbalanced_classification/

In [2]:
"""
## Build a binary classification model. Credits: https://keras.io/examples/structured_data/imbalanced_classification/
"""
model = keras.Sequential(
    [
        keras.Input(shape=(data_dim,)),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(256, activation="relu"),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(1, activation="sigmoid"),
    ]
)

metrics = [
    keras.metrics.AUC(name="roc_auc", curve='ROC'),
    keras.metrics.AUC(name='auc_prc', curve="PR"),
	keras.metrics.F1Score(name="f1"),
	
    keras.metrics.FalseNegatives(name="fn"),
    keras.metrics.FalsePositives(name="fp"),
    keras.metrics.TrueNegatives(name="tn"),
    keras.metrics.TruePositives(name="tp"),
    keras.metrics.Precision(name="precision"),
    keras.metrics.Recall(name="recall"),
]

model.compile(
    optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics
)

callbacks = [keras.callbacks.ModelCheckpoint("./ckp/MLP_RandOver_KDD/epoch_{epoch}.keras")]


hist = model.fit(
    X_train,
    y_train,
    batch_size=2048,
    epochs=50,
    verbose=2,
    callbacks=callbacks,
    validation_data=(X_val, y_val),
    class_weight=class_weight,
)

In [3]:
# getting best epoch 
metric_history = hist.history["val_auc_prc"]
best_epoch = metric_history.index(max(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))
# or 
metric_history = hist.history["val_loss"]
best_epoch = metric_history.index(min(metric_history)) + 1
print("Best epoch: ", best_epoch, "Best Value: ", max(metric_history))

model.load_weights(f'./ckp/MLP_RandOver_KDD/epoch_{best_epoch}.keras')
evaluate_model(model, X_test, y_test, as_table=True)

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Sequential,0.9858,0.9964,0.9182,0.9774,0.901,0.9654,6046.0,1063.0,23980.0,14.0,0.850471,0.99769


# Hyperparameter Tuning

In [16]:
import keras_tuner
from keras import layers

def build_model(hp):
    """Generate the model for the keras tuner using hyperparameters.

    Args:
        hp (keras_tuner.HyperParameters): Hyperparameter object to define ranges.

    Returns:
        keras.models.Sequential: The configured Sequential model.
    """
    model = keras.models.Sequential()

    # Input Layer
    model.add(layers.Input(shape=(data_dim,)))
    model.add(
        layers.Dense(
            units=hp.Int("input_units", min_value=64, max_value=512, step=64),
            activation=hp.Choice("input_activation", ["relu", "tanh"]),
        )
    )
    if hp.Boolean("dropout_input"):
        model.add(layers.Dropout(rate=hp.Float("dropout_input_rate", min_value=0.1, max_value=0.5, step=0.1)))

    # Optional Hidden Layer 1
    if hp.Boolean("h1"):
        model.add(
            layers.Dense(
                units=hp.Int("h1_units", min_value=64, max_value=512, step=64),
                activation=hp.Choice("h1_activation", ["relu", "tanh"]),
            )
        )
        if hp.Boolean("dropout_h1"):
            model.add(layers.Dropout(rate=hp.Float("dropout_h1_rate", min_value=0.1, max_value=0.5, step=0.1)))

    # Optional Hidden Layer 2
    if hp.Boolean("h2"):
        model.add(
            layers.Dense(
                units=hp.Int("h2_units", min_value=64, max_value=512, step=64),
                activation=hp.Choice("h2_activation", ["relu", "tanh"]),
            )
        )
        if hp.Boolean("dropout_h2"):
            model.add(layers.Dropout(rate=hp.Float("dropout_h2_rate", min_value=0.1, max_value=0.5, step=0.1)))

    # Output Layer for binary classification
    model.add(layers.Dense(1, activation="sigmoid"))

    # Learning rate selection
    learning_rate = hp.Float("lr", min_value=0.01, max_value=0.05, step=0.01)

    metrics = [
        keras.metrics.AUC(name="roc_auc", curve='ROC'),
        keras.metrics.AUC(name='auc_prc', curve="PR"),
        keras.metrics.F1Score(name="f1"),

        keras.metrics.FalseNegatives(name="fn"),
        keras.metrics.FalsePositives(name="fp"),
	]

    model.compile(
		optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
		loss=keras.losses.binary_crossentropy,
		metrics=metrics
	)

    return model

## Define & Start Search

In [21]:
random_tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective=keras_tuner.Objective("val_auc_prc", direction="max"),
    # max_trials=3,
    # executions_per_trial=2,
    overwrite=True,
    directory="./keras_tuner_results/",
    project_name="MLP_RandOver_KDD"
)

# print summary of search space
random_tuner.search_space_summary()

Search space summary
Default search space size: 6
input_units (Int)
{'default': None, 'conditions': [], 'min_value': 64, 'max_value': 512, 'step': 64, 'sampling': 'linear'}
input_activation (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
dropout_input (Boolean)
{'default': False, 'conditions': []}
h1 (Boolean)
{'default': False, 'conditions': []}
h2 (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 0.01, 'conditions': [], 'min_value': 0.01, 'max_value': 0.05, 'step': 0.01, 'sampling': 'linear'}


In [22]:
random_tuner.search(X_train, y_train, epochs=2, validation_data=(X_val, y_val), class_weight=class_weight)

Trial 10 Complete [00h 01m 02s]
val_auc_prc: 0.9797707796096802

Best val_auc_prc So Far: 0.9836588501930237
Total elapsed time: 00h 08m 31s


In [23]:
best_hps = random_tuner.get_best_hyperparameters(1)

In [25]:
best_hps[0].values

{'input_units': 128,
 'input_activation': 'tanh',
 'dropout_input': True,
 'h1': True,
 'h2': True,
 'lr': 0.02,
 'dropout_input_rate': 0.2,
 'h1_units': 128,
 'h1_activation': 'relu',
 'dropout_h1': True,
 'dropout_h1_rate': 0.1,
 'h2_units': 512,
 'h2_activation': 'tanh',
 'dropout_h2': False}

In [14]:
saved_hp = {'input_units': 128,
 'input_activation': 'tanh',
 'dropout_input': True,
 'h1': True,
 'h2': True,
 'lr': 0.02,
 'dropout_input_rate': 0.2,
 'h1_units': 128,
 'h1_activation': 'relu',
 'dropout_h1': True,
 'dropout_h1_rate': 0.1,
 'h2_units': 512,
 'h2_activation': 'tanh',
 'dropout_h2': False}

In [17]:
if saved_hp is not None:
	 # Initialize HyperParameters from a dictionary
	hp = keras_tuner.HyperParameters()
	for key, value in saved_hp.items():
		hp.Fixed(key, value)

	rs_tuned_model = build_model(hp=hp)
else:
	rs_tuned_model = build_model(best_hps[0])

rs_history = rs_tuned_model.fit(
    x=X_train,
    y=y_train,
    epochs=10,
	class_weight=class_weight,
    validation_data=(X_val, y_val),
    callbacks=[
       # keras.callbacks.ModelCheckpoint("./ckp/RS_Tuned_MLP_RandOver_KDD/epoch_{epoch}.keras")
    ],
)

Epoch 1/10
[1m12522/12522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 2ms/step - auc_prc: 0.9929 - f1: 0.6667 - fn: 498.2765 - fp: 4719.5371 - loss: 3.9777e-07 - roc_auc: 0.9934 - val_auc_prc: 0.9801 - val_f1: 0.3261 - val_fn: 7.0000 - val_fp: 1151.0000 - val_loss: 0.0771 - val_roc_auc: 0.9949
Epoch 2/10
[1m12522/12522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - auc_prc: 0.9950 - f1: 0.6646 - fn: 230.4757 - fp: 4457.9546 - loss: 3.1728e-07 - roc_auc: 0.9953 - val_auc_prc: 0.9805 - val_f1: 0.3261 - val_fn: 7.0000 - val_fp: 1097.0000 - val_loss: 0.0714 - val_roc_auc: 0.9951
Epoch 3/10
[1m12522/12522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - auc_prc: 0.9951 - f1: 0.6665 - fn: 188.5979 - fp: 4350.4302 - loss: 3.0691e-07 - roc_auc: 0.9954 - val_auc_prc: 0.9804 - val_f1: 0.3261 - val_fn: 15.0000 - val_fp: 1078.0000 - val_loss: 0.0726 - val_roc_auc: 0.9953
Epoch 4/10
[1m12522/12522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s

In [15]:
rs_tuned_model.load_weights(f'./ckp/RS_Tuned_MLP_RandOver_KDD/epoch_{best_epoch}.keras')
evaluate_model(rs_tuned_model, X_test, y_test, as_table=True)

Unnamed: 0,Model Name,AUCPRC,ROCAUC,F1,G-Mean,MCC,ACCURACY,TP,FP,TN,FN,Precision,Recall
0,Sequential,0.9844,0.9956,0.917,0.9774,0.8997,0.9648,6052.0,1087.0,23956.0,8.0,0.847738,0.99868
