<a href="https://colab.research.google.com/github/moaz9090/AI-based-threat-detection-system/blob/main/Moaz_GP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install keras-tuner --upgrade
!pip install heatmapz

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-tuner
  Downloading keras_tuner-1.3.5-py3-none-any.whl (176 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)


In [None]:
import numpy as np
import pandas as pd
import keras_tuner as kt
import matplotlib.pyplot as plt
from fastai.tabular.all import df_shrink
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras_tuner as kt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from heatmap import heatmap, corrplot

# Load data


In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read
# from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the NF-UNSW-NB15-v2 CSV from the mounted Drive folder into a DataFrame
dataset_csv = '/content/drive/MyDrive/Moaz_GP/New data/NF-UNSW-NB15-v2.csv'
data = pd.read_csv(dataset_csv)

In [None]:
# Preview the first rows to sanity-check the loaded columns
data.head()

In [None]:
# (rows, columns) of the raw dataset
data.shape

**Data visualization before preprocessing**

In [None]:
# Correlation plot of the raw features.
# Only numeric columns are passed on: at this point the frame still contains
# non-numeric columns, and DataFrame.corr() raises on those in pandas >= 2.0
# (older versions silently dropped them, which is the behaviour kept here).
plt.figure(figsize=(15, 15))
corrplot(data.select_dtypes(include='number').corr(), size_scale=800)

# Preprocessing

In [None]:
# Column dtypes before any cleaning
data.dtypes

In [None]:
# Class balance of the 'Label' column (the split cell treats 0 as benign, 1 as attack)
data.Label.value_counts()

In [None]:
# Number of rows per value of the 'Attack' column
data.Attack.value_counts()

In [None]:
# Missing values per column
data.isna().sum()

In [None]:
# Total number of missing values in the whole frame
data.isna().sum().sum()

In [None]:
# Remove the source/destination IP address columns; they are not used as model features
data = data.drop(columns=['IPV4_SRC_ADDR', 'IPV4_DST_ADDR'])

In [None]:
# fastai helper; presumably downcasts columns to smaller dtypes to reduce memory
# (obj2cat=False / int2uint=False switch off the object->category and
# int->unsigned conversions) -- TODO confirm against the fastai documentation.
data = df_shrink(data, obj2cat=False, int2uint=False)

In [None]:
# Column dtypes after df_shrink, for comparison with the earlier listing
data.dtypes

In [None]:
# Treat +/-inf as missing, report how many rows are affected, then drop them
data = data.replace([np.inf, -np.inf], np.nan)
rows_with_nan = data.isna().any(axis=1).sum()
print(rows_with_nan, "rows with at least one NaN to remove")
data = data.dropna()

In [None]:
# Report and remove rows that are exact duplicates, then renumber the index
duplicate_count = data.duplicated().sum()
print(duplicate_count, "fully duplicate rows to remove")
data = data.drop_duplicates().reset_index(drop=True)
data.shape

**Data visualization after preprocessing**

In [None]:
# Correlation plot of the cleaned features.
# Only numeric columns are passed on: DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently dropped them, which is the
# behaviour kept here).
plt.figure(figsize=(15, 15))
corrplot(data.select_dtypes(include='number').corr(), size_scale=800)

In [None]:
# Columns that are not necessary for training
columns_to_remove = ['L4_SRC_PORT', 'L4_DST_PORT', 'Label', 'Attack']

# Fixed seed so the train / validation / test partitions are reproducible
SPLIT_SEED = 42

# Train test split (labels are still attached to both parts at this point)
train_df, test_df = train_test_split(data, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
# np.uint0 was removed in NumPy 2.0; uint8 is enough for 0/1 labels
y_test = test_df['Label'].to_numpy(dtype=np.uint8)

# Boolean masks of benign and attack traffic in train data
train_benign_idx = train_df['Label'] == 0
train_attack_idx = train_df['Label'] == 1

# Drop unnecessary columns (not in place: the split results may be views of `data`)
train_df = train_df.drop(columns=columns_to_remove)
test_df = test_df.drop(columns=columns_to_remove)

# Cast to numpy array
train_normal = train_df[train_benign_idx].values
train_attack = train_df[train_attack_idx].values

# Scaling: the scaler is fitted on benign training rows ONLY and then re-used
# unchanged for attack and test rows. Re-fitting on the test set would scale it
# with different min/max values than the data the autoencoder is trained on.
scaler = MinMaxScaler()
train = scaler.fit_transform(train_normal)
train_attack = scaler.transform(train_attack)
test = scaler.transform(test_df.values)

# Define a validation set (benign traffic only, taken from the scaled training rows)
train, validation = train_test_split(train, test_size=0.2, random_state=SPLIT_SEED)

print(f'Shape train data: {train.shape}')
print(f'Shape validation data: {validation.shape}')
print(f'Shape test data: {test.shape}')

In [None]:
def test_model(model, threshold_quantile, validation_benign, validation_attack, test, y_test, mae=True):
    
    # Evaluate the losses of the reconstructions of the validation set with benign traffic
    val_losses = None
    if mae:
        # MAE loss
        val_losses = np.mean(abs(validation_benign - model.predict(validation_benign)), axis=1)
    else:
        #MSE loss
        val_losses = np.mean((validation_benign - model.predict(validation_benign))**2, axis=1)
        
    val_losses = pd.DataFrame({'benign' : val_losses})
        
    print('Statistics benign reconstruction losses:')
    print('-'*20)
    print(val_losses.describe())
    
    
    # Evaluate the losses of the reconstructions of the validation set with attack traffic
    attack_losses = None
    if mae:
        # MAE loss
        attack_losses = np.mean(abs(validation_attack - model.predict(validation_attack)), axis=1)
    else:
        # MSE loss
        attack_losses = np.mean((validation_attack - model.predict(validation_attack))**2, axis=1)

    attack_losses = pd.DataFrame({'attack' : attack_losses})
    
    print()
    print('Statistics attack reconstruction losses:')
    print('-'*20)
    print(attack_losses.describe())
    
    
    # Define the threshold based on the supplied quantile
    threshold = np.quantile(val_losses, 0.99)

    test_losses = None
    recons = model.predict(test)
    if mae:
        # MAE loss
        test_losses = np.mean(abs(test - recons), axis=1)
    else:
        # MSE loss
        test_losses = np.mean((test - recons)**2, axis=1)
        
    preds = np.array(test_losses > threshold, dtype=np.uint0)
    
    
    print(f'ACCURACY:\n\t{accuracy_score(preds, y_test)}')
    print(f'PRECISION:\n\t{precision_score(preds, y_test)}')
    print(f'RECALL:\n\t{recall_score(preds, y_test)}')
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    print(f'True Positives: {tp}')
    print(f'False Positives: {fp}')
    print(f'True Negatives: {tn}')
    print(f'False Negatives: {fn}')

In [None]:
def _build_simple_autoencoder(n_features):
    """Return an uncompiled dense autoencoder 32-16-8-4-8-16-32 (ReLU) with a
    sigmoid output layer of width ``n_features``."""
    autoencoder = tf.keras.Sequential()
    for units in (32, 16, 8, 4, 8, 16, 32):
        autoencoder.add(tf.keras.layers.Dense(units, activation='relu'))
    autoencoder.add(tf.keras.layers.Dense(n_features, activation='sigmoid'))
    return autoencoder


# Stop training once the validation loss has not improved for 3 epochs
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Train one fresh baseline model per training loss and evaluate each with both
# reconstruction metrics and both threshold quantiles. After the loop `model`
# is the MSE-trained network, as before.
for training_loss in ('mae', 'mse'):
    # Output width follows the data instead of a hard-coded 39
    model = _build_simple_autoencoder(train.shape[1])
    model.compile(optimizer='adam', loss=training_loss)
    model.fit(train, train, batch_size=128, epochs=50, validation_split=0.1, shuffle=True, callbacks=[es])

    print(f"TRAINED WITH LOSS '{training_loss.upper()}':")
    print("="*20)
    for use_mae in (True, False):
        metric_name = 'MAE' if use_mae else 'MSE'
        for quantile in (0.95, 0.98):
            print(f"\tEVALUATE WITH {metric_name} & QUANTILE {quantile}:")
            test_model(model, quantile, validation, train_attack, test, y_test, mae=use_mae)

# Hyperparameter tuning for the simple autoencoder

---



In [None]:
def build_model(hp):
    """Build and compile the fixed-shape autoencoder searched by KerasTuner.

    The layer widths are fixed (32-16-8-4-8-16-32, then a 39-unit sigmoid
    output). For every hidden layer two hyperparameters are drawn from ``hp``:
    the activation function and the strength of an L2 activity regularizer.
    The model is compiled with the Adam optimizer and MAE loss.
    """
    activation_options = ('relu', 'tanh', 'selu', 'elu')
    l2_options = (1e-2, 1e-3, 1e-4, 1e-5, 1e-6)

    # (units, activation hyperparameter name, regularizer hyperparameter name)
    hidden_spec = (
        (32, 'encoder_layer_1_activation', 'regularizer_1'),
        (16, 'encoder_layer_2_activation', 'regularizer_2'),
        (8, 'encoder_layer_3_activation', 'regularizer_3'),
        (4, 'latent_dim_activation', 'regularizer_latent_dim'),
        (8, 'decoder_layer_1_activation', 'regularizer_4'),
        (16, 'decoder_layer_2_activation', 'regularizer_5'),
        (32, 'decoder_layer_3_activation', 'regularizer_6'),
    )

    autoencoder = tf.keras.Sequential()
    for units, activation_name, regularizer_name in hidden_spec:
        # Draw the activation first, then the regularizer, layer by layer
        chosen_activation = hp.Choice(activation_name, list(activation_options))
        l2_strength = hp.Choice(regularizer_name, list(l2_options))
        autoencoder.add(tf.keras.layers.Dense(
            units,
            activation=chosen_activation,
            activity_regularizer=tf.keras.regularizers.l2(l2_strength)
        ))
    autoencoder.add(tf.keras.layers.Dense(39, activation='sigmoid'))

    autoencoder.compile(optimizer='adam', loss='mae')
    return autoencoder




# Random search (10 trials) over the hyperparameters declared in build_model,
# ranked by validation loss; results are stored under results_tuning/autoencoder.
tuner1 = kt.RandomSearch(
    build_model,
    objective="val_loss",
    max_trials=10,
    directory='results_tuning',
    project_name='autoencoder'
)

# Use a subset of the training data: the first half of the rows
idx_tuner_data = len(train) // 2
tuner_data = train[:idx_tuner_data]

# The autoencoder reconstructs its own input, so inputs and targets are the same array
search_settings = dict(validation_split=0.1, batch_size=128, epochs=10)
tuner1.search(tuner_data, tuner_data, **search_settings)

# Keep the best model found and print a summary of the best trial
best_models = tuner1.get_best_models()
model_1 = best_models[0]
tuner1.results_summary(1)

# Train the best model on more data

In [None]:
# Train the best tuned architecture on the full training set, once per loss.
# After the loop `model_1` is the MSE-trained network, as before.
for training_loss in ('mae', 'mse'):
    # Reload from the tuner's best checkpoint for EVERY run, so both runs start
    # from the same weights and re-executing this cell gives the same result
    # (previously the MAE run continued from whatever model_1 currently held).
    model_1 = tuner1.get_best_models()[0]
    model_1.compile(optimizer='adam', loss=training_loss)
    model_1.fit(train, train, epochs=50, batch_size=128, validation_split=0.1, shuffle=True, callbacks=[es])

    print(f"TRAINED WITH LOSS '{training_loss.upper()}':")
    print("="*20)
    for use_mae in (True, False):
        metric_name = 'MAE' if use_mae else 'MSE'
        for quantile in (0.95, 0.98):
            print(f"\tEVALUATE WITH {metric_name} & QUANTILE {quantile}:")
            test_model(model_1, quantile, validation, train_attack, test, y_test, mae=use_mae)

# Search for a more complex architecture

In [None]:
def build_model(hp):
    """Build and compile a variable-depth, mirrored autoencoder for KerasTuner.

    Hyperparameters drawn from ``hp``:
      * ``encoder_layers``: number of encoder layers (1-6);
      * per encoder layer: width, activation and L1 activity-regularizer strength;
      * ``latent_dimension``: bottleneck width (4-15), its activation and regularizer;
      * per decoder layer: activation and regularizer (widths mirror the encoder
        in reverse order);
      * ``dropout``: one boolean shared by the whole network; when set, a Dropout
        layer with its own rate follows every hidden layer.

    The output layer has 39 sigmoid units. The model is compiled with the Adam
    optimizer and MAE loss.
    """
    model = tf.keras.Sequential()
    hidden_layers = []  # encoder widths, kept innermost-first for the decoder

    # Encoder
    for i in range(hp.Int('encoder_layers', min_value=1, max_value=6, step=1)):
        n_neurons = hp.Int(f'encoder_layer_{i}', min_value=16, max_value=39, step=2)
        model.add(tf.keras.layers.Dense(
                    units=n_neurons,
                    activation=hp.Choice(f'encoder_layer_{i}_activation', ['relu','tanh']),
                    activity_regularizer=tf.keras.regularizers.l1(hp.Choice(f'encoder_layer_{i}_regularizer', [1e-1, 1e-2, 1e-3, 1e-4]))
                )
        )
        if hp.Boolean("dropout"):
            model.add(tf.keras.layers.Dropout(rate=hp.Choice(f'encoder_layer_{i}_dropout', [0.25, 0.5])))

        # Prepend so the decoder later walks the widths in reverse order
        hidden_layers.insert(0, n_neurons)

    # Bottleneck
    model.add(
        tf.keras.layers.Dense(
            units=hp.Int('latent_dimension', min_value=4, max_value=15, step=1),
            activation=hp.Choice('latent_dimension_activation', ['relu','tanh']),
            activity_regularizer=tf.keras.regularizers.l1(hp.Choice('latent_dimension_regularizer', [1e-1, 1e-2, 1e-3, 1e-4]))
        )
    )
    if hp.Boolean("dropout"):
        model.add(tf.keras.layers.Dropout(rate=hp.Choice('latent_dimension_dropout', [0.25, 0.5])))

    # Decoder
    for decoder_layer, neurons in enumerate(hidden_layers):
        model.add(
            tf.keras.layers.Dense(
                neurons,
                activation=hp.Choice(f'decoder_layer_{decoder_layer}_activation', ['relu','tanh']),
                activity_regularizer=tf.keras.regularizers.l1(hp.Choice(f'decoder_layer_{decoder_layer}_regularizer', [1e-1, 1e-2, 1e-3, 1e-4]))
            )
        )
        if hp.Boolean("dropout"):
            # Name the rate after the decoder index. Using the stale encoder
            # loop variable `i` here made every decoder layer share a single
            # dropout hyperparameter.
            model.add(tf.keras.layers.Dropout(rate=hp.Choice(f'decoder_layer_{decoder_layer}_dropout', [0.25, 0.5])))

    model.add(tf.keras.layers.Dense(39, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='mae'
    )

    return model




# Random search (10 trials) over the variable-depth search space.
# The project name must differ from the first tuner's ('autoencoder'): with the
# default overwrite=False, KerasTuner reloads an existing project of the same
# directory/name, so this search would pick up the first tuner's finished
# trials instead of exploring the new space.
tuner2 = kt.RandomSearch(
    build_model,
    objective="val_loss",
    max_trials=10,
    directory='results_tuning',
    project_name='autoencoder_complex'
)

# Use the first half of the training rows to keep the search fast
idx_tuner_data = int(len(train)*0.5)
tuner_data = train[:idx_tuner_data]

# The autoencoder reconstructs its own input, so inputs and targets are the same array
tuner2.search(
    tuner_data, tuner_data,
    validation_split=0.1,
    batch_size=128,
    epochs=10
)

# Keep the best model found and print a summary of the best trial
model_2 = tuner2.get_best_models()[0]
tuner2.results_summary(1)

In [None]:
# Reconstruct the scaled benign training rows with the tuned autoencoder and
# display the resulting array
prediction = model_1.predict(train)
prediction

In [None]:
# Save the tuned autoencoder as 'NIDS_model.h5' in the current working directory
model_1.save('NIDS_model.h5')

NameError: ignored

In [None]:
from tensorflow.keras.models import load_model

# Load the saved model from the path this notebook actually writes
# ('NIDS_model.h5' in the working directory). The previous Drive path pointed
# to a '.hdf5' file that no cell in the notebook creates.
loaded_model = load_model('NIDS_model.h5')