In [1]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import tensorflow as tf
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


from SupervisedAD_methods import *
from kdd import *

# Data Preprocessing

In [2]:
df = get_df('data/KDDTrain+.txt', columns=columns, drop=False)
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20
125969,8,udp,private,SF,105,145,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20


In [3]:
test_df = get_df('data/KDDTest+.txt', columns=columns, drop=False)
test_df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,level
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune,21
1,0,tcp,private,REJ,0,0,0,0,0,0,...,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune,21
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal,21
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint,15
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22539,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22540,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22541,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back,15
22542,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal,21


In [4]:
#  https://www.kaggle.com/code/avk256/nsl-kdd-anomaly-detection/notebook

# map normal to 0, all attacks to 1
is_attack = df.attack.map(lambda a: 0 if a == 'normal' else 1)
test_attack = test_df.attack.map(lambda a: 0 if a == 'normal' else 1)

#data_with_attack = df.join(is_attack, rsuffix='_flag')
df['attack_flag'] = is_attack
test_df['attack_flag'] = test_attack

# map normal to 1, all attacks to 0
is_normal = df.attack.map(lambda a: 1 if a == 'normal' else 0)
test_normal = test_df.attack.map(lambda a: 1 if a == 'normal' else 0)

df['normal_flag'] = is_normal
test_df['normal_flag'] = test_normal

# map the data and join to the data set
attack_map = df.attack.apply(map_attack)
df['attack_map'] = attack_map

test_attack_map = test_df.attack.apply(map_attack)
test_df['attack_map'] = test_attack_map

# categorical features
features_to_encode = ['protocol_type', 'service', 'flag']

# get numeric features, we won't worry about encoding these at this point
# numeric_features = ['duration', 'src_bytes', 'dst_bytes']
# Use all features
numeric_features = list(set(df.columns[:-5]) - set(features_to_encode))


def feat_eng(df, test_df, features_to_encode=features_to_encode, numeric_features=numeric_features):
#     https://www.kaggle.com/code/avk256/nsl-kdd-anomaly-detection/notebook

    # get the intial set of encoded features and encode them
    encoded = pd.get_dummies(df[features_to_encode])
    test_encoded_base = pd.get_dummies(test_df[features_to_encode])

    # not all of the features are in the test set, so we need to account for diffs
    test_index = np.arange(len(test_df.index))
    column_diffs = list(set(encoded.columns.values)-set(test_encoded_base.columns.values))

    diff_df = pd.DataFrame(0, index=test_index, columns=column_diffs)

    # we'll also need to reorder the columns to match, so let's get those
    column_order = encoded.columns.to_list()

    # append the new columns
    test_encoded_temp = test_encoded_base.join(diff_df)

    # reorder the columns
    test_final = test_encoded_temp[column_order].fillna(0)

    # model to fit/test
    to_fit = encoded.join(df[numeric_features])
    test_set = test_final.join(test_df[numeric_features])
    
    return to_fit, test_set

In [5]:
data_train, data_test = feat_eng(df, test_df)
data_train

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,dst_host_rerror_rate,is_guest_login,srv_serror_rate,srv_rerror_rate,diff_srv_rate,count,duration,num_file_creations,dst_bytes,num_root
0,0,1,0,0,0,0,0,0,0,0,...,0.05,0,0.0,0.0,0.00,2,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0,...,0.00,0,0.0,0.0,0.15,13,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0.00,0,1.0,0.0,0.07,123,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0.00,0,0.2,0.0,0.00,5,0,0,8153,0
4,0,1,0,0,0,0,0,0,0,0,...,0.00,0,0.0,0.0,0.00,30,0,0,420,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,1,0,0,0,0,0,0,0,0,...,0.00,0,1.0,0.0,0.06,184,0,0,0,0
125969,0,0,1,0,0,0,0,0,0,0,...,0.00,0,0.0,0.0,0.00,2,8,0,145,0
125970,0,1,0,0,0,0,0,0,0,0,...,0.01,0,0.0,0.0,0.00,1,0,0,384,0
125971,0,1,0,0,0,0,0,0,0,0,...,0.00,0,1.0,0.0,0.05,144,0,0,0,0


In [6]:
df['attack_map'].value_counts()

0    67352
1    45927
2    11656
4      995
3       43
Name: attack_map, dtype: int64

In [7]:
test_df['attack_map'].value_counts()

0    9855
1    7460
4    2743
2    2421
3      65
Name: attack_map, dtype: int64

In [8]:
scaler = StandardScaler()

new_attacks = [1,2,3,4]
test_classes = [0,1,2,3,4]


def get_x_y(df, data, classes=[0,1]):

    indices = df['attack_map'].isin(classes)
    x = data[indices]
    y = df['normal_flag'][indices]
    
    return x.to_numpy(), y.to_numpy()


x_train, y = get_x_y(df, data_train)
X = scaler.fit_transform(x_train)

np.random.seed(0)
np.random.shuffle(X)
np.random.seed(0)
np.random.shuffle(y)

x_normal = X[y==1]

# x_testing, y_test = get_x_y(test_df, data_test, classes=test_classes)
# x_test = scaler.transform(x_testing)

In [9]:
strategy = tf.distribute.MirroredStrategy()
num_inputs = X.shape[-1]

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


# Modelling: Pretrain AE

In [10]:
# Build Models

def build_seq_layer(activation, sigma=0.5, train=False, layer_number=1,
                seed=0, neurons=5, batchnorm=True, regulariser=None):
    
    layer = []
    
    initialiser = tf.keras.initializers.GlorotUniform(seed=seed)
    
    if activation == "r":
        layer.append(RBFLayer(neurons, gamma=1.0, initializer=initialiser))
        
        if batchnorm:
            layer.append(tf.keras.layers.BatchNormalization())
            
    else:
        layer.append(tf.keras.layers.Dense(neurons,
                      kernel_initializer=initialiser, kernel_regularizer=regulariser))
        
        if batchnorm:
            layer.append(tf.keras.layers.BatchNormalization())
            
        if activation == "b":
            layer.append(Bump(sigma=sigma, trainable=train,
                              name=f"bump{layer_number+1}"))
        elif activation == "s":
            layer.append(tf.keras.layers.Activation(tf.keras.activations.sigmoid))
        else:
            layer.append(tf.keras.layers.LeakyReLU(alpha=0.01))
    
    return layer


class AE_Module(tf.keras.Model):
    
    def __init__(self, input_dim=122, hidden_dim=[60, 45], output_dim=30,
                 activation=tf.keras.layers.LeakyReLU(alpha=0.3),
                 regulariser=None, decoder=False, seed=0, **kwargs):
        
        super(AE_Module, self).__init__()
        
        layers = tf.keras.Sequential([tf.keras.Input(shape=(input_dim,))])
        
        for i, neurons in enumerate(hidden_dim):
            
            if type(activation) is str:
                layer = build_seq_layer(activation, sigma=0.5,
                                        train=False, layer_number=i,
                                        seed=i*10+seed, neurons=neurons,
                                        batchnorm=True, regulariser=regulariser)
                for l in layer:
                    layers.add(l)
                    
            else:
            
                initialiser = tf.keras.initializers.GlorotUniform(seed)
            
                layers.add(
                    tf.keras.layers.Dense(neurons,
                                          kernel_initializer=initialiser,
                                          kernel_regularizer=regulariser))
                layers.add(tf.keras.layers.BatchNormalization())

                layers.add(activation)
                
        self.hidden_layers = layers
        
        initialiser = tf.keras.initializers.GlorotUniform(seed=2023+seed)
        self.last_layer = tf.keras.layers.Dense(output_dim, use_bias=decoder,
                                  kernel_initializer=initialiser,
                                  kernel_regularizer=regulariser)
        
    def call(self, x):
        
#         for layer in self.hidden_layers:
#             x = layer(x)
            
        x = self.hidden_layers(x)
            
        return self.last_layer(x)
    

    
class AE(tf.keras.Model):
    def __init__(self, input_dim=122, hidden_dim=[60, 45], latent_dim=30,
                 activation=tf.keras.layers.LeakyReLU(alpha=0.3),
                 regulariser=None, seed=0, **kwargs):
        super(AE, self).__init__()
        self.encoder = AE_Module(input_dim, hidden_dim, latent_dim,
                                 activation, regulariser, decoder=False, seed=seed)
        self.decoder = AE_Module(latent_dim, list(reversed(hidden_dim)), input_dim,
                                 activation, regulariser, decoder=True, seed=seed)

    def call(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [11]:
# sanity check
lr = 3e-4
epochs = 50
verbose = 1      # can change this to 0 to suppress verbosity during training-
shuffle = False
val_split = 0.1
repeats = 2
epochs = 50

early_stopping = tf.keras.callbacks.EarlyStopping(patience=3, monitor='val_loss',
                                                  restore_best_weights=True)

callbacks = [early_stopping]

hidden_dim = []
latent_dim = 122

with strategy.scope():
    # change activation to desired activation
    ae = AE(input_dim=num_inputs, hidden_dim=hidden_dim, latent_dim=latent_dim,
                     activation="b",
                     regulariser=None, seed=0)
    
    ae.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=lr))
    
#     ae.summary()
    
    hist = ae.fit(X, X, epochs=epochs, verbose=verbose, validation_split=val_split, shuffle=shuffle, callbacks=callbacks)

INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
Epoch 1/50
INFO:tensorflow:batch_all_reduce: 3 all-reduces with algorithm = nccl, num_packs = 1
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device

In [12]:
lr = 3e-4
verbose = 1      # can change this to 0 to suppress verbosity during training-
shuffle = False
val_split = 0.1
epochs = 500

early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, monitor='val_loss',
                                                  restore_best_weights=True)

callbacks = [early_stopping]

hidden_dim = [90]
latent_dim = 60

with strategy.scope():
#     ae = AE(input_dim=num_inputs, hidden_dim=hidden_dim, latent_dim=latent_dim,
#                      activation="b",
#                      regulariser=None, seed=0)
    
#     ae.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=lr))
    
#     ae.summary()
    
    hist = ae.fit(X, X, epochs=epochs, verbose=verbose, validation_split=val_split, shuffle=shuffle, callbacks=callbacks)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 80/500
Epoch 81/500
Epoch 82/500
Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500


In [13]:
ae.encoder.save("models/KDD99/Encoder_Bump_90_60")
ae.decoder.save("models/KDD99/Decoder_Bump_90_60")

INFO:tensorflow:Assets written to: models/KDD99/Encoder_Bump_90_60/assets
INFO:tensorflow:Assets written to: models/KDD99/Decoder_Bump_90_60/assets
