In [1]:
# Nikhil Patil
# CSEC 620
# Project 1
# This code was sourced from: https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers#numerical_columns
# --------------------------------------------
# This model is wildly inefficient. It is not recommended to run. The "New Model" is the best version to run.
# ----------------------------------------------
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from keras import layers

# --- Load the capture CSV and build a binary DDoS target ---------------------
file_path = 'Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv'
batch_size = 32
# Seed NumPy's global RNG; dataframe.sample() below is called without a
# random_state, so the shuffle order follows this seed.
np.random.seed(3)
dataframe = pd.read_csv(file_path)

# Strip surrounding whitespace from the column headers, then remove any "/"
# so the header strings can be used as layer/input names later on.
dataframe.columns = dataframe.columns.str.strip()
dataframe.columns = dataframe.columns.str.replace("/", "", regex=False)

# Keep a fitted encoder available under its original name, but do not write
# its output into 'target': that column is defined by the np.where() below,
# which previously overwrote the encoder's result immediately.
label_encoder = LabelEncoder()
label_encoder.fit(dataframe['Label'])

# Binary target: 1 for rows labelled 'DDoS', 0 for every other label.
dataframe['target'] = np.where(dataframe['Label'] == 'DDoS', 1, 0)
dataframe = dataframe.drop(columns=['Label'])

# Shuffle all rows once, then cut 80% / 10% / 10% by position.
# Positional iloc slices give the same partition as np.split() with these
# cut points, without routing a DataFrame through NumPy's split machinery
# (which emits a FutureWarning on recent pandas versions).
shuffled = dataframe.sample(frac=1)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))
training = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# Report the size of each split.
print(len(training), 'training examples')
print(len(validation), 'validation examples')
print(len(test), 'test examples')



def df_to_dataset(df, shuffle=True, batch_size=32):
    '''
    Name: df_to_dataset
    Function: converts pandas dataframe to a dataset to be used by tensorflow
    Parameters: df - dataframe containing a 'target' column (used as the label),
                shuffle - boolean to shuffle the dataset or not,
                batch_size - the batch size of the dataset
    Returns: a batched, prefetched tf.data.Dataset yielding
             (dict of feature name -> column of values, labels) pairs
    '''
    df = df.copy()  # work on a copy so pop() does not mutate the caller's frame
    labels = df.pop('target')

    # Take the row count from the DataFrame itself. The features are stored in
    # a dict below, and len() of that dict is the number of columns, which
    # would make the shuffle buffer far smaller than the dataset.
    num_rows = len(df)

    # Give every column a trailing axis of size 1 so each example is shape (1,).
    features = {key: value.to_numpy()[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        # A buffer covering every row gives a full shuffle; shuffle() rejects
        # a buffer size of 0, so clamp to 1 for an empty frame.
        ds = ds.shuffle(buffer_size=max(num_rows, 1))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds


# Build the training pipeline and pull exactly one batch from it so the
# feature dictionary and label tensor can be inspected interactively.
train_ds = df_to_dataset(training, shuffle=True, batch_size=batch_size)

((train_features, label_batch),) = train_ds.take(1)
# Uncomment to list every feature name present in the batch:
# print('Every feature:', list(train_features.keys()))

def get_normalization_layer(name, dataset):
    '''
    Name: get_normalization_layer
    Function: creates a Normalization layer and adapts it to one feature of the dataset
    Parameters: name - key of the feature inside each element's feature dictionary,
                dataset - dataset yielding (features, label) pairs
    Returns: the adapted Normalization layer
    '''
    # Keep only the requested feature from each (features, label) element.
    single_feature_ds = dataset.map(lambda features, _label: features[name])

    # axis=None: a single mean/variance pair is computed over all values.
    feature_normalizer = layers.Normalization(axis=None)
    feature_normalizer.adapt(single_feature_ds)
    return feature_normalizer


# One tf.data pipeline per split.
train_ds = df_to_dataset(training, batch_size=batch_size)
val_ds = df_to_dataset(validation, batch_size=batch_size)
test_ds = df_to_dataset(test, batch_size=batch_size)

all_inputs = {}         # feature name -> Keras Input tensor
encoded_features = []   # normalized tensors, in the order listed below

# Numerical features. These are the features that were listed as the best
# ones for detecting DDoS attacks.
selected_headers = (
    'Average Packet Size',
    'Flow Duration',
    'Flow IAT Std',
    'Bwd Packet Length Std',
)
for header in selected_headers:
    numeric_col = tf.keras.layers.Input(shape=(1,), name=header)
    all_inputs[header] = numeric_col
    # The normalizer is adapted on the training split only, then applied to
    # the symbolic input for this feature.
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_features.append(normalization_layer(numeric_col))


# Join the normalized features into a single tensor for the dense stack.
all_features = tf.keras.layers.concatenate(encoded_features)

# Build the model: three ReLU dense layers (64 -> 32 -> 16 units). The first
# two are each followed by 30% dropout to limit overfitting; the last is not.
x = all_features
for units, dropout_rate in ((64, 0.3), (32, 0.3), (16, None)):
    x = layers.Dense(units, activation='relu')(x)
    if dropout_rate is not None:
        x = layers.Dropout(dropout_rate)(x)

# Single sigmoid unit: binary classification output.
output = layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=all_inputs, outputs=output)

# Compile the model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Model summary
model.summary()

# Train for 150 epochs, validating after each one, then score the held-out
# test split and print the resulting metrics dictionary.
model.fit(train_ds, epochs=150, validation_data=val_ds)
result = model.evaluate(test_ds, return_dict=True)
print(result)


    





  return bound(*args, **kwds)


Training time: 51.527567625045776 seconds
Validation Accuracy: 1.0
Test Accuracy: 0.9998228128460687

Classification Report on Test Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9661
           1       1.00      1.00      1.00     12914

    accuracy                           1.00     22575
   macro avg       1.00      1.00      1.00     22575
weighted avg       1.00      1.00      1.00     22575

