In [368]:
# # # Import libraries
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.losses import MeanSquaredLogarithmicError
from keras.callbacks import EarlyStopping
import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')



In [369]:
# # # Load data

train_original_data = pd.read_csv('/content/drive/MyDrive/dataset/UNSW_repaired_data_new/UNSW_repaired_noisy_dur_02_5.csv')

OUTLIER_PECENTAGE = 5

# Shuffle data
train_original_data = train_original_data.sample(frac=1).reset_index(drop=True)

In [370]:
# # # Preprocessing data

labelencoder = LabelEncoder()

train_original_data["proto"] = labelencoder.fit_transform(train_original_data["proto"])
train_original_data["service"] = labelencoder.fit_transform(train_original_data["service"])
train_original_data["state"] = labelencoder.fit_transform(train_original_data["state"])
train_original_data["attack_cat"] = labelencoder.fit_transform(train_original_data["attack_cat"])

unscaled_train_original_data_target = train_original_data.iloc[:,47] # last column
unscaled_train_original_data_inputs = train_original_data.iloc[:,3:46] # remove index and label columns

#Scale
scaler = StandardScaler()

scaler.fit(unscaled_train_original_data_inputs)
scaled_train_inputs = scaler.transform(unscaled_train_original_data_inputs)

X = scaled_train_inputs
y = unscaled_train_original_data_target


# print(np.count_nonzero(y == 0))

In [371]:
# # # Splitting the data

x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.3,stratify=y)


In [372]:
# # Random Forest
forest = RandomForestClassifier(n_estimators = 10, random_state = 0)
forest.fit(x_train, y_train)
preds = forest.predict(x_test)

# Metrics
print('accuracy_score: ', round((accuracy_score(y_test,preds)*100),2))
print('f1_score: ', round(f1_score(y_test,preds),6))
print('precision_score: ', round(precision_score(y_test,preds),6))
print('recall_score: ', round(recall_score(y_test,preds),6))
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
specificity = tn/(tn+fp)
print('Specificity : ', round(specificity,6))



accuracy_score:  94.35
f1_score:  0.970851
precision_score:  0.955601
recall_score:  0.986596
Specificity :  0.058391


In [373]:
# # # # Autoencoder

# Splitting the for testing and validating dataset
x_val, x_test, y_val, y_test = train_test_split(x_test,y_test, test_size=0.33,stratify=y_test)

class AutoEncoder(Model):
  """
  Parameters
  ----------
  output_units: int
    Number of output units
  
  code_size: int
    Number of units in bottle neck
  """

  def __init__(self, output_units, code_size=8):
    super().__init__()
    self.encoder = Sequential([
      Dense(64, activation='relu'),
      Dropout(0.1),
      Dense(32, activation='relu'),
      Dropout(0.1),
      Dense(16, activation='relu'),
      Dropout(0.1),
      Dense(code_size, activation='relu')
    ])
    self.decoder = Sequential([
      Dense(16, activation='relu'),
      Dropout(0.1),
      Dense(32, activation='relu'),
      Dropout(0.1),
      Dense(64, activation='relu'),
      Dropout(0.1),
      Dense(output_units, activation='sigmoid')
    ])
  
  def call(self, inputs):
    encoded = self.encoder(inputs)
    decoded = self.decoder(encoded)
    return decoded

model = AutoEncoder(output_units=x_train.shape[1])
# configurations of model
model.compile(loss='msle', metrics=['mse'], optimizer='adam')

# simple early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

history = model.fit(
    x_train,
    x_train,
    epochs=50,
    batch_size=128,
    validation_data=(x_val, x_val),
    verbose=0,
    callbacks=[es]
)

def find_threshold(model, x_train_scaled):
  # another method to find threshold
  reconstructions = model.predict(x_train_scaled)
  # provides losses of individual instances
  reconstruction_errors = tf.keras.losses.msle(reconstructions, x_train_scaled)

  threshold_2 = np.percentile(reconstruction_errors, 100-OUTLIER_PECENTAGE)
  return threshold_2

def get_predictions(model, x_test_scaled, threshold):
  predictions = model.predict(x_test_scaled)
  # provides losses of individual instances
  errors = tf.keras.losses.msle(predictions, x_test_scaled)
  # 0 = anomaly, 1 = normal
  anomaly_mask = pd.Series(errors) > threshold
  preds = anomaly_mask.map(lambda x: 0.0 if x == True else 1.0)
  return preds

threshold = find_threshold(model, x_train)
preds = get_predictions(model, x_test, threshold)

# Metrics
print('accuracy_score: ', round((accuracy_score(y_test,preds)*100),2))
print('f1_score: ', round(f1_score(y_test,preds),6))
print('precision_score: ', round(precision_score(y_test,preds),6))
print('recall_score: ', round(recall_score(y_test,preds),6))
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
specificity = tn/(tn+fp)
print('Specificity : ', round(specificity,6))

Epoch 2: early stopping
accuracy_score:  90.83
f1_score:  0.951801
precision_score:  0.953983
recall_score:  0.949629
Specificity :  0.058617


In [None]:
# # # KNeighborsClassifier

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(x_train, y_train)

preds = neigh.predict(x_test)

# Metrics
print('accuracy_score: ', round((accuracy_score(y_test,preds)*100),2))
print('f1_score: ', round(f1_score(y_test,preds),6))
print('precision_score: ', round(precision_score(y_test,preds),6))
print('recall_score: ', round(recall_score(y_test,preds),6))
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
specificity = tn/(tn+fp)
print('Specificity : ', round(specificity,6))