# Importing libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, auc
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve
from sklearn.metrics import precision_recall_curve, PrecisionRecallDisplay, roc_auc_score, plot_roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

ImportError: cannot import name 'plot_roc_curve' from 'sklearn.metrics' (c:\Users\arise\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\metrics\__init__.py)

# Loading and Checking the dataset

We chose UNSW_NB15 dataset for this IDS project.

This is the link for [UNSW_NB15 dataset](https://www.kaggle.com/datasets/mrwellsdavid/unsw-nb15).

The training and testing sets were reversed, so we changed the names before loading them from CSV files.

In [None]:
df_train = pd.read_csv("./UNSW_NB15/UNSW_NB15_training-set.csv")
df_test = pd.read_csv("./UNSW_NB15/UNSW_NB15_testing-set.csv")
print("Length of training set: ", len(df_train))
print("Length of testing set: ", len(df_test))

In order to ensure the balance between the training and testing sets and avoid processing twice, we decided to concatenate them into one dataframe and redivide them with a different ratio later with *sklearn.model_selection.train_test_split()*.

In [None]:
df = pd.concat([df_train, df_test])
# information about the dataset
df.info()

In [None]:
df.describe(include="all")

In [None]:
df.head()

## Checking for duplicates

In [None]:
print(df.duplicated().sum())

There is no duplicate record.

## Checking for missing values

In [None]:
print(df.isna().sum())

There is no missing value.

## Checking the balance between benign and attack data

In [None]:
df['label'].value_counts().plot.bar()

In [None]:
df['label'].value_counts(normalize=True)

The ratio between attack and normal data is not equal, but just slightly imbalanced.
Therefore, we will not do a sampling fix here.

# Feature engineering

## Dropping unnecessary features
The first column we will drop is <code>id</code>. This is just for identification, so we can remove this column.

This is a binary classification problem, so we only use column <code>label</code> to classify <code>attack</code> (1) or <code>normal</code> (0).
Then, we do not need attack details in <code>attack_cat</code>.

In [None]:
df = df.drop(columns=['id', 'attack_cat'])

## Encoding categorical features
Encoding categorical features using LabelEncoder.

In [None]:
df_cat = df.select_dtypes(exclude=[np.number])
print(df_cat.columns)
for feature in df_cat.columns:
    df[feature] = LabelEncoder().fit_transform(df[feature])

In [None]:
df.head()

## Data Correlation
Removing highly correlated features.

In [None]:
sns.heatmap(df.corr())
plt.show()

In [None]:
columns = df.columns.tolist()
corr = df.corr()
correlated_vars = []
for i in range(len(columns) - 1):
    for j in range(i+1, len(columns)):
        if corr[columns[i]][columns[j]] > 0.98:
            print(columns[i], columns[j], corr[columns[i]][columns[j]])
            correlated_vars.append(columns[j])

In [None]:
df = df.drop(columns=correlated_vars)

## Splitting training and testing sets

In [None]:
X = df.drop(columns=['label'])
feature_list = list(X.columns)
X = np.array(X)
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [None]:
print("Training set:", len(X_train))
print("Testing set:", len(X_test))

## Scaling
Scaling all features using StandardScaler.

In [None]:
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Model training

In [None]:
models = {}
models['Decision Tree Classifier'] = DecisionTreeClassifier()
models['Random Forest Classifier'] = RandomForestClassifier()
models['Gaussian Naive Bayes'] = GaussianNB()

In [None]:
train_score, accuracy, precision, recall, training_time, y_pred = {}, {}, {}, {}, {}, {}
for key in models.keys():
    start_time = time.time()
    models[key].fit(X_train, y_train)
    training_time[key] = time.time() - start_time
    
    y_pred[key] = models[key].predict(X_test)
    
    train_score[key] = models[key].score(X_train, y_train)
    accuracy[key] = models[key].score(X_test, y_test)
    precision[key] = precision_score(y_test, y_pred[key])
    recall[key] = recall_score(y_test, y_pred[key])

Try Feature selection using Recursive Feature Elimination.

In [None]:
rfc_rfe = 'Random Forest Classifier + Recursive Feature Elimination'
models[rfc_rfe] = RandomForestClassifier()
rfe = RFE(models[rfc_rfe])
start_time = time.time()
rfe.fit(X_train, y_train)
training_time[rfc_rfe] = time.time() - start_time

X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

start_time = time.time()
models[rfc_rfe].fit(X_train_rfe, y_train)
training_time[rfc_rfe] = training_time[rfc_rfe] + (time.time() - start_time)
    
y_pred[key] = models[rfc_rfe].predict(X_test_rfe)
    
train_score[rfc_rfe] = models[rfc_rfe].score(X_train_rfe, y_train)
accuracy[rfc_rfe] = models[rfc_rfe].score(X_test_rfe, y_test)
precision[rfc_rfe] = precision_score(y_test, y_pred[key])
recall[rfc_rfe] = recall_score(y_test, y_pred[key])

In [None]:
print("Old number of features:", X.shape[1])
print("New number of features:", len(rfe.estimator_.feature_importances_))

## Models comparison

In [None]:
df_models = pd.DataFrame(index=models.keys(), columns=['Training score', 'Accuracy', 'Precision', 'Recall', 'Training time'])
df_models['Training score'] = train_score.values()
df_models['Accuracy'] = accuracy.values()
df_models['Precision'] = precision.values()
df_models['Recall'] = recall.values()
df_models['Training time'] = training_time.values()

In [None]:
df_models

Because Random Forest Classifier is the best model so far. We will choose this model for the Intrusion Detection System.
The following are more details about this model.

In [None]:
display = RocCurveDisplay.from_estimator(models['Random Forest Classifier'], X_test, y_test)
plt.show()

In [None]:
display = PrecisionRecallDisplay.from_predictions(y_test, y_pred['Random Forest Classifier'])

In [None]:
cm = confusion_matrix(y_test, y_pred['Random Forest Classifier'], labels=models['Random Forest Classifier'].classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=models['Random Forest Classifier'].classes_)
disp.plot(cmap='Blues')
plt.show()