# Intrusion Detection System

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.naive_bayes import GaussianNB

In [38]:
train = pd.read_csv("Train_data.csv")
test = pd.read_csv("Test_data.csv")

l_encoder = LabelEncoder()
train['protocol_type'] = l_encoder.fit_transform(train['protocol_type'])
train['service'] = l_encoder.fit_transform(train['service'])
train['flag'] = l_encoder.fit_transform(train['flag'])
train['class'] = l_encoder.fit_transform(train['class'])

test['protocol_type'] = l_encoder.fit_transform(test['protocol_type'])
test['service'] = l_encoder.fit_transform(test['service'])
test['flag'] = l_encoder.fit_transform(test['flag'])

# no need for further treatment as there are no NaN values, duplicates or outliers

print(train)
print(test)

features = train.to_numpy()
covar = np.cov(features)
print(covar)

       duration  protocol_type  service  flag  src_bytes  dst_bytes  land  \
0             0              1       19     9        491          0     0   
1             0              2       41     9        146          0     0   
2             0              1       46     5          0          0     0   
3             0              1       22     9        232       8153     0   
4             0              1       22     9        199        420     0   
...         ...            ...      ...   ...        ...        ...   ...   
25187         0              1       16     2          0          0     0   
25188         0              1       19     9        334          0     0   
25189         0              1       46     1          0          0     0   
25190         0              1       38     5          0          0     0   
25191         0              1       17     5          0          0     0   

       wrong_fragment  urgent  hot  ...  dst_host_srv_count  \
0           

In [39]:
# selecting the 10 most important features required for model's decision making
X_train = train.drop(['class'], axis=1)
Y_train = train['class']

rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

feature_importance = rfc.feature_importances_
sorted_indices = feature_importance.argsort()[::-1] #descending order

selected_indices = sorted_indices[:10]
selected_features = X_train.columns[selected_indices]

print(selected_features)

Index(['src_bytes', 'dst_bytes', 'flag', 'same_srv_rate',
       'dst_host_same_srv_rate', 'dst_host_srv_count', 'count',
       'protocol_type', 'dst_host_diff_srv_rate', 'logged_in'],
      dtype='object')


In [40]:
# splitting the dataset for training and testing
X = train[selected_features]
Y = train['class']

print(X)
print(Y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.80, random_state = 42)

       src_bytes  dst_bytes  flag  same_srv_rate  dst_host_same_srv_rate  \
0            491          0     9           1.00                    0.17   
1            146          0     9           0.08                    0.00   
2              0          0     5           0.05                    0.10   
3            232       8153     9           1.00                    1.00   
4            199        420     9           1.00                    1.00   
...          ...        ...   ...            ...                     ...   
25187          0          0     2           0.07                    0.03   
25188        334          0     9           1.00                    1.00   
25189          0          0     1           0.07                    0.05   
25190          0          0     5           0.14                    0.08   
25191          0          0     5           0.24                    0.19   

       dst_host_srv_count  count  protocol_type  dst_host_diff_srv_rate  \
0           

In [50]:
# LOGISTIC REGRESSION
logistic_model = LogisticRegression()
logistic_model.fit(X_train,y_train)
y_pred = logistic_model.predict(X_test)

logistic_accuracy = accuracy_score(y_test, y_pred)
logistic_precision = precision_score(y_test, y_pred)
logistic_recall = recall_score(y_test, y_pred)
logistic_f1 = f1_score(y_test, y_pred)
logistic_conf_matrix = confusion_matrix(y_test, y_pred)
logistic_report = classification_report(y_test,y_pred)

print("Accuracy : ", logistic_accuracy)
print("Precision : ", logistic_precision)
print("Recall : ", logistic_recall)
print("F1 Score : ", logistic_f1)
print("Confusion Matrix : ", logistic_conf_matrix)
print("Classification Report:")
print(logistic_report)

Accuracy :  0.8906529073228815
Precision :  0.8725868725868726
Recall :  0.9296933433059088
F1 Score :  0.9002353793228318
Confusion Matrix :  [[2002  363]
 [ 188 2486]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.85      0.88      2365
           1       0.87      0.93      0.90      2674

    accuracy                           0.89      5039
   macro avg       0.89      0.89      0.89      5039
weighted avg       0.89      0.89      0.89      5039



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
# reducing the number of features while still maintaining or improving the accuracy of the training model

logistic_model = LogisticRegression()
logistic_rfecv = RFECV(estimator=logistic_model, step=1, cv=StratifiedKFold(n_splits=5), scoring='accuracy')
logistic_rfecv.fit(X_train, y_train)
print("Optimal number of features : ", logistic_rfecv.n_features_)
logistic_reduced_features = X_train.columns[logistic_rfecv.support_]
print("Reduced features : ", logistic_reduced_features)

X_train_reduced = logistic_rfecv.transform(X_train)
X_test_reduced = logistic_rfecv.transform(X_test)
logistic_model.fit(X_train_reduced, y_train)
y_pred_reduced = logistic_model.predict(X_test_reduced)

logistic_accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
logistic_precision_reduced = precision_score(y_test, y_pred_reduced)
logistic_recall_reduced = recall_score(y_test, y_pred_reduced)
logistic_f1_reduced = f1_score(y_test, y_pred_reduced)
logistic_conf_matrix_reduced = confusion_matrix(y_test, y_pred_reduced)

# Print evaluation metrics
print("Accuracy (reduced features): ", logistic_accuracy_reduced)
print("Precision (reduced features): ", logistic_precision_reduced)
print("Recall (reduced features): ", logistic_recall_reduced)
print("F1 Score (reduced features): ", logistic_f1_reduced)
print("Confusion Matrix (reduced features): ", logistic_conf_matrix_reduced)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimal number of features :  4
Reduced features :  Index(['same_srv_rate', 'protocol_type', 'dst_host_diff_srv_rate',
       'logged_in'],
      dtype='object')
Accuracy (reduced features):  0.9428458027386386
Precision (reduced features):  0.9382806759735489
Recall (reduced features):  0.9551234106207929
F1 Score (reduced features):  0.9466271312083024
Confusion Matrix (reduced features):  [[2197  168]
 [ 120 2554]]


In [46]:
# XGBOOST GRADIENT
XGB_model = XGBClassifier()
XGB_model.fit(X_train,y_train)
y_pred = XGB_model.predict(X_test)

XGB_accuracy = accuracy_score(y_test, y_pred)
XGB_precision = precision_score(y_test, y_pred)
XGB_recall = recall_score(y_test, y_pred)
XGB_f1 = f1_score(y_test, y_pred)
XGB_conf_matrix = confusion_matrix(y_test, y_pred)
XGB_report = classification_report(y_test,y_pred)

print("Accuracy : ", XGB_accuracy)
print("Precision : ", XGB_precision)
print("Recall : ", XGB_recall)
print("F1 Score : ", XGB_f1)
print("Confusion Matrix : ", XGB_conf_matrix)
print("Classification Report:")
print(XGB_report)

Accuracy :  0.9970232188926375
Precision :  0.9977536503182328
Recall :  0.9966342557965595
F1 Score :  0.9971936389148737
Confusion Matrix :  [[2359    6]
 [   9 2665]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2365
           1       1.00      1.00      1.00      2674

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039



In [47]:
XGB_model = XGBClassifier()
XGB_rfecv = RFECV(estimator=XGB_model, step=1, cv=StratifiedKFold(n_splits=5), scoring='accuracy')
XGB_rfecv.fit(X_train, y_train)
print("Optimal number of features : ", XGB_rfecv.n_features_)
XGB_reduced_features = X_train.columns[XGB_rfecv.support_]
print("Reduced features : ", XGB_reduced_features)

# Algorithm did not find any further improvements by removing additional features

X_train_reduced = XGB_rfecv.transform(X_train)
X_test_reduced = XGB_rfecv.transform(X_test)
XGB_model.fit(X_train_reduced, y_train)
y_pred_reduced = XGB_model.predict(X_test_reduced)

XGB_accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
XGB_precision_reduced = precision_score(y_test, y_pred_reduced)
XGB_recall_reduced = recall_score(y_test, y_pred_reduced)
XGB_f1_reduced = f1_score(y_test, y_pred_reduced)
XGB_conf_matrix_reduced = confusion_matrix(y_test, y_pred_reduced)

# Print evaluation metrics
print("Accuracy (reduced features): ", XGB_accuracy_reduced)
print("Precision (reduced features): ", XGB_precision_reduced)
print("Recall (reduced features): ", XGB_recall_reduced)
print("F1 Score (reduced features): ", XGB_f1_reduced)
print("Confusion Matrix (reduced features): ", XGB_conf_matrix_reduced)

Optimal number of features :  10
Reduced features :  Index(['src_bytes', 'dst_bytes', 'flag', 'same_srv_rate',
       'dst_host_same_srv_rate', 'dst_host_srv_count', 'count',
       'protocol_type', 'dst_host_diff_srv_rate', 'logged_in'],
      dtype='object')
Accuracy (reduced features):  0.9970232188926375
Precision (reduced features):  0.9977536503182328
Recall (reduced features):  0.9966342557965595
F1 Score (reduced features):  0.9971936389148737
Confusion Matrix (reduced features):  [[2359    6]
 [   9 2665]]
