# Intrusion Detection System

In [109]:
import pandas as pd
import numpy as np
import seaborn as sns
from xgboost import XGBClassifier
import pickle as pk
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import PCA

In [110]:
train = pd.read_csv("Train_data.csv")
test = pd.read_csv("Test_data.csv")

l_encoder = LabelEncoder()
train['protocol_type'] = l_encoder.fit_transform(train['protocol_type'])
train['service'] = l_encoder.fit_transform(train['service'])
train['flag'] = l_encoder.fit_transform(train['flag'])
train['class'] = l_encoder.fit_transform(train['class'])

test['protocol_type'] = l_encoder.fit_transform(test['protocol_type'])
test['service'] = l_encoder.fit_transform(test['service'])
test['flag'] = l_encoder.fit_transform(test['flag'])

print(train.describe())
# no need for further treatment as there are no NaN values, duplicates or outliers

print(train)
print(test)

features = train.to_numpy()
# covar = np.cov(features)
# print(covar)

           duration  protocol_type       service          flag     src_bytes  \
count  25192.000000   25192.000000  25192.000000  25192.000000  2.519200e+04   
mean     305.054104       1.053827     29.039139      6.982455  2.433063e+04   
std     2686.555640       0.426998     15.555601      2.679322  2.410805e+06   
min        0.000000       0.000000      0.000000      0.000000  0.000000e+00   
25%        0.000000       1.000000     19.000000      5.000000  0.000000e+00   
50%        0.000000       1.000000     22.000000      9.000000  4.400000e+01   
75%        0.000000       1.000000     46.000000      9.000000  2.790000e+02   
max    42862.000000       2.000000     65.000000     10.000000  3.817091e+08   

          dst_bytes          land  wrong_fragment       urgent           hot  \
count  2.519200e+04  25192.000000    25192.000000  25192.00000  25192.000000   
mean   3.491847e+03      0.000079        0.023738      0.00004      0.198039   
std    8.883072e+04      0.008910      

In [111]:
# the correlation matrix
correlation_matrix = train.corr()
print(correlation_matrix)

                             duration  protocol_type   service      flag  \
duration                     1.000000       0.036421  0.101301 -0.066634   
protocol_type                0.036421       1.000000  0.025322  0.094926   
service                      0.101301       0.025322  1.000000 -0.295491   
flag                        -0.066634       0.094926 -0.295491  1.000000   
src_bytes                    0.084864      -0.001286  0.008554 -0.006599   
dst_bytes                    0.013258      -0.004734 -0.013782  0.027606   
land                        -0.001012      -0.001123 -0.006896 -0.006593   
wrong_fragment              -0.010358       0.176420  0.088169  0.068693   
urgent                      -0.000486      -0.000794  0.011325  0.004744   
hot                          0.004202      -0.011589 -0.056165  0.067027   
num_failed_logins            0.011108      -0.003305  0.029601 -0.010920   
logged_in                   -0.063703      -0.101810 -0.138824  0.587882   
num_compromi

In [112]:
# selecting the 10 most important features required for model's decision making
X_train = train.drop(['class'], axis = 1)
Y_train = train['class']

rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

feature_importance = rfc.feature_importances_
sorted_indices = feature_importance.argsort()[: : -1] #descending order

selected_indices = sorted_indices[: 10]
selected_features = X_train.columns[selected_indices]

print(selected_features)

Index(['src_bytes', 'dst_bytes', 'flag', 'dst_host_same_srv_rate',
       'same_srv_rate', 'dst_host_srv_count', 'logged_in', 'protocol_type',
       'diff_srv_rate', 'count'],
      dtype='object')


In [113]:
# splitting the dataset for training and testing
X = train[selected_features]
Y = train['class']

print(X)
print(Y)

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size = 0.80, random_state = 42)

       src_bytes  dst_bytes  flag  dst_host_same_srv_rate  same_srv_rate  \
0            491          0     9                    0.17           1.00   
1            146          0     9                    0.00           0.08   
2              0          0     5                    0.10           0.05   
3            232       8153     9                    1.00           1.00   
4            199        420     9                    1.00           1.00   
...          ...        ...   ...                     ...            ...   
25187          0          0     2                    0.03           0.07   
25188        334          0     9                    1.00           1.00   
25189          0          0     1                    0.05           0.07   
25190          0          0     5                    0.08           0.14   
25191          0          0     5                    0.19           0.24   

       dst_host_srv_count  logged_in  protocol_type  diff_srv_rate  count  
0          

In [114]:
# LOGISTIC REGRESSION
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

logistic_accuracy = accuracy_score(y_test, y_pred)
logistic_precision = precision_score(y_test, y_pred)
logistic_recall = recall_score(y_test, y_pred)
logistic_f1 = f1_score(y_test, y_pred)
logistic_conf_matrix = confusion_matrix(y_test, y_pred)
logistic_report = classification_report(y_test,y_pred)

print("Accuracy : ", logistic_accuracy)
print("Precision : ", logistic_precision)
print("Recall : ", logistic_recall)
print("F1 Score : ", logistic_f1)
print("Confusion Matrix : ", logistic_conf_matrix)
print("Classification Report:")
print(logistic_report)

Accuracy :  0.874975193490772
Precision :  0.8830584707646177
Recall :  0.881077038145101
F1 Score :  0.8820666417072257
Confusion Matrix :  [[2053  312]
 [ 318 2356]]
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      2365
           1       0.88      0.88      0.88      2674

    accuracy                           0.87      5039
   macro avg       0.87      0.87      0.87      5039
weighted avg       0.87      0.87      0.87      5039



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [115]:
# reducing the number of features using RFECV while still maintaining or improving the accuracy of the training model
logistic_model = LogisticRegression()
logistic_rfecv = RFECV(estimator = logistic_model, step = 1, cv = StratifiedKFold(n_splits = 5), scoring = 'accuracy')
logistic_rfecv.fit(X_train, y_train)
print("Optimal number of features : ", logistic_rfecv.n_features_)
logistic_reduced_features = X_train.columns[logistic_rfecv.support_]
print("Reduced features : ", logistic_reduced_features)

X_train_reduced = logistic_rfecv.transform(X_train)
X_test_reduced = logistic_rfecv.transform(X_test)
logistic_model.fit(X_train_reduced, y_train)
y_pred_reduced = logistic_model.predict(X_test_reduced)

logistic_accuracy_reduced_rfecv = accuracy_score(y_test, y_pred_reduced)
logistic_precision_reduced_rfecv = precision_score(y_test, y_pred_reduced)
logistic_recall_reduced_rfecv = recall_score(y_test, y_pred_reduced)
logistic_f1_reduced_rfecv = f1_score(y_test, y_pred_reduced)
logistic_conf_matrix_reduced_rfecv = confusion_matrix(y_test, y_pred_reduced)

# Print evaluation metrics
print("Accuracy (reduced features): ", logistic_accuracy_reduced_rfecv)
print("Precision (reduced features): ", logistic_precision_reduced_rfecv)
print("Recall (reduced features): ", logistic_recall_reduced_rfecv)
print("F1 Score (reduced features): ", logistic_f1_reduced_rfecv)
print("Confusion Matrix (reduced features): ", logistic_conf_matrix_reduced_rfecv)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Optimal number of features :  6
Reduced features :  Index(['flag', 'dst_host_same_srv_rate', 'same_srv_rate', 'logged_in',
       'protocol_type', 'diff_srv_rate'],
      dtype='object')
Accuracy (reduced features):  0.944433419329232
Precision (reduced features):  0.9469753547423451
Recall (reduced features):  0.9483919222139118
F1 Score (reduced features):  0.9476831091180867
Confusion Matrix (reduced features):  [[2223  142]
 [ 138 2536]]


In [116]:
# XGBOOST GRADIENT
XGB_model = XGBClassifier()
XGB_model.fit(X_train, y_train)
y_pred = XGB_model.predict(X_test)

XGB_accuracy = accuracy_score(y_test, y_pred)
XGB_precision = precision_score(y_test, y_pred)
XGB_recall = recall_score(y_test, y_pred)
XGB_f1 = f1_score(y_test, y_pred)
XGB_conf_matrix = confusion_matrix(y_test, y_pred)
XGB_report = classification_report(y_test,y_pred)

print("Accuracy : ", XGB_accuracy)
print("Precision : ", XGB_precision)
print("Recall : ", XGB_recall)
print("F1 Score : ", XGB_f1)
print("Confusion Matrix : ", XGB_conf_matrix)
print("Classification Report:")
print(XGB_report)

Accuracy :  0.9968247668188133
Precision :  0.9973802395209581
Recall :  0.9966342557965595
F1 Score :  0.9970071081182192
Confusion Matrix :  [[2358    7]
 [   9 2665]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2365
           1       1.00      1.00      1.00      2674

    accuracy                           1.00      5039
   macro avg       1.00      1.00      1.00      5039
weighted avg       1.00      1.00      1.00      5039



In [124]:
XGB_model = XGBClassifier()
XGB_rfecv = RFECV(estimator = XGB_model, step = 1, cv = StratifiedKFold(n_splits = 5), scoring = 'accuracy')
XGB_rfecv.fit(X_train, y_train)
print("Optimal number of features : ", XGB_rfecv.n_features_)
XGB_reduced_features = X_train.columns[XGB_rfecv.support_]
print("Reduced features : ", XGB_reduced_features)

X_train_reduced = XGB_rfecv.transform(X_train)
X_test_reduced = XGB_rfecv.transform(X_test)
XGB_model.fit(X_train_reduced, y_train)
y_pred_reduced = XGB_model.predict(X_test_reduced)

XGB_accuracy_reduced_rfecv = accuracy_score(y_test, y_pred_reduced)
XGB_precision_reduced_rfecv = precision_score(y_test, y_pred_reduced)
XGB_recall_reduced_rfecv = recall_score(y_test, y_pred_reduced)
XGB_f1_reduced_rfecv = f1_score(y_test, y_pred_reduced)
XGB_conf_matrix_reduced_rfecv = confusion_matrix(y_test, y_pred_reduced)

print("Accuracy (reduced features): ", XGB_accuracy_reduced_rfecv)
print("Precision (reduced features): ", XGB_precision_reduced_rfecv)
print("Recall (reduced features): ", XGB_recall_reduced_rfecv)
print("F1 Score (reduced features): ", XGB_f1_reduced_rfecv)
print("Confusion Matrix (reduced features): ", XGB_conf_matrix_reduced_rfecv)

Optimal number of features :  9
Reduced features :  Index(['src_bytes', 'dst_bytes', 'flag', 'dst_host_same_srv_rate',
       'dst_host_srv_count', 'logged_in', 'protocol_type', 'diff_srv_rate',
       'count'],
      dtype='object')
Accuracy (reduced features):  0.9972216709664616
Precision (reduced features):  0.9977544910179641
Recall (reduced features):  0.9970082273747195
F1 Score (reduced features):  0.9973812196034418
Confusion Matrix (reduced features):  [[2359    6]
 [   8 2666]]


In [125]:
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

# Fit PCA on the training data
pca = PCA(n_components = 9)  # Set the number of components you want to retain
pca.fit(X_train)

# Transform training and test data
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)

# Initialize XGBoost Classifier
XGB_model = XGBClassifier()

# Train XGBoost Classifier on reduced features
XGB_model.fit(X_train_reduced, y_train)

# Make predictions on test data
y_pred_reduced = XGB_model.predict(X_test_reduced)

# Evaluate the model
XGB_accuracy_reduced_pca = accuracy_score(y_test, y_pred_reduced)
XGB_precision_reduced_pca = precision_score(y_test, y_pred_reduced)
XGB_recall_reduced_pca = recall_score(y_test, y_pred_reduced)
XGB_f1_reduced_pca = f1_score(y_test, y_pred_reduced)
XGB_conf_matrix_reduced_pca = confusion_matrix(y_test, y_pred_reduced)

# Print evaluation metrics
print("Accuracy (reduced features): ", XGB_accuracy_reduced_pca)
print("Precision (reduced features): ", XGB_precision_reduced_pca)
print("Recall (reduced features): ", XGB_recall_reduced_pca)
print("F1 Score (reduced features): ", XGB_f1_reduced_pca)
print("Confusion Matrix (reduced features): ", XGB_conf_matrix_reduced_pca)


Accuracy (reduced features):  0.9978170271879341
Precision (reduced features):  0.9977570093457944
Recall (reduced features):  0.9981301421091997
F1 Score (reduced features):  0.9979435408487568
Confusion Matrix (reduced features):  [[2359    6]
 [   5 2669]]


In [123]:
XGB_model = XGBClassifier()

# Initialize SelectKBest with f_classif scoring function (appropriate for classification tasks)
k_best = SelectKBest(score_func=chi2, k=9)  # Set the number of features you want to select

# Fit SelectKBest on the training data
X_train_kbest = k_best.fit_transform(X_train, y_train)

# Get the indices of the selected features
kbest_indices = k_best.get_support(indices=True)

# Get the names of the selected features
kbest_features = X_train.columns[kbest_indices]

print("Selected features: ", kbest_features)

# Transform test data using the selected features
X_test_kbest = k_best.transform(X_test)

# Train XGBoost Classifier on selected features
XGB_model.fit(X_train_kbest, y_train)

# Make predictions on test data
y_pred_kbest = XGB_model.predict(X_test_kbest)

# Evaluate the model
XGB_accuracy_kbest = accuracy_score(y_test, y_pred_kbest)
XGB_precision_kbest = precision_score(y_test, y_pred_kbest)
XGB_recall_kbest = recall_score(y_test, y_pred_kbest)
XGB_f1_kbest = f1_score(y_test, y_pred_kbest)
XGB_conf_matrix_kbest = confusion_matrix(y_test, y_pred_kbest)

# Print evaluation metrics
print("Accuracy (kbest features): ", XGB_accuracy_kbest)
print("Precision (kbest features): ", XGB_precision_kbest)
print("Recall (kbest features): ", XGB_recall_kbest)
print("F1 Score (kbest features): ", XGB_f1_kbest)
print("Confusion Matrix (kbest features): ", XGB_conf_matrix_kbest)

Selected features:  Index(['src_bytes', 'dst_bytes', 'flag', 'dst_host_same_srv_rate',
       'same_srv_rate', 'dst_host_srv_count', 'logged_in', 'diff_srv_rate',
       'count'],
      dtype='object')
Accuracy (kbest features):  0.9956340543758683
Precision (kbest features):  0.9962574850299402
Recall (kbest features):  0.9955123410620793
F1 Score (kbest features):  0.9958847736625515
Confusion Matrix (kbest features):  [[2355   10]
 [  12 2662]]


In [119]:
decision_tree_model = DecisionTreeClassifier(random_state=42)

decision_tree_rfecv = RFECV(estimator = decision_tree_model, step = 1, cv = StratifiedKFold(n_splits = 5), scoring = 'accuracy')

# Fit RFECV to the training data
decision_tree_rfecv.fit(X_train, y_train)

print("Optimal number of features : ", decision_tree_rfecv.n_features_)
decision_tree_reduced_features = X_train.columns[decision_tree_rfecv.support_]
print("Reduced features : ", decision_tree_reduced_features)

X_train_reduced = decision_tree_rfecv.transform(X_train)
X_test_reduced = decision_tree_rfecv.transform(X_test)

decision_tree_model.fit(X_train_reduced, y_train)

y_pred_reduced = decision_tree_model.predict(X_test_reduced)

decision_tree_accuracy_reduced = accuracy_score(y_test, y_pred_reduced)
decision_tree_precision_reduced = precision_score(y_test, y_pred_reduced)
decision_tree_recall_reduced = recall_score(y_test, y_pred_reduced)
decision_tree_f1_reduced = f1_score(y_test, y_pred_reduced)
decision_tree_conf_matrix_reduced = confusion_matrix(y_test, y_pred_reduced)

# Print evaluation metrics
print("Accuracy (reduced features): ", decision_tree_accuracy_reduced)
print("Precision (reduced features): ", decision_tree_precision_reduced)
print("Recall (reduced features): ", decision_tree_recall_reduced)
print("F1 Score (reduced features): ", decision_tree_f1_reduced)
print("Confusion Matrix (reduced features): ", decision_tree_conf_matrix_reduced)


Optimal number of features :  8
Reduced features :  Index(['src_bytes', 'dst_bytes', 'flag', 'dst_host_same_srv_rate',
       'dst_host_srv_count', 'logged_in', 'protocol_type', 'count'],
      dtype='object')
Accuracy (reduced features):  0.9950386981543957
Precision (reduced features):  0.9955106621773289
Recall (reduced features):  0.9951383694839192
F1 Score (reduced features):  0.9953244810173929
Confusion Matrix (reduced features):  [[2353   12]
 [  13 2661]]


In [120]:
selector = SelectKBest(score_func = chi2, k = 9)

selector.fit(X_train, y_train)

X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

selected_feature_indices = selector.get_support(indices=True)
selected_features = X_train.columns[selected_feature_indices]
print("Selected Features:")
print(selected_features)

gnb_model = GaussianNB()

gnb_model.fit(X_train_selected, y_train)

y_pred = gnb_model.predict(X_test_selected)

accuracy_selected = accuracy_score(y_test, y_pred)
precision_selected = precision_score(y_test, y_pred)
recall_selected = recall_score(y_test, y_pred)
f1_selected = f1_score(y_test, y_pred)
conf_matrix_selected = confusion_matrix(y_test, y_pred)

print("Accuracy (selected features): ", accuracy_selected)
print("Precision (selected features): ", precision_selected)
print("Recall (selected features): ", recall_selected)
print("F1 Score (selected features): ", f1_selected)
print("Confusion Matrix (selected features): ", conf_matrix_selected)

Selected Features:
Index(['src_bytes', 'dst_bytes', 'flag', 'dst_host_same_srv_rate',
       'same_srv_rate', 'dst_host_srv_count', 'logged_in', 'protocol_type',
       'diff_srv_rate', 'count'],
      dtype='object')
Accuracy (selected features):  0.5737249454256796
Precision (selected features):  0.5549979088247595
Recall (selected features):  0.9925205684367988
F1 Score (selected features):  0.7119098712446352
Confusion Matrix (selected features):  [[ 237 2128]
 [  20 2654]]




In [121]:
data = [
    ["Logistic Regression", logistic_accuracy_reduced, logistic_precision_reduced, logistic_recall_reduced, logistic_f1_reduced],
    ["XGBoost", XGB_accuracy_reduced, XGB_precision_reduced, XGB_recall_reduced, XGB_f1_reduced],
    ["Decision Tree", decision_tree_accuracy_reduced, decision_tree_precision_reduced, decision_tree_recall_reduced, decision_tree_f1_reduced],
    ["Gaussian Naive Bayes", accuracy_selected, precision_selected, recall_selected, f1_selected]
]

# Define column names
col_names = ["Model", "Accuracy", "Precision", "Recall", "F1 Score"]

# Print the table
print(tabulate(data, headers = col_names, tablefmt = "fancy_grid"))

╒══════════════════════╤════════════╤═════════════╤══════════╤════════════╕
│ Model                │   Accuracy │   Precision │   Recall │   F1 Score │
╞══════════════════════╪════════════╪═════════════╪══════════╪════════════╡
│ Logistic Regression  │   0.867037 │    0.896675 │ 0.847046 │   0.871154 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ XGBoost              │   0.997619 │    0.997384 │ 0.99813  │   0.997757 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ Decision Tree        │   0.995039 │    0.995511 │ 0.995138 │   0.995324 │
├──────────────────────┼────────────┼─────────────┼──────────┼────────────┤
│ Gaussian Naive Bayes │   0.573725 │    0.554998 │ 0.992521 │   0.71191  │
╘══════════════════════╧════════════╧═════════════╧══════════╧════════════╛


In [128]:
data = [
    ["XGBoost+RFECV ", XGB_accuracy_reduced_rfecv, XGB_precision_reduced_rfecv, XGB_recall_reduced_rfecv, XGB_f1_reduced_rfecv],
    ["XGBoost+PCA", XGB_accuracy_reduced_pca, XGB_precision_reduced_pca, XGB_recall_reduced_pca, XGB_f1_reduced_pca],
    ["XGBoost+KBest", XGB_accuracy_kbest, XGB_precision_kbest, XGB_recall_kbest, XGB_f1_kbest],
]

# Define column names
col_names = ["Model", "Accuracy", "Precision", "Recall", "F1 Score"]

# Print the table
print(tabulate(data, headers = col_names, tablefmt = "fancy_grid"))

╒═══════════════╤════════════╤═════════════╤══════════╤════════════╕
│ Model         │   Accuracy │   Precision │   Recall │   F1 Score │
╞═══════════════╪════════════╪═════════════╪══════════╪════════════╡
│ XGBoost+RFECV │   0.997222 │    0.997754 │ 0.997008 │   0.997381 │
├───────────────┼────────────┼─────────────┼──────────┼────────────┤
│ XGBoost+PCA   │   0.997817 │    0.997757 │ 0.99813  │   0.997944 │
├───────────────┼────────────┼─────────────┼──────────┼────────────┤
│ XGBoost+KBest │   0.995634 │    0.996257 │ 0.995512 │   0.995885 │
╘═══════════════╧════════════╧═════════════╧══════════╧════════════╛
