# Network Intrusion Detection based on Machine Learning

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

# Reading the Dataset

In [2]:
data = pd.read_csv("networkintrusion.csv")

$head()$ will display the top 5 observations of the dataset

In [3]:
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,private,REJ,0.0,0.0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,anomaly
1,0,tcp,private,REJ,0.0,0.0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,anomaly
2,2,tcp,ftp_data,SF,12983.0,0.0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20.0,0.0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,anomaly
4,1,tcp,telnet,RSTO,0.0,15.0,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,anomaly


### Analyzing the data

$info()$ helps to understand the data type and information about data, including the number of records in each column, data having null or not null, Data type, the memory usage of the dataset.

We can find that the network intrusion dataset has 40 columns, among them, "protocol_type", "service", "flag", and "class" columns are object type or categorical features (Not numerical)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22544 entries, 0 to 22543
Data columns (total 40 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   duration                     22544 non-null  int64  
 1   protocol_type                22544 non-null  object 
 2   service                      22544 non-null  object 
 3   flag                         22544 non-null  object 
 4   src_bytes                    22538 non-null  float64
 5   dst_bytes                    22538 non-null  float64
 6   land                         22544 non-null  int64  
 7   wrong_fragment               22544 non-null  int64  
 8   urgent                       22544 non-null  int64  
 9   hot                          22544 non-null  int64  
 10  num_failed_logins            22544 non-null  int64  
 11  logged_in                    22544 non-null  int64  
 12  num_compromised              22544 non-null  int64  
 13  root_shell      

Some machine learning algorithms can handle categorical features directly without requiring them to be converted to numerical values. For example, Decision Trees and Random Forests can handle categorical variables naturally by splitting on different categories.

However, many other algorithms, such as logistic regression, support vector machines, and neural networks, require numerical input. In such cases, converting categorical features to numerical values becomes necessary. This process is called "encoding" categorical variables.

Since we will work with all classification models, we will use LabelEncoder() to convert the categorical features to numerical values.

In [None]:
label_encoder = LabelEncoder()
data['protocol_type'] = label_encoder.fit_transform(data['protocol_type'])
data['service'] = label_encoder.fit_transform(data['service'])
data['flag'] = label_encoder.fit_transform(data['flag'])
data['class'] = label_encoder.fit_transform(data['class'])

### Checking for missing values

$isnull()$ is used to identify null/ missing values in the data
$isnull().sum()$ will show us the total number of missing records in each column. For example, "src_bytes" and "dst_bytes" each have 6 values missing.

In [7]:
data.isnull().sum() 

duration                       0
protocol_type                  0
service                        0
flag                           0
src_bytes                      6
dst_bytes                      6
land                           0
wrong_fragment                 0
urgent                         0
hot                            0
num_failed_logins              0
logged_in                      0
num_compromised                0
root_shell                     0
num_root                       0
num_shells                     0
num_access_files               0
num_outbound_cmds              0
is_host_login                  0
is_guest_login                 0
count                          9
srv_count                      9
serror_rate                    0
srv_serror_rate                0
rerror_rate                    0
srv_rerror_rate                0
same_srv_rate                  0
diff_srv_rate                  0
srv_diff_host_rate             0
dst_host_count                 0
dst_host_s

We are filling the missing values with the mean of all values in that column

In [131]:
data.fillna(data.mean(), inplace=True)

Now our dataset has only numerical values. This is easier to analyze using the machine learning classifiers

In [132]:
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,1,45,1,0.0,0.0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,0
1,0,1,45,1,0.0,0.0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,0
2,2,1,19,9,12983.0,0.0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,1
3,0,0,13,9,20.0,0.0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,0
4,1,1,55,2,0.0,15.0,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,0


Now we will split the independent variables or features from the target variables or labels. In other words, we will keep the features in $X$ variable the labels in $y$ variable.

In [133]:
X = data.drop(columns=['class'])
y = data['class'] 

We will utilize stratified k-fold to maintain a uniform class distribution as the original dataset. This ensures that each fold is representative of the overall dataset, providing a more reliable estimate of the model's performance, particularly for classification tasks. Stratified k-fold is particularly useful when there is class imbalance in the dataset.

In [8]:
skf = StratifiedKFold(n_splits = 5)

NameError: name 'X' is not defined

The numerical values of the features are still unscaled and widely varied. Therefore, we need to scale the values. Scaling ensures that all features contribute equally to the model fitting process by bringing them to a similar scale. Without scaling, features with larger magnitudes may dominate those with smaller magnitudes, leading to biased model training.

In [None]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = scaler.fit_transform(X)
X_scaled

# Logistic Regression

In [136]:
from sklearn.linear_model import LogisticRegression

In [137]:
lr = LogisticRegression()
true_labels = []
pred_labels = []
accuracies = []

In [138]:
for train_index, test_index in skf.split(X_scaled, y):
    
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    lr.fit(X_train, y_train)
    prediction_lr = lr.predict(X_test)
    
    true_labels.extend(y_test)
    pred_labels.extend(prediction_lr)
    
    accuracy = accuracy_score(y_test, prediction_lr)
    accuracies.append(accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [139]:
print('5-fold Accuracies', accuracies)
print('Mean Accuracy', np.mean(accuracies))

5-fold Accuracies [0.9350188511865158, 0.9370148591705478, 0.936127744510978, 0.936127744510978, 0.9347826086956522]
Mean Accuracy 0.9358143616149344


In [140]:
print('Accuracy of Logistic Regression classifier: {:.4f}'.format(accuracy_score(true_labels, pred_labels)))
A = confusion_matrix(true_labels, pred_labels)
print('Total testing data :', A[0][0] + A[0][1] + A[1][0] + A[1][1])
print('True Positive: ', A[0][0])
print('True Negative: ', A[1][1])
print('False Positive: ', A[0][1])
print('Flase Negative: ', A[1][0])
print(classification_report(true_labels, pred_labels, digits=4))

Accuracy of Logistic Regression classifier: 0.9358
Total testing data : 22544
True Positive:  12235
True Negative:  8862
False Positive:  598
Flase Negative:  849
              precision    recall  f1-score   support

           0     0.9351    0.9534    0.9442     12833
           1     0.9368    0.9126    0.9245      9711

    accuracy                         0.9358     22544
   macro avg     0.9359    0.9330    0.9343     22544
weighted avg     0.9358    0.9358    0.9357     22544



# Random Forest

In [141]:
from sklearn.ensemble import RandomForestClassifier

In [142]:
rf = RandomForestClassifier()
true_labels = []
pred_labels = []
accuracies = []

In [143]:
for train_index, test_index in skf.split(X_scaled, y):
    
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    rf.fit(X_train, y_train)
    prediction_rf = rf.predict(X_test)
    
    true_labels.extend(y_test)
    pred_labels.extend(prediction_rf)
    
    accuracy = accuracy_score(y_test, prediction_rf)
    accuracies.append(accuracy)

In [144]:
print('5-fold Accuracies', accuracies)
print('Mean Accuracy', np.mean(accuracies))

5-fold Accuracies [0.9880239520958084, 0.9878021734309159, 0.983810157462852, 0.9869150587713462, 0.9849157054125999]
Mean Accuracy 0.9862934094347044


In [145]:
print('Accuracy of Random Forest classifier: {:.4f}'.format(accuracy_score(true_labels, pred_labels)))
B = confusion_matrix(true_labels, pred_labels)
print('Total testing data :', B[0][0] + B[0][1] + B[1][0] + B[1][1])
print('True Positive: ', B[0][0])
print('True Negative: ', B[1][1])
print('False Positive: ', B[0][1])
print('Flase Negative: ', B[1][0])
print(classification_report(true_labels, pred_labels, digits=4))

Accuracy of Random Forest classifier: 0.9863
Total testing data : 22544
True Positive:  12682
True Negative:  9553
False Positive:  151
Flase Negative:  158
              precision    recall  f1-score   support

           0     0.9877    0.9882    0.9880     12833
           1     0.9844    0.9837    0.9841      9711

    accuracy                         0.9863     22544
   macro avg     0.9861    0.9860    0.9860     22544
weighted avg     0.9863    0.9863    0.9863     22544



# XGBoost

In [146]:
import xgboost
from xgboost import XGBClassifier

In [147]:
xgb = XGBClassifier() 
true_labels = []
pred_labels = []
accuracies = []

In [148]:
for train_index, test_index in skf.split(X_scaled, y):
    
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    xgb.fit(X_train, y_train)
    prediction_xgb = xgb.predict(X_test)
    
    true_labels.extend(y_test)
    pred_labels.extend(prediction_xgb)
    
    accuracy = accuracy_score(y_test, prediction_xgb)
    accuracies.append(accuracy)





















In [149]:
print('5-fold Accuracies', accuracies)
print('Mean Accuracy', np.mean(accuracies))

5-fold Accuracies [0.9897981814149479, 0.9889110667553781, 0.9866932801064537, 0.9878021734309159, 0.9889086069210293]
Mean Accuracy 0.988422661725745


In [150]:
print('Accuracy of Random Forest classifier: {:.4f}'.format(accuracy_score(true_labels, pred_labels)))
B = confusion_matrix(true_labels, pred_labels)
print('Total testing data :', B[0][0] + B[0][1] + B[1][0] + B[1][1])
print('True Positive: ', B[0][0])
print('True Negative: ', B[1][1])
print('False Positive: ', B[0][1])
print('Flase Negative: ', B[1][0])
print(classification_report(true_labels, pred_labels, digits=4))

Accuracy of Random Forest classifier: 0.9884
Total testing data : 22544
True Positive:  12701
True Negative:  9582
False Positive:  132
Flase Negative:  129
              precision    recall  f1-score   support

           0     0.9899    0.9897    0.9898     12833
           1     0.9864    0.9867    0.9866      9711

    accuracy                         0.9884     22544
   macro avg     0.9882    0.9882    0.9882     22544
weighted avg     0.9884    0.9884    0.9884     22544



In [None]:
As we can see, XGBoost is already performing very well, returning F1 score of 98