In [2]:
# This program requires "all_data.csv" to be located in the same directory.
# It performs feature selection optimization for Naive Bayes, QDA, and MLP algorithms.
# The optimization uses a trial-and-error method, starting with the feature list generated by "04_2_feature_selection_for_attack_files.py" (ordered by importance).
# Each feature is tentatively added to the list; it is kept if the resulting F-measure is at least as high as the current maximum, otherwise it is removed again.
# The program outputs the highest achieved F-measure and the optimal feature list.

In [4]:
# Mount Google Drive into the Colab session so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')
import os
# Drive folder that holds this project's files; the dataset path is built from it.
base_path = "/content/drive/MyDrive/Intrusion Detection System/"

Mounted at /content/drive


In [6]:
# Import necessary modules
from sklearn import metrics, preprocessing
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import average_precision_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

import pandas as pd
import warnings
import time
# Suppress all warnings and record the start time used for the final runtime report.
warnings.filterwarnings("ignore")
seconds = time.time()

# Columns to load from the CSV: the 20 candidate features followed by the target column.
features = [
    "Bwd Packet Length Std",
    "Flow Bytes/s",
    "Total Length of Fwd Packets",
    "Fwd Packet Length Std",
    "Flow IAT Std",
    "Flow IAT Min",
    "Fwd IAT Total",
    "Flow Duration",
    "Bwd Packet Length Max",
    "Flow IAT Max",
    "Flow IAT Mean",
    "Total Length of Bwd Packets",
    "Fwd Packet Length Min",
    "Bwd Packet Length Mean",
    "Flow Packets/s",
    "Fwd Packet Length Mean",
    "Total Backward Packets",
    "Total Fwd Packets",
    "Fwd Packet Length Max",
    "Bwd Packet Length Min",
    'Label',
]

# Load only the listed columns from the dataset stored under base_path.
csv_file = os.path.join(base_path, "all_data.csv")
df = pd.read_csv(csv_file, usecols=features)

# Print a numbered legend of the candidate features (the trailing Label entry is skipped).
print('%-17s %-17s' % ("Feature Number", "Feature"))
for number, name in enumerate(features[:-1], start=1):
    print('%-17s %-17s' % (number, name))
print('\n\n\n')

# Encode the target as a binary class: rows labelled "BENIGN" become 1 and
# every other label becomes 0.
# The column is addressed by name rather than by position because
# read_csv(usecols=...) keeps the file's column order, so "Label" is not
# guaranteed to be the last column of df.
# The comparison is vectorized and produces an integer column directly, so no
# per-row Python loop and no follow-up numeric conversion are needed.
df['Label'] = (df['Label'] == "BENIGN").astype(int)

y = df['Label'].values  # labels
my_list = []  # features currently kept by the selection loop
least = 0     # best F1-score seen so far

# Define the machine learning algorithms to be used, keyed by the name printed
# in the results table. The commented-out entries are alternative classifiers
# that are left disabled.
ml_list = {
    "Naive Bayes": GaussianNB(),
    "QDA": QDA(),
    # "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    # "ID3": DecisionTreeClassifier(max_depth=5, criterion="entropy"),
    # "AdaBoost": AdaBoostClassifier(),
    # "Nearest Neighbors": KNeighborsClassifier(3),
    # random_state pins the MLP's weight initialisation and batch sampling so
    # repeated runs produce the same scores and hence the same feature list.
    "MLP": MLPClassifier(hidden_layer_sizes=(13,13,13), max_iter=500, random_state=0)
}

features.pop()  # Remove the Label tag (no longer needed)
print('%-17s %-30s %-10s  %-10s %-15s' % ("ML algorithm", "Feature Name", "F1-score", "Accuracy", "Feature List"))

# Greedy forward selection, run independently for each ML algorithm:
# candidate features are tried one at a time in list order and kept only if
# the macro F1-score does not drop below the best score seen so far.
for j in ml_list:
    # Start every algorithm from scratch. The best score must be reset here,
    # otherwise later algorithms would be compared against the best F1 of the
    # previous algorithm and features would be wrongly rejected.
    my_list = []
    least = 0
    for i in features:  # iterate over each candidate feature
        my_list.append(i)
        X = df.loc[:, my_list].values  # data for current feature set

        # Hold-out evaluation (80% train, 20% test). The fixed random_state
        # gives the same split for every candidate, so scores are comparable.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

        # Apply the machine learning algorithm
        clf = ml_list[j]
        clf.fit(X_train, y_train)
        predict = clf.predict(X_test)
        result = f1_score(y_test, predict, average='macro')
        # Accuracy is derived from the predictions already made instead of
        # calling clf.score(), which would predict on the test set again.
        accuracy = round(metrics.accuracy_score(y_test, predict), 2)

        # Translate the evaluated feature set into the feature numbers shown
        # in the legend (positions in `features`), for brevity.
        temp = "[" + ", ".join(str(features.index(ii) + 1) for ii in my_list) + "]"

        # If current F1-score is equal or higher than the previous best, keep the feature
        if result >= least:
            least = result
            print('%-17s %-30s %-10s  %-10s %-15s %-15s' % (j, i, result, accuracy, temp, "------> New feature found!!!"))
        else:
            my_list.pop()  # discard the candidate that was just tried
            print('%-17s %-30s %-10s  %-10s %-15s' % (j, i, result, accuracy, temp))
    print("F1 =", least, j, "The most efficient feature list =", my_list, "\n\n")

print("operation time: = ", time.time() - seconds, "seconds")

Feature Number    Feature          
1                 Bwd Packet Length Std
2                 Flow Bytes/s     
3                 Total Length of Fwd Packets
4                 Fwd Packet Length Std
5                 Flow IAT Std     
6                 Flow IAT Min     
7                 Fwd IAT Total    
8                 Flow Duration    
9                 Bwd Packet Length Max
10                Flow IAT Max     
11                Flow IAT Mean    
12                Total Length of Bwd Packets
13                Fwd Packet Length Min
14                Bwd Packet Length Mean
15                Flow Packets/s   
16                Fwd Packet Length Mean
17                Total Backward Packets
18                Total Fwd Packets
19                Fwd Packet Length Max
20                Bwd Packet Length Min




ML algorithm      Feature Name                   F1-score    Accuracy   Feature List   
Naive Bayes       Bwd Packet Length Std          0.7258939444619579  0.88       [1,          