In [2]:
import pandas as pd

# Load 'cleaned_data.csv' into the DataFrame `df` used by the cells below.
# The path is relative, so it is resolved against the kernel's current working directory.
# NOTE(review): presumably this file is the output of an earlier cleaning step, judging by
# its name — confirm where it is produced.
df = pd.read_csv('cleaned_data.csv')

In [3]:
from sklearn.model_selection import train_test_split

# Bucket every column by name: anything starting with 'attack_cat' is a label,
# everything else is treated as a feature.
# NOTE(review): any other target-like column that does not start with 'attack_cat'
# would end up in the features and leak the answer — confirm against the CSV schema.
label_columns, feature_columns = [], []
for column_name in df.columns:
    bucket = label_columns if column_name.startswith('attack_cat') else feature_columns
    bucket.append(column_name)

# Feature matrix (X) and label frame (y); y stays a DataFrame, not a Series.
X, y = df[feature_columns], df[label_columns]

# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
# NOTE(review): the split is not stratified, so rare classes may be unevenly represented
# between train and test — consider stratify=y if that matters for the evaluation.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting shapes as a quick sanity check.
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Training set shape (X_train, y_train): (128379, 55) (128379, 1)
Testing set shape (X_test, y_test): (32095, 55) (32095, 1)


In [4]:
# Import necessary libraries for data handling, analysis, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import scikit-learn modules for preprocessing, model selection, and metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Import the classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [5]:
# Instantiate the K-Nearest Neighbors classifier.
# n_neighbors=5 is a common starting point (it is also scikit-learn's default).
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the original, unscaled training data.
# y_train is a single-column DataFrame; flatten it to a 1-D array so scikit-learn does not
# emit a DataConversionWarning about a column-vector y. The fitted model is the same either
# way — this only passes the labels in the shape fit() expects.
# NOTE(review): KNN is distance-based, so features with large numeric ranges dominate the
# neighbour search on unscaled data — consider comparing against a StandardScaler pipeline.
knn.fit(X_train, y_train.values.ravel())

# Make predictions on the unscaled test data
y_pred_knn = knn.predict(X_test)

# Evaluate the model: overall accuracy, support-weighted F1, and per-class metrics.
print("K-Nearest Neighbors Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_knn, average='weighted'):.4f}")
print("\nClassification Report:")
# Set zero_division=0 to avoid warnings for labels with no predicted samples
print(classification_report(y_test, y_pred_knn, zero_division=0))

  return self._fit(X, y)


K-Nearest Neighbors Performance:
Accuracy: 0.8740
Weighted F1-Score: 0.8753

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.10      0.23      0.14       358
      Backdoor       0.10      0.13      0.11       297
           DoS       0.35      0.21      0.27      1070
      Exploits       0.79      0.82      0.81      5526
       Fuzzers       0.84      0.86      0.85      4048
       Generic       0.96      0.82      0.89      1472
        Normal       1.00      0.99      1.00     17114
Reconnaissance       0.71      0.71      0.71      1889
     Shellcode       0.57      0.41      0.48       289
         Worms       0.67      0.06      0.11        32

      accuracy                           0.87     32095
     macro avg       0.61      0.52      0.54     32095
  weighted avg       0.88      0.87      0.88     32095



In [6]:
# Instantiate the Decision Tree classifier.
# Capping max_depth limits how deep the tree can grow, which helps to prevent overfitting;
# the fixed random_state makes the fitted tree reproducible.
tree = DecisionTreeClassifier(max_depth=10, random_state=42)

# Train the model on the original, unscaled training data.
# y_train is a single-column DataFrame; flatten it to a 1-D array so every classifier in
# this notebook receives its labels in the same shape.
tree.fit(X_train, y_train.values.ravel())

# Make predictions on the unscaled test data
y_pred_tree = tree.predict(X_test)

# Evaluate the model: overall accuracy, support-weighted F1, and per-class metrics.
print("Decision Tree Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tree):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_tree, average='weighted'):.4f}")
print("\nClassification Report:")
# Set zero_division=0 to avoid warnings for labels with no predicted samples
# (rare classes may receive no predictions at all).
print(classification_report(y_test, y_pred_tree, zero_division=0))

Decision Tree Performance:
Accuracy: 0.8895
Weighted F1-Score: 0.8883

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.31      0.27      0.29       358
      Backdoor       0.28      0.21      0.24       297
           DoS       0.24      0.24      0.24      1070
      Exploits       0.81      0.83      0.82      5526
       Fuzzers       0.81      0.90      0.85      4048
       Generic       0.95      0.84      0.90      1472
        Normal       1.00      1.00      1.00     17114
Reconnaissance       0.87      0.73      0.79      1889
     Shellcode       0.47      0.51      0.49       289
         Worms       0.40      0.06      0.11        32

      accuracy                           0.89     32095
     macro avg       0.61      0.56      0.57     32095
  weighted avg       0.89      0.89      0.89     32095



In [7]:
# Build a Random Forest of 100 trees. random_state fixes the randomness for reproducibility,
# and n_jobs=-1 lets scikit-learn use every available core.
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)

# fit() expects a 1-D label array, so flatten the single-column y_train DataFrame first.
rf_train_labels = y_train.to_numpy().ravel()
rf.fit(X_train, rf_train_labels)

# Predict the class of every row in the held-out test set.
y_pred_rf = rf.predict(X_test)

# Report overall accuracy, support-weighted F1, and the per-class breakdown.
print("Random Forest Performance:")
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy: {rf_accuracy:.4f}")
rf_weighted_f1 = f1_score(y_test, y_pred_rf, average='weighted')
print(f"Weighted F1-Score: {rf_weighted_f1:.4f}")
print("\nClassification Report:")
rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

Random Forest Performance:
Accuracy: 0.8983
Weighted F1-Score: 0.8934

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.24      0.20      0.22       358
      Backdoor       0.17      0.15      0.16       297
           DoS       0.38      0.22      0.28      1070
      Exploits       0.80      0.87      0.83      5526
       Fuzzers       0.87      0.91      0.89      4048
       Generic       0.94      0.87      0.90      1472
        Normal       1.00      1.00      1.00     17114
Reconnaissance       0.75      0.76      0.76      1889
     Shellcode       0.58      0.54      0.56       289
         Worms       0.60      0.28      0.38        32

      accuracy                           0.90     32095
     macro avg       0.63      0.58      0.60     32095
  weighted avg       0.89      0.90      0.89     32095

