In [7]:
import pandas as pd

df = pd.read_csv('cleaned_data.csv')

In [8]:
from sklearn.model_selection import train_test_split

# Identify feature and label columns
label_columns = [col for col in df.columns if col.startswith('attack_cat')]
feature_columns = [col for col in df.columns if not col.startswith('attack_cat')]

# Separate features (X) and labels (y)
X = df[feature_columns]
y = df[label_columns]

# Split the data into training and testing sets (e.g., 80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Training set shape (X_train, y_train): (206138, 82) (206138, 1)
Testing set shape (X_test, y_test): (51535, 82) (51535, 1)


In [9]:
# Import necessary libraries for data handling, analysis, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import scikit-learn modules for preprocessing, model selection, and metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Import the classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [10]:
# Instantiate the K-Nearest Neighbors classifier
# n_neighbors=5 is a common starting point
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the original, unscaled training data
knn.fit(X_train, y_train)

# Make predictions on the unscaled test data
y_pred_knn = knn.predict(X_test)

# Evaluate the model
print("K-Nearest Neighbors Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_knn, average='weighted'):.4f}")
print("\nClassification Report:")
# Set zero_division=0 to avoid warnings for labels with no predicted samples
print(classification_report(y_test, y_pred_knn, zero_division=0))

  return self._fit(X, y)


K-Nearest Neighbors Performance:
Accuracy: 0.7667
Weighted F1-Score: 0.7630

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.14      0.08      0.10       538
      Backdoor       0.07      0.02      0.03       471
           DoS       0.34      0.38      0.36      3293
      Exploits       0.63      0.69      0.66      8853
       Fuzzers       0.51      0.47      0.49      4742
       Generic       0.99      0.97      0.98     11849
        Normal       0.87      0.90      0.88     18675
Reconnaissance       0.64      0.55      0.60      2770
     Shellcode       0.54      0.23      0.32       318
         Worms       0.50      0.12      0.19        26

      accuracy                           0.77     51535
     macro avg       0.52      0.44      0.46     51535
  weighted avg       0.76      0.77      0.76     51535



In [11]:
# Instantiate the Decision Tree classifier
# Setting a max_depth helps to prevent the tree from overfitting
tree = DecisionTreeClassifier(max_depth=10, random_state=42)

# Train the model on the original, unscaled training data
tree.fit(X_train, y_train)

# Make predictions on the unscaled test data
y_pred_tree = tree.predict(X_test)

# Evaluate the model
print("Decision Tree Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_tree):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_tree, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_tree))

Decision Tree Performance:
Accuracy: 0.8064
Weighted F1-Score: 0.7731

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.67      0.04      0.08       538
      Backdoor       0.90      0.04      0.08       471
           DoS       0.52      0.06      0.11      3293
      Exploits       0.60      0.91      0.72      8853
       Fuzzers       0.67      0.33      0.44      4742
       Generic       1.00      0.98      0.99     11849
        Normal       0.85      0.96      0.90     18675
Reconnaissance       0.93      0.71      0.80      2770
     Shellcode       0.53      0.50      0.52       318
         Worms       0.67      0.23      0.34        26

      accuracy                           0.81     51535
     macro avg       0.73      0.48      0.50     51535
  weighted avg       0.80      0.81      0.77     51535



In [12]:
# Instantiate the Random Forest classifier
# n_estimators is the number of trees in the forest. n_jobs=-1 uses all available cores.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Train the model on the training data
# .values.ravel() is used to convert the y_train DataFrame to a 1D array, which is expected by the fit method
rf.fit(X_train, y_train.values.ravel())

# Make predictions on the test data
y_pred_rf = rf.predict(X_test)

# Evaluate the model
print("Random Forest Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_rf, average='weighted'):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

Random Forest Performance:
Accuracy: 0.8297
Weighted F1-Score: 0.8200

Classification Report:
                precision    recall  f1-score   support

      Analysis       0.93      0.12      0.21       538
      Backdoor       0.75      0.08      0.14       471
           DoS       0.36      0.24      0.28      3293
      Exploits       0.64      0.83      0.72      8853
       Fuzzers       0.69      0.63      0.66      4742
       Generic       1.00      0.98      0.99     11849
        Normal       0.92      0.94      0.93     18675
Reconnaissance       0.92      0.76      0.83      2770
     Shellcode       0.63      0.57      0.60       318
         Worms       0.50      0.31      0.38        26

      accuracy                           0.83     51535
     macro avg       0.73      0.54      0.57     51535
  weighted avg       0.83      0.83      0.82     51535

