In [30]:
import pandas as pd

# Load the dataset into a DataFrame. The relative path is resolved against the
# kernel's current working directory. (The file name suggests the data was
# cleaned in an earlier step -- that step is not shown here.)
df = pd.read_csv(filepath_or_buffer='cleaned_data.csv')

In [31]:
from sklearn.model_selection import train_test_split

# Partition the column names in a single pass: anything whose name begins with
# 'attack_cat' is treated as a label column, everything else as a feature.
label_columns = []
feature_columns = []
for col in df.columns:
    if col.startswith('attack_cat'):
        label_columns.append(col)
    else:
        feature_columns.append(col)

# Features (X) and labels (y) are both kept as DataFrames, so y stays 2-D
# even when only one label column matches.
X = df[feature_columns]
y = df[label_columns]

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes as a quick sanity check on the split.
print("Training set shape (X_train, y_train):", X_train.shape, y_train.shape)
print("Testing set shape (X_test, y_test):", X_test.shape, y_test.shape)

Training set shape (X_train, y_train): (744000, 40) (744000, 1)
Testing set shape (X_test, y_test): (186000, 40) (186000, 1)


In [32]:
# Import necessary libraries for data handling, analysis, and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import scikit-learn modules for preprocessing, model selection, and metrics
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Import the classifier models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [33]:
# Instantiate the K-Nearest Neighbors classifier
# n_neighbors=5 is a common starting point
knn = KNeighborsClassifier(n_neighbors=5)

# Train the model on the original, unscaled training data.
# y_train is a single-column DataFrame, i.e. a column vector of shape
# (n_samples, 1). Passing it as-is makes scikit-learn emit a
# DataConversionWarning and reshape it internally, so flatten it to the 1D
# array that fit() expects -- the same approach the Random Forest cell uses.
# NOTE(review): KNN is distance-based, so on unscaled data the features with
# the largest numeric ranges dominate the neighbour search. StandardScaler is
# already imported if a scaled variant is wanted for comparison.
knn.fit(X_train, y_train.values.ravel())

# Make predictions on the unscaled test data
y_pred_knn = knn.predict(X_test)

# Evaluate the model
print("K-Nearest Neighbors Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_knn):.4f}")
print(f"Weighted F1-Score: {f1_score(y_test, y_pred_knn, average='weighted'):.4f}")
print("\nClassification Report:")
# Set zero_division=0 to avoid warnings for labels with no predicted samples
print(classification_report(y_test, y_pred_knn, zero_division=0))

  return self._fit(X, y)


K-Nearest Neighbors Performance:
Accuracy: 0.8576
Weighted F1-Score: 0.8593

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.99      0.98     18457
           1       0.67      0.80      0.73     18531
           2       0.58      0.68      0.63     18480
           3       0.74      0.61      0.67     18468
           4       0.83      0.85      0.84     18889
           5       1.00      0.98      0.99     18747
           6       0.96      0.84      0.90     18578
           7       0.93      0.84      0.88     18564
           8       0.97      0.99      0.98     18785
           9       0.99      0.99      0.99     18501

    accuracy                           0.86    186000
   macro avg       0.87      0.86      0.86    186000
weighted avg       0.87      0.86      0.86    186000



In [34]:
# Build a depth-limited Decision Tree. Capping max_depth restricts how far the
# tree can grow, which helps to prevent overfitting; random_state keeps the
# fitted tree reproducible.
tree = DecisionTreeClassifier(max_depth=10, random_state=42)

# Fit on the original, unscaled training data, then predict the held-out rows.
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)

# Compute every metric up front so the reporting below is plain printing.
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_weighted_f1 = f1_score(y_test, y_pred_tree, average='weighted')
tree_report = classification_report(y_test, y_pred_tree)

# Report the results
print("Decision Tree Performance:")
print(f"Accuracy: {tree_accuracy:.4f}")
print(f"Weighted F1-Score: {tree_weighted_f1:.4f}")
print("\nClassification Report:")
print(tree_report)

Decision Tree Performance:
Accuracy: 0.7540
Weighted F1-Score: 0.7546

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18457
           1       0.62      0.34      0.44     18531
           2       0.38      0.70      0.50     18480
           3       0.79      0.44      0.57     18468
           4       0.65      0.78      0.71     18889
           5       1.00      0.97      0.99     18747
           6       0.99      0.74      0.85     18578
           7       0.78      0.76      0.77     18564
           8       0.77      0.81      0.79     18785
           9       0.89      0.98      0.93     18501

    accuracy                           0.75    186000
   macro avg       0.79      0.75      0.75    186000
weighted avg       0.79      0.75      0.75    186000



In [35]:
# Random Forest with 100 trees (n_estimators); n_jobs=-1 spreads the work over
# all available cores and random_state fixes the randomness for repeatability.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# fit() expects a 1D target, so the single-column y_train DataFrame is
# flattened to a 1D array before training.
rf.fit(X_train, y_train.to_numpy().ravel())

# Predict labels for the held-out test rows
y_pred_rf = rf.predict(X_test)

# Emit the whole evaluation in one print call; sep="\n" puts each piece on
# its own line, matching a sequence of separate print statements.
print(
    "Random Forest Performance:",
    f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}",
    f"Weighted F1-Score: {f1_score(y_test, y_pred_rf, average='weighted'):.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred_rf),
    sep="\n",
)

Random Forest Performance:
Accuracy: 0.8756
Weighted F1-Score: 0.8777

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18457
           1       0.75      0.74      0.74     18531
           2       0.57      0.87      0.69     18480
           3       0.89      0.59      0.71     18468
           4       0.85      0.85      0.85     18889
           5       1.00      0.98      0.99     18747
           6       0.97      0.88      0.92     18578
           7       0.93      0.85      0.89     18564
           8       0.97      1.00      0.98     18785
           9       1.00      1.00      1.00     18501

    accuracy                           0.88    186000
   macro avg       0.89      0.88      0.88    186000
weighted avg       0.89      0.88      0.88    186000

