In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


In [2]:
# Load the cleaned dataset and keep a fixed-seed random subsample of it.

SAMPLE_SIZE = 200_000  # number of rows to keep; adjust up/down

df = pd.read_csv("CICIDS2017-SVM-ready.csv")
# Cap the request at the number of rows actually loaded: DataFrame.sample
# raises ValueError when asked for more rows than exist (without replacement).
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
X = df.drop(columns=["Label"])  # every column except the target
y = df["Label"]                 # target column

In [3]:
# META COLUMNS (2017 dataset-specific)
# These are the columns removed to build the NO-META feature set.

META_COLS = [
    "Dst Port",
    "Init Fwd Win Byts",
    "Init Bwd Win Byts",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Subflow Fwd Byts",
    "Active Mean", "Active Max", "Active Min",
    "Idle Mean", "Idle Max", "Idle Min"
]

# The drop below uses errors="ignore", which skips names that are not in X
# without complaint. Report them explicitly: a misspelled name would
# otherwise leave a META column inside the NO-META feature set unnoticed.
missing_meta_cols = [col for col in META_COLS if col not in X.columns]
if missing_meta_cols:
    print("WARNING: META columns not found in X:", missing_meta_cols)

# Split into META vs NO-META feature sets
X_meta = X.copy()                # keep all features
X_nometa = X.drop(columns=META_COLS, errors="ignore")  # remove metadata

print("Original feature count:", X.shape[1])
print("META feature count:", X_meta.shape[1])
print("NO-META feature count:", X_nometa.shape[1])

Original feature count: 52
META feature count: 52
NO-META feature count: 40


In [4]:
# Train-test split

def split_data(X, y, test_size=0.2, random_state=42):
    """Split features and labels into stratified train/test partitions.

    Parameters
    ----------
    X : feature table handed to ``train_test_split``.
    y : labels for ``X``; also used as the stratification target.
    test_size : forwarded to ``train_test_split`` (default 0.2).
    random_state : seed forwarded to ``train_test_split`` (default 42).

    Returns
    -------
    Whatever ``train_test_split`` returns for two inputs, in the order
    ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y
    )

X_train_meta, X_test_meta, y_train, y_test = split_data(X_meta, y)
X_train_nometa, X_test_nometa, _, _ = split_data(X_nometa, y)

# The labels from the second split are discarded, so the NO-META frames are
# later paired with y_train / y_test from the first split. That is only valid
# if both calls selected the same rows in the same order; verify it instead
# of relying on the shared seed implicitly.
assert X_train_nometa.index.equals(y_train.index), \
    "NO-META training rows are not aligned with y_train"
assert X_test_nometa.index.equals(y_test.index), \
    "NO-META test rows are not aligned with y_test"

In [5]:
# Build SVM model

def build_svm(random_state=42, max_iter=5000):
    """Build an unfitted scaler + linear SVM pipeline.

    Parameters
    ----------
    random_state : seed passed to ``LinearSVC`` (default 42, the seed used
        for sampling and splitting elsewhere in this notebook) so that
        repeated runs build identically configured, seeded models.
    max_iter : iteration cap passed to ``LinearSVC`` (default 5000).

    Returns
    -------
    A new ``Pipeline`` with two steps: ``"scaler"`` (``StandardScaler``)
    followed by ``"svm"`` (``LinearSVC`` with ``class_weight="balanced"``).
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("svm", LinearSVC(
            class_weight="balanced",
            max_iter=max_iter,
            random_state=random_state
        ))
    ])

In [6]:
# WITH_META run: fit a fresh pipeline on the full feature set, predict the
# held-out rows, then print the confusion matrix and per-class report.

svm_meta = build_svm().fit(X_train_meta, y_train)
pred_meta = svm_meta.predict(X_test_meta)

cm_meta = confusion_matrix(y_test, pred_meta)
report_meta = classification_report(y_test, pred_meta, digits=4)

print(
    "\n========================",
    "=== WITH META RESULTS ===",
    "========================",
    sep="\n",
)
print(cm_meta)
print(report_meta)


=== WITH META RESULTS ===
[[31256  1968]
 [  236  6540]]
              precision    recall  f1-score   support

           0     0.9925    0.9408    0.9659     33224
           1     0.7687    0.9652    0.8558      6776

    accuracy                         0.9449     40000
   macro avg     0.8806    0.9530    0.9109     40000
weighted avg     0.9546    0.9449    0.9473     40000



In [7]:
# NO_META run: fit a fresh pipeline on the reduced feature set, predict the
# held-out rows, then print the confusion matrix and per-class report.

svm_nometa = build_svm().fit(X_train_nometa, y_train)
pred_nometa = svm_nometa.predict(X_test_nometa)

cm_nometa = confusion_matrix(y_test, pred_nometa)
report_nometa = classification_report(y_test, pred_nometa, digits=4)

print(
    "\n========================",
    "=== NO META RESULTS ===",
    "========================",
    sep="\n",
)
print(cm_nometa)
print(report_nometa)


=== NO META RESULTS ===
[[31247  1977]
 [  447  6329]]
              precision    recall  f1-score   support

           0     0.9859    0.9405    0.9627     33224
           1     0.7620    0.9340    0.8393      6776

    accuracy                         0.9394     40000
   macro avg     0.8739    0.9373    0.9010     40000
weighted avg     0.9480    0.9394    0.9418     40000

