In [3]:
# This notebook trains and tests a linear SVM on CICIDS2018 only.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline


In [4]:
# Load the cleaned dataset and keep a fixed-size random sample of it.

N_SAMPLES = 200_000  # rows kept for the experiment; adjust up/down as needed
SAMPLE_SEED = 42     # fixed seed so the same rows are drawn on every run

df = pd.read_csv("CICIDS2018-SVM-ready.csv")
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the size of the file.
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SAMPLE_SEED)

# Separate the feature matrix from the target column.
X = df.drop(columns=["Label"])
y = df["Label"]

In [5]:
# META columns (per the original note: same list as 2017).

META_COLS = [
    "Dst Port",
    "Init Fwd Win Byts",
    "Init Bwd Win Byts",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Subflow Fwd Byts",
    "Active Mean",
    "Active Max",
    "Active Min",
    "Idle Mean",
    "Idle Max",
    "Idle Min",
]

# Two feature sets: every column (META) vs. every column except META_COLS
# (NO-META). errors="ignore" means names absent from X are skipped instead of
# raising.
X_nometa = X.drop(columns=META_COLS, errors="ignore")
X_meta = X.copy()

# Report the width of each feature matrix.
for caption, frame in (
    ("Original feature count:", X),
    ("META feature count:", X_meta),
    ("NO-META feature count:", X_nometa),
):
    print(caption, frame.shape[1])

Original feature count: 79
META feature count: 79
NO-META feature count: 67


In [6]:
# Train-test split

def split_data(X, y, test_size=0.2, random_state=42):
    """Return a stratified train/test split of ``X`` and ``y``.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split``.
    y : labels aligned with ``X``; also passed as ``stratify`` so the split is
        stratified on the label values.
    test_size : forwarded to ``train_test_split``. Defaults to 0.2.
    random_state : seed forwarded to ``train_test_split``. Defaults to 42.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as produced by
        ``train_test_split``.
    """
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

# Split once, then derive the NO-META partitions from the very same rows.
# Selecting the NO-META columns out of the already-split frames guarantees the
# NO-META features stay aligned with y_train / y_test, instead of relying on a
# second split call reproducing an identical shuffle (and discarding its labels).
X_train_meta, X_test_meta, y_train, y_test = split_data(X_meta, y)
X_train_nometa = X_train_meta[X_nometa.columns]
X_test_nometa = X_test_meta[X_nometa.columns]

In [7]:
# Build SVM model

def build_svm(random_state=42):
    """Create an unfitted scaler + linear SVM pipeline.

    Parameters
    ----------
    random_state : seed forwarded to ``LinearSVC``. Defaults to 42, matching
        the seed used for sampling and splitting elsewhere in this notebook.

    Returns
    -------
    Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"svm"``
        (``LinearSVC`` with ``class_weight="balanced"`` and ``max_iter=5000``).
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("svm", LinearSVC(
            class_weight="balanced",
            max_iter=5000,
            # Per the scikit-learn docs, the dual solver shuffles the data with
            # a pseudo-random generator; seeding it makes repeated runs give the
            # same model. The seed has no effect when dual=False.
            random_state=random_state,
        )),
    ])

In [8]:
# Fit on the full (META) feature set, then score the held-out rows.

svm_meta = build_svm().fit(X_train_meta, y_train)  # Pipeline.fit returns the pipeline itself
pred_meta = svm_meta.predict(X_test_meta)

cm_meta = confusion_matrix(y_test, pred_meta)
report_meta = classification_report(y_test, pred_meta, digits=4)

# Banner, confusion matrix, then the per-class report.
print(
    "\n========================",
    "=== WITH META RESULTS ===",
    "========================",
    sep="\n",
)
print(cm_meta)
print(report_meta)


=== WITH META RESULTS ===
[[28795  5377]
 [ 1429  4399]]
              precision    recall  f1-score   support

           0     0.9527    0.8426    0.8943     34172
           1     0.4500    0.7548    0.5638      5828

    accuracy                         0.8298     40000
   macro avg     0.7013    0.7987    0.7291     40000
weighted avg     0.8795    0.8298    0.8462     40000



In [9]:
# Fit on the reduced (NO-META) feature set, then score the held-out rows.

svm_nometa = build_svm().fit(X_train_nometa, y_train)  # Pipeline.fit returns the pipeline itself
pred_nometa = svm_nometa.predict(X_test_nometa)

cm_nometa = confusion_matrix(y_test, pred_nometa)
report_nometa = classification_report(y_test, pred_nometa, digits=4)

# Banner, confusion matrix, then the per-class report.
print(
    "\n========================",
    "=== NO META RESULTS ===",
    "========================",
    sep="\n",
)
print(cm_nometa)
print(report_nometa)


=== NO META RESULTS ===
[[28211  5961]
 [ 1409  4419]]
              precision    recall  f1-score   support

           0     0.9524    0.8256    0.8845     34172
           1     0.4257    0.7582    0.5453      5828

    accuracy                         0.8157     40000
   macro avg     0.6891    0.7919    0.7149     40000
weighted avg     0.8757    0.8157    0.8350     40000

