In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# ============================================
# CONFIG: META COLUMNS (SAME ACROSS DATASETS)
# ============================================
# Columns removed from the feature matrix for the "NO META" run of
# run_cross_svm. Names absent from a dataset are skipped there, because the
# drop is performed with errors="ignore".
META_COLS = [
    "Dst Port",
    "Init Fwd Win Byts",
    "Init Bwd Win Byts",
    "Fwd Act Data Pkts",
    "Fwd Seg Size Min",
    "Subflow Fwd Byts",
    # "Active *" columns
    "Active Mean",
    "Active Max",
    "Active Min",
    # "Idle *" columns
    "Idle Mean",
    "Idle Max",
    "Idle Min",
]

# ============================================
# HELPER: BUILD SVM PIPELINE
# ============================================
def build_svm(random_state=42):
    """Build the unfitted scaling + linear-SVM pipeline used by each run.

    Parameters
    ----------
    random_state : int or None, default=42
        Seed forwarded to ``LinearSVC``. Per the scikit-learn documentation
        the solver may shuffle the data pseudo-randomly, so fixing the seed
        keeps repeated runs of the notebook comparable. Pass ``None`` to get
        the previous unseeded behavior.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``"scaler"`` (``StandardScaler``) followed by ``"svm"``
        (``LinearSVC`` with ``class_weight="balanced"`` and
        ``max_iter=5000``).
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("svm", LinearSVC(
            class_weight="balanced",
            max_iter=5000,
            random_state=random_state,
        )),
    ])

# ============================================
# HELPER: RUN ONE CROSS-DATASET EXPERIMENT
# ============================================
def _fit_and_report(tag, X_train, y_train, X_test, y_test):
    """Fit a fresh SVM pipeline on one feature set and print its test metrics.

    ``tag`` is the label used in the printed headers (e.g. "WITH META").
    Raises ``ValueError`` if the feature set has no columns.
    """
    print(f"\n--- {tag} ---")
    print(f"Feature count ({tag}): {X_train.shape[1]}")

    # Fail with a readable message instead of an opaque error from inside
    # the scaler when the two datasets share no usable feature columns.
    if X_train.shape[1] == 0:
        raise ValueError(
            f"No feature columns available for the {tag} run; "
            "the train and test files share no usable columns."
        )

    model = build_svm()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    print(f"\n[{tag}] Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print(f"\n[{tag}] Classification Report:")
    print(classification_report(y_test, predictions, digits=4))


def run_cross_svm(
    train_file,
    test_file,
    train_name,
    test_name,
    sample_train=200000,
    sample_test=200000,
    label_col="Label"
):
    """Train on one dataset, evaluate on another, with and without META_COLS.

    Both CSV files are loaded, optionally down-sampled, and restricted to the
    feature columns they have in common (sorted by name). Two pipelines from
    ``build_svm`` are then trained and evaluated on the test file: one on all
    shared features ("WITH META") and one with ``META_COLS`` removed
    ("NO META"). Confusion matrices and classification reports are printed.

    Parameters
    ----------
    train_file, test_file : str or path-like
        CSV files passed to ``pandas.read_csv``.
    train_name, test_name : str
        Display names used only in the printed banners.
    sample_train, sample_test : int or None, default=200000
        Row cap for each dataset. A dataset is sampled (``random_state=42``)
        only when it has more rows than the cap; ``None`` disables sampling.
    label_col : str, default="Label"
        Name of the target column in both files.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    KeyError
        If ``label_col`` is missing from either file.
    ValueError
        If a run ends up with zero feature columns.
    """
    print("\n========================================")
    print(f"TRAIN: {train_name}  →  TEST: {test_name}")
    print("========================================")

    # ---------- LOAD ----------
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    # Name the offending dataset up front rather than failing later in drop().
    for role, dataset, frame in (
        ("train", train_name, train_df),
        ("test", test_name, test_df),
    ):
        if label_col not in frame.columns:
            raise KeyError(
                f"Label column {label_col!r} not found in {role} dataset "
                f"{dataset!r}"
            )

    # Optional sampling for speed
    if sample_train is not None and len(train_df) > sample_train:
        train_df = train_df.sample(sample_train, random_state=42)
    if sample_test is not None and len(test_df) > sample_test:
        test_df = test_df.sample(sample_test, random_state=42)

    print(f"Train shape after sampling: {train_df.shape}")
    print(f"Test shape after sampling:  {test_df.shape}")

    # ---------- SPLIT FEATURES / LABEL ----------
    y_train = train_df[label_col]
    y_test = test_df[label_col]
    train_features = train_df.drop(columns=[label_col])
    test_features = test_df.drop(columns=[label_col])

    # ---------- ALIGN COLUMNS (IMPORTANT!) ----------
    # Keep only columns present in both datasets, in one fixed (sorted) order
    # so the fitted pipeline sees the same layout at predict time.
    common_cols = sorted(set(train_features.columns) & set(test_features.columns))
    X_train_full = train_features[common_cols]
    X_test_full = test_features[common_cols]

    print(f"Common feature count (before removing meta): {len(common_cols)}")

    # ---------- WITH META ----------
    _fit_and_report("WITH META", X_train_full, y_train, X_test_full, y_test)

    # ---------- NO META ----------
    # Both frames already hold identical, identically ordered columns, so
    # dropping the same names from each keeps them aligned.
    X_train_nometa = X_train_full.drop(columns=META_COLS, errors="ignore")
    X_test_nometa = X_test_full.drop(columns=META_COLS, errors="ignore")
    _fit_and_report("NO META", X_train_nometa, y_train, X_test_nometa, y_test)

    print("\n========================================")
    print(f"END: TRAIN {train_name}  →  TEST {test_name}")
    print("========================================\n")


# ============================================
# RUN ALL FOUR CROSS-TESTS
# ============================================
# Input CSV paths. The TRAIN_* and TEST_* names for a given year point at
# the same file; which role a file plays is decided per run below.
TRAIN_2017 = "CICIDS2017-SVM-ready.csv"
TRAIN_2018 = "CICIDS2018-SVM-ready.csv"
TEST_2017  = "CICIDS2017-SVM-ready.csv"
TEST_2018  = "CICIDS2018-SVM-ready.csv"
TEST_2019  = "CICDDoS2019-SVM-ready.csv"

# One row per experiment: (train file, test file, train name, test name).
CROSS_DATASET_RUNS = [
    (TRAIN_2017, TEST_2018, "CICIDS2017", "CICIDS2018"),   # 1) 2017 → 2018
    (TRAIN_2017, TEST_2019, "CICIDS2017", "CICDDoS2019"),  # 2) 2017 → 2019
    (TRAIN_2018, TEST_2017, "CICIDS2018", "CICIDS2017"),   # 3) 2018 → 2017
    (TRAIN_2018, TEST_2019, "CICIDS2018", "CICDDoS2019"),  # 4) 2018 → 2019
]

for train_path, test_path, train_tag, test_tag in CROSS_DATASET_RUNS:
    run_cross_svm(
        train_file=train_path,
        test_file=test_path,
        train_name=train_tag,
        test_name=test_tag,
    )



TRAIN: CICIDS2017  →  TEST: CICIDS2018
Train shape after sampling: (200000, 53)
Test shape after sampling:  (200000, 80)
Common feature count (before removing meta): 52

--- WITH META ---
Feature count (WITH META): 52

[WITH META] Confusion Matrix:
[[149498  21360]
 [ 26520   2622]]

[WITH META] Classification Report:
              precision    recall  f1-score   support

           0     0.8493    0.8750    0.8620    170858
           1     0.1093    0.0900    0.0987     29142

    accuracy                         0.7606    200000
   macro avg     0.4793    0.4825    0.4803    200000
weighted avg     0.7415    0.7606    0.7508    200000


--- NO META ---
Feature count (NO META): 40

[NO META] Confusion Matrix:
[[142959  27899]
 [ 26252   2890]]

[NO META] Classification Report:
              precision    recall  f1-score   support

           0     0.8449    0.8367    0.8408    170858
           1     0.0939    0.0992    0.0964     29142

    accuracy                         0.7292  