In [1]:
import pandas as pd
import numpy as np

In [2]:
# Path of the combined CICDDoS2019 CSV that the load step reads.
INPUT_FILE = "CICDDoS2019-combined.csv"
# Path the cleaned, binary-labelled frame is written to at the end of preprocessing.
OUTPUT_FILE = "CICDDoS2019-SVM-ready.csv"

In [3]:
# 1. LOAD COMBINED 2019 DATA
# Read the whole CSV into memory, then preview its shape and both ends of
# the column list.
print(f"Loading {INPUT_FILE} ...")
df = pd.read_csv(INPUT_FILE)

all_columns = df.columns.tolist()
print("Original shape:", df.shape)
print("First 5 columns:", all_columns[:5])
print("Last 5 columns:", all_columns[-5:])

Loading CICDDoS2019-combined.csv ...
Original shape: (431371, 78)
First 5 columns: ['Protocol', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Fwd Packets Length Total']
Last 5 columns: ['Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min', 'Label']


In [4]:
# 2. NORMALIZE COLUMN NAMES (align with 2017/2018)
#
# Maps the long column names used in this file to the abbreviated names.
# DataFrame.rename ignores keys that are not present, so the mapping is
# checked against the frame first and unmatched entries are reported.

rename_map = {
    # Counts
    "Total Fwd Packets": "Tot Fwd Pkts",
    "Total Backward Packets": "Tot Bwd Pkts",

    # Length totals
    "Fwd Packets Length Total": "TotLen Fwd Pkts",
    "Bwd Packets Length Total": "TotLen Bwd Pkts",

    # Forward packet length stats
    "Fwd Packet Length Max": "Fwd Pkt Len Max",
    "Fwd Packet Length Min": "Fwd Pkt Len Min",
    "Fwd Packet Length Mean": "Fwd Pkt Len Mean",
    "Fwd Packet Length Std": "Fwd Pkt Len Std",

    # Backward packet length stats
    "Bwd Packet Length Max": "Bwd Pkt Len Max",
    "Bwd Packet Length Min": "Bwd Pkt Len Min",
    "Bwd Packet Length Mean": "Bwd Pkt Len Mean",
    "Bwd Packet Length Std": "Bwd Pkt Len Std",

    # Flow-level rates
    "Flow Bytes/s": "Flow Byts/s",
    "Flow Packets/s": "Flow Pkts/s",

    # IAT totals
    "Fwd IAT Total": "Fwd IAT Tot",
    "Bwd IAT Total": "Bwd IAT Tot",

    # Header lengths
    "Fwd Header Length": "Fwd Header Len",
    "Bwd Header Length": "Bwd Header Len",

    # Packets/s
    "Fwd Packets/s": "Fwd Pkts/s",
    "Bwd Packets/s": "Bwd Pkts/s",

    # Packet length stats
    "Packet Length Min": "Pkt Len Min",
    "Packet Length Max": "Pkt Len Max",
    "Packet Length Mean": "Pkt Len Mean",
    "Packet Length Std": "Pkt Len Std",
    "Packet Length Variance": "Pkt Len Var",

    # Flag counts
    "FIN Flag Count": "FIN Flag Cnt",
    "SYN Flag Count": "SYN Flag Cnt",
    "RST Flag Count": "RST Flag Cnt",
    "PSH Flag Count": "PSH Flag Cnt",
    "ACK Flag Count": "ACK Flag Cnt",
    "URG Flag Count": "URG Flag Cnt",
    "ECE Flag Count": "ECE Flag Cnt",

    # Packet size / subflow
    "Avg Packet Size": "Pkt Size Avg",
    "Subflow Fwd Packets": "Subflow Fwd Pkts",
    "Subflow Fwd Bytes": "Subflow Fwd Byts",
    "Subflow Bwd Packets": "Subflow Bwd Pkts",
    "Subflow Bwd Bytes": "Subflow Bwd Byts",

    # Window bytes
    "Init Fwd Win Bytes": "Init Fwd Win Byts",
    "Init Bwd Win Bytes": "Init Bwd Win Byts",

    # Forward active data
    "Fwd Act Data Packets": "Fwd Act Data Pkts",

    # Active / Idle (names already aligned, but we keep them here for clarity;
    # they also take part in the presence check below)
    "Active Mean": "Active Mean",
    "Active Std": "Active Std",
    "Active Max": "Active Max",
    "Active Min": "Active Min",
    "Idle Mean": "Idle Mean",
    "Idle Std": "Idle Std",
    "Idle Max": "Idle Max",
    "Idle Min": "Idle Min",
}

# An entry is unmatched when neither its source name nor its target name is
# a column of df. Accepting the target name keeps the check quiet when this
# cell is re-run on an already-renamed frame.
unmatched_columns = [
    old_name
    for old_name, new_name in rename_map.items()
    if old_name not in df.columns and new_name not in df.columns
]
if unmatched_columns:
    print("WARNING: rename_map entries not found in the data:", unmatched_columns)

df = df.rename(columns=rename_map)
print("Columns after rename (first 10):", df.columns[:10].tolist())

Columns after rename (first 10): ['Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std']


In [5]:
# 3. BINARY LABEL: 0 = BENIGN, 1 = ATTACK


def binarize_label(x):
    """Map a raw traffic label to 0 (benign) or 1 (attack).

    The value is converted with ``str()``, upper-cased and stripped of
    surrounding whitespace before comparison, so matching ignores case
    and padding.

    Parameters
    ----------
    x : any
        Raw label value; it is stringified, so non-string input is accepted.

    Returns
    -------
    int
        0 when the normalized label is "BENIGN", "NORMAL" or
        "NORMAL TRAFFIC"; 1 for every other value.

    Notes
    -----
    Missing values are not special-cased: ``None`` and ``NaN`` stringify to
    "NONE" / "NAN" and therefore come back as 1.
    """
    benign_names = ("BENIGN", "NORMAL", "NORMAL TRAFFIC")
    normalized = str(x).upper().strip()
    # Anything outside the benign list is an attack type (MSSQL, LDAP, UDPLag, etc.)
    return 0 if normalized in benign_names else 1

# Stop here when the label column is absent, before any labels are rewritten.
label_column_present = "Label" in df.columns
if not label_column_present:
    raise ValueError("No 'Label' column found in the 2019 dataset!")

# Replace each raw class name with its 0/1 code.
df["Label"] = df["Label"].apply(binarize_label)

label_counts = df["Label"].value_counts()
print("\nBinary label counts (0=BENIGN, 1=ATTACK):")
print(label_counts)


Binary label counts (0=BENIGN, 1=ATTACK):
Label
1    333540
0     97831
Name: count, dtype: int64


In [6]:
# 4. CONVERT FEATURES TO NUMERIC

# Every column except the label is coerced to a numeric dtype; values that
# cannot be parsed become NaN.
feature_columns = [name for name in df.columns if name != "Label"]
for name in feature_columns:
    df[name] = pd.to_numeric(df[name], errors="coerce")

# Turn +/- infinity into NaN, then fill every NaN with 0
# (SVM cannot handle NaNs).
df = df.replace([np.inf, -np.inf], np.nan).fillna(0)


In [7]:
# 5. FINAL SANITY CHECKS
dtype_tail = df.dtypes.tail()
print("\nFinal dtypes (tail):")
print(dtype_tail)

remaining_nans = df.isna().sum().sum()
print("\nAny NaNs left?:", remaining_nans)
print("Final shape:", df.shape)

# Cast the label column to an integer dtype before writing.
df["Label"] = df["Label"].astype(int)

# 6. SAVE SVM-READY FILE
# index=False keeps the row index out of the CSV.
df.to_csv(OUTPUT_FILE, index=False)
print(f"\nSVM-ready 2019 dataset saved as: {OUTPUT_FILE}")


Final dtypes (tail):
Idle Mean    float64
Idle Std     float64
Idle Max     float64
Idle Min     float64
Label          int64
dtype: object

Any NaNs left?: 0
Final shape: (431371, 78)

SVM-ready 2019 dataset saved as: CICDDoS2019-SVM-ready.csv


In [None]:
# Read the saved file back and check its shape, label balance and dtypes.
# The path comes from OUTPUT_FILE, not a repeated literal, so this check
# always reads the file the save step wrote.
df = pd.read_csv(OUTPUT_FILE)

print("Shape:", df.shape)
print("\nLabel counts:")
print(df["Label"].value_counts())
print("\nDtypes (tail):")
print(df.dtypes.tail())