In [7]:
import pandas as pd
import numpy as np

In [8]:
# 1. Combine raw 2018 CSV files

FILES_2018 = [
    "02-23-2018.csv",
    "02-28-2018.csv",
    "03-01-2018.csv",
    "03-02-2018.csv",
]

def load_2018(files):
    dfs = [pd.read_csv(f) for f in files]
    df = pd.concat(dfs, ignore_index=True)
    print("Combined 2018 shape:", df.shape)
    return df


In [9]:
# 2. Normalize column names

def normalize_2018_columns(df):
    """
    Makes 2018 column names consistent with 2017 naming wherever needed.
    If some columns don't exist, rename() just skips them.
    """

    rename_map = {
        "Destination Port": "Dst Port",
        "Total Fwd Packets": "Tot Fwd Pkts",
        "Total Backward Packets": "Tot Bwd Pkts",
        "Total Length of Fwd Packets": "TotLen Fwd Pkts",
        "Total Length of Bwd Packets": "TotLen Bwd Pkts",
        "Fwd Packet Length Max": "Fwd Pkt Len Max",
        "Fwd Packet Length Min": "Fwd Pkt Len Min",
        "Fwd Packet Length Mean": "Fwd Pkt Len Mean",
        "Fwd Packet Length Std": "Fwd Pkt Len Std",
        "Bwd Packet Length Max": "Bwd Pkt Len Max",
        "Bwd Packet Length Min": "Bwd Pkt Len Min",
        "Bwd Packet Length Mean": "Bwd Pkt Len Mean",
        "Bwd Packet Length Std": "Bwd Pkt Len Std",
        "Flow Bytes/s": "Flow Byts/s",
        "Flow Packets/s": "Flow Pkts/s",
        "Fwd IAT Total": "Fwd IAT Tot",
        "Bwd IAT Total": "Bwd IAT Tot",
        "Fwd Header Length": "Fwd Header Len",
        "Bwd Header Length": "Bwd Header Len",
        "Fwd Packets/s": "Fwd Pkts/s",
        "Bwd Packets/s": "Bwd Pkts/s",
        "Min Packet Length": "Pkt Len Min",
        "Max Packet Length": "Pkt Len Max",
        "Packet Length Mean": "Pkt Len Mean",
        "Packet Length Std": "Pkt Len Std",
        "Packet Length Variance": "Pkt Len Var",
        "FIN Flag Count": "FIN Flag Cnt",
        "PSH Flag Count": "PSH Flag Cnt",
        "ACK Flag Count": "ACK Flag Cnt",
        "Average Packet Size": "Pkt Size Avg",
        "Subflow Fwd Bytes": "Subflow Fwd Byts",
        "Init_Win_bytes_forward": "Init Fwd Win Byts",
        "Init_Win_bytes_backward": "Init Bwd Win Byts",
        "act_data_pkt_fwd": "Fwd Act Data Pkts",
        "min_seg_size_forward": "Fwd Seg Size Min",
        # Label is usually already "Label" for 2018, but just in case:
        "Attack Type": "Label",
    }

    df = df.rename(columns=rename_map)
    print("Columns after normalization (first 10):")
    print(list(df.columns)[:10])
    return df

In [10]:
# 3. Map fine-grained labels to coarse classes

def map_labels(df):
    mapping = {
        # BENIGN
        "Benign": "BENIGN",
        "Normal Traffic": "BENIGN",

        # BOT
        "Bot": "BOT",
        "Bots": "BOT",

        # INFILTRATION
        "Infiltration": "INFILTRATION",

        # BRUTE FORCE
        "Brute Force -Web": "BRUTEFORCE",
        "Brute Force -XSS": "BRUTEFORCE",
        "Brute Force": "BRUTEFORCE",

        # WEB ATTACKS
        "SQL Injection": "WEB ATTACK",
        "Web Attacks": "WEB ATTACK",

        # DOS
        "DoS": "DOS",
        "Syn": "DOS",
        "UDP": "DOS",
        "UDP-lag": "DOS",
        "UDPLag": "DOS",

        # DDOS
        "DDoS": "DDOS",
        "DrDoS_NTP": "DDOS",
        "DrDoS_UDP": "DDOS",
        "DrDoS_MSSQL": "DDOS",
        "DrDoS_DNS": "DDOS",
        "DrDoS_SNMP": "DDOS",
        "DrDoS_LDAP": "DDOS",
        "DrDoS_NetBIOS": "DDOS",
        "TFTP": "DDOS",
        "MSSQL": "DDOS",
        "WebDDoS": "DDOS",

        # PORT SCAN
        "Port Scanning": "PORTSCAN",
        "Portmap": "PORTSCAN",

        # OTHER (rare classes)
        "NetBIOS": "OTHER",
    }

    df["Label"] = df["Label"].replace(mapping)

    # Remove weird header rows if any
    df = df[df["Label"] != "Label"]
    return df

In [11]:
# 4. Make SVM-ready: numeric-only + binary labels

def make_svm_ready_2018(df, output_file="CICIDS2018-SVM-ready.csv"):
    # 1. Binary label: 0 = BENIGN, 1 = ATTACK
    df["Label"] = df["Label"].apply(lambda x: 0 if str(x).upper() == "BENIGN" else 1)

    print("Binary label distribution (2018):")
    print(df["Label"].value_counts())

    # 2. Convert all feature columns to numeric FIRST
    for col in df.columns:
        if col != "Label":
            df[col] = pd.to_numeric(df[col], errors="coerce")

    # 3. Now detect any leftover non-numeric columns (should be none, but just in case)
    non_numeric = df.select_dtypes(exclude=[np.number]).columns.tolist()
    non_numeric = [c for c in non_numeric if c != "Label"]

    if non_numeric:
        print("Dropping leftover non-numeric feature columns:", non_numeric)
        df = df.drop(columns=non_numeric)

    # 4. Replace inf / -inf and fill NaNs
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)

    print("Final dtypes (2018) — tail:")
    print(df.dtypes.tail())
    print("Any NaNs left?", df.isna().sum().sum())
    print("Final shape:", df.shape)

    df.to_csv(output_file, index=False)
    print(f"SVM-ready 2018 dataset saved as: {output_file}")
    return df


In [12]:
# 5. Run full pipeline

if __name__ == "__main__":
    df_2018 = load_2018(FILES_2018)
    df_2018 = normalize_2018_columns(df_2018)
    df_2018 = map_labels(df_2018)
    print("Label counts after mapping:")
    print(df_2018["Label"].value_counts())

    df_2018_svm = make_svm_ready_2018(df_2018)

  dfs = [pd.read_csv(f) for f in files]
  dfs = [pd.read_csv(f) for f in files]


Combined 2018 shape: (3041379, 80)
Columns after normalization (first 10):
['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min']
Label counts after mapping:
Label
BENIGN           2592630
BOT               286191
Infilteration     161934
BRUTEFORCE           513
WEB ATTACK            53
Name: count, dtype: int64
Binary label distribution (2018):
Label
0    2592630
1     448691
Name: count, dtype: int64
Final dtypes (2018) — tail:
Idle Mean    float64
Idle Std     float64
Idle Max     float64
Idle Min     float64
Label          int64
dtype: object
Any NaNs left? 0
Final shape: (3041321, 80)
SVM-ready 2018 dataset saved as: CICIDS2018-SVM-ready.csv
