In [1]:
import pandas as pd
import numpy as np

In [2]:
# 1. Normalize Column Names

def normalize_file(INPUT_FILE, OUTPUT_FILE="CIC-IDS2017-normalized.csv"):
    """Rename long column headers to their abbreviated forms and save a new CSV.

    The file is read with ``pd.read_csv``, header names are stripped of
    surrounding whitespace, columns listed in the rename map are renamed
    (``"Attack Type"`` becomes ``"Label"``), and the result is written to
    ``OUTPUT_FILE`` without the index. Columns that are not in the map keep
    their (stripped) names.

    Parameters
    ----------
    INPUT_FILE : str or path-like
        CSV file to read.
    OUTPUT_FILE : str or path-like, optional
        Destination CSV path.

    Returns
    -------
    The ``OUTPUT_FILE`` argument, unchanged, so the caller can reload the file.
    """

    df = pd.read_csv(INPUT_FILE)

    rename_map = {
        "Destination Port": "Dst Port",
        "Total Fwd Packets": "Tot Fwd Pkts",
        "Total Length of Fwd Packets": "TotLen Fwd Pkts",
        "Fwd Packet Length Max": "Fwd Pkt Len Max",
        "Fwd Packet Length Min": "Fwd Pkt Len Min",
        "Fwd Packet Length Mean": "Fwd Pkt Len Mean",
        "Fwd Packet Length Std": "Fwd Pkt Len Std",
        "Bwd Packet Length Max": "Bwd Pkt Len Max",
        "Bwd Packet Length Min": "Bwd Pkt Len Min",
        "Bwd Packet Length Mean": "Bwd Pkt Len Mean",
        "Bwd Packet Length Std": "Bwd Pkt Len Std",
        "Flow Bytes/s": "Flow Byts/s",
        "Flow Packets/s": "Flow Pkts/s",
        "Fwd IAT Total": "Fwd IAT Tot",
        "Bwd IAT Total": "Bwd IAT Tot",
        "Fwd Header Length": "Fwd Header Len",
        "Bwd Header Length": "Bwd Header Len",
        "Fwd Packets/s": "Fwd Pkts/s",
        "Bwd Packets/s": "Bwd Pkts/s",
        "Min Packet Length": "Pkt Len Min",
        "Max Packet Length": "Pkt Len Max",
        "Packet Length Mean": "Pkt Len Mean",
        "Packet Length Std": "Pkt Len Std",
        "Packet Length Variance": "Pkt Len Var",
        "FIN Flag Count": "FIN Flag Cnt",
        "PSH Flag Count": "PSH Flag Cnt",
        "ACK Flag Count": "ACK Flag Cnt",
        "Average Packet Size": "Pkt Size Avg",
        "Subflow Fwd Bytes": "Subflow Fwd Byts",
        "Init_Win_bytes_forward": "Init Fwd Win Byts",
        "Init_Win_bytes_backward": "Init Bwd Win Byts",
        "act_data_pkt_fwd": "Fwd Act Data Pkts",
        "min_seg_size_forward": "Fwd Seg Size Min",
        "Attack Type": "Label",
    }

    # The rename map is exact-match: a header padded with spaces
    # (e.g. " Destination Port") would miss its key and stay unrenamed,
    # so trim the headers before looking them up.
    df.columns = df.columns.str.strip()

    df = df.rename(columns=rename_map)
    df.to_csv(OUTPUT_FILE, index=False)

    print(f"Saved normalized file to {OUTPUT_FILE}")
    return OUTPUT_FILE



In [3]:
# 2. Label Mapping Function

def map_labels(df):
    """Collapse raw attack labels into a small set of upper-case classes.

    Values of the ``"Label"`` column that appear in the mapping below are
    replaced by their class name; any other value is left as it is. Rows whose
    label is the literal string ``"Label"`` (a header row repeated inside the
    data) are removed.

    The input frame is not modified; a new, independent DataFrame is returned,
    keeping the original index values of the surviving rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``"Label"`` column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with mapped labels and header-junk rows removed.
    """

    mapping = {
        "Benign": "BENIGN",
        "Normal Traffic": "BENIGN",
        "Bot": "BOT",
        "Bots": "BOT",
        "Infiltration": "INFILTRATION",
        "Brute Force -Web": "BRUTEFORCE",
        "Brute Force -XSS": "BRUTEFORCE",
        "Brute Force": "BRUTEFORCE",
        "SQL Injection": "WEB ATTACK",
        "Web Attacks": "WEB ATTACK",
        "DoS": "DOS",
        "Syn": "DOS",
        "UDP": "DOS",
        "UDP-lag": "DOS",
        "UDPLag": "DOS",
        "DDoS": "DDOS",
        "DrDoS_NTP": "DDOS",
        "DrDoS_UDP": "DDOS",
        "DrDoS_MSSQL": "DDOS",
        "DrDoS_DNS": "DDOS",
        "DrDoS_SNMP": "DDOS",
        "DrDoS_LDAP": "DDOS",
        "DrDoS_NetBIOS": "DDOS",
        "TFTP": "DDOS",
        "MSSQL": "DDOS",
        "WebDDoS": "DDOS",
        "Port Scanning": "PORTSCAN",
        "Portmap": "PORTSCAN",
        "NetBIOS": "OTHER"
    }

    # assign() builds a new frame, so the caller's DataFrame keeps its raw
    # labels instead of being overwritten as a side effect.
    relabelled = df.assign(Label=df["Label"].replace(mapping))

    # Remove header junk. The explicit copy detaches the result from the
    # filtered parent so later column assignments do not raise
    # SettingWithCopyWarning.
    return relabelled[relabelled["Label"] != "Label"].copy()


In [4]:
# 3. SVM-Ready Cleaning

def make_svm_ready(df, OUTPUT_FILE="CICIDS2017-SVM-ready.csv"):
    """Turn a labelled flow table into an all-numeric frame and save it as CSV.

    Steps, in order:

    1. ``"Label"`` becomes binary: 0 when the label equals ``"BENIGN"``
       (case-insensitive, after ``str()``), 1 for everything else.
    2. Text columns are converted with ``pd.to_numeric(errors="coerce")``.
       A text column that yields at least one number is kept (unparseable
       cells become NaN); every other non-numeric column is dropped.
    3. ``inf``, ``-inf`` and NaN are replaced by 0.
    4. The frame is written to ``OUTPUT_FILE`` without the index.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``"Label"`` column plus feature columns.
    OUTPUT_FILE : str or path-like, optional
        Destination CSV path.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame that was written to disk.
    """

    # Binary labels. assign() returns a new frame, so the caller's DataFrame
    # keeps its original string labels.
    df = df.assign(
        Label=df["Label"].apply(lambda x: 0 if str(x).upper() == "BENIGN" else 1)
    )
    print("Binary labels:\n", df["Label"].value_counts())

    # Try to convert text columns BEFORE deciding what to drop: a column read
    # as text but holding numbers (e.g. "80") is a feature worth keeping.
    candidates = [
        c for c in df.select_dtypes(exclude=[np.number]).columns if c != "Label"
    ]
    recovered = {}
    non_numeric = []
    for col in candidates:
        series = df[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            as_number = pd.to_numeric(series, errors="coerce")
            if as_number.notna().any():
                recovered[col] = as_number
                continue
        non_numeric.append(col)

    # Drop non-numeric columns except Label
    if non_numeric:
        print("Dropping non-numeric:", non_numeric)
        df = df.drop(columns=non_numeric)

    # Overwriting an existing column keeps its position in the frame.
    for col, as_number in recovered.items():
        df[col] = as_number

    # Replace NaN/inf
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)

    df.to_csv(OUTPUT_FILE, index=False)
    print(f"SVM-ready dataset saved as: {OUTPUT_FILE}")

    return df

In [5]:
# Run the whole pipeline: rewrite the CSV with normalized headers, reload it,
# collapse the attack labels, then binarize/clean and save the SVM-ready file.
normalized = normalize_file("cleaned_cicids2017_by_burhan.csv")
df = make_svm_ready(map_labels(pd.read_csv(normalized)))

Saved normalized file to CIC-IDS2017-normalized.csv
Binary labels:
 Label
0    2095057
1     425694
Name: count, dtype: int64
SVM-ready dataset saved as: CICIDS2017-SVM-ready.csv
