In [None]:
# =========================
# Imports + Config
# =========================

import os

import numpy as np
import pandas as pd

RANDOM_SEED = 42
TARGET_COL = "Attack_type"
POSITIVE_CLASS = "DOS_SYN_Hping"   # 1 = DOS_SYN_Hping, 0 = everything else

def print_section(title):
    """Print `title` framed by two 80-character '=' rules, preceded by a blank line."""
    rule = "=" * 80
    # A single print with sep="\n" emits the same three lines as three prints.
    print("\n" + rule, title, rule, sep="\n")


In [6]:
# Load the raw dataset into `df`; the feature-engineering cells below start from this frame.
# NOTE(review): the path has no file extension — confirm it matches the file name on disk.
data_path = "../data/raw/RT_IOT2022"
df = pd.read_csv(data_path)

In [None]:
# =========================
# Create Binary Label (DOS vs Not)
# =========================

def add_binary_target(df, target_col=TARGET_COL, positive_class=POSITIVE_CLASS, new_col="is_dos_syn_hping"):
    """
    Return a copy of `df` with an extra 0/1 integer column `new_col`.

    A row gets 1 when its `target_col` value equals `positive_class`
    and 0 for any other value. The input frame is left unmodified.
    """
    labelled = df.copy()
    is_positive = labelled[target_col].eq(positive_class)
    labelled[new_col] = is_positive.astype(int)
    return labelled

# Start the working frame `df_fe` from the raw data plus the binary label,
# then sanity-check the class balance (absolute counts, then percentages).
df_fe = add_binary_target(df, new_col="is_dos_syn_hping")
print_section("Binary Target Check")
print(df_fe["is_dos_syn_hping"].value_counts(dropna=False))
# Scale to percent first and round last: rounding before multiplying by 100
# can leave binary floating-point noise in the printed shares.
print(df_fe["is_dos_syn_hping"].value_counts(normalize=True).mul(100).round(2))



Binary Target Check
is_dos_syn_hping
1    94659
0    28458
Name: count, dtype: int64
is_dos_syn_hping
1    76.89
0    23.11
Name: proportion, dtype: float64


In [None]:
# =========================
#  Drop Non-Signal Columns + Constant Feature
# =========================

def drop_non_signal_columns(df, cols_to_drop=None):
    """
    Return a copy of `df` without the requested columns.

    Names in `cols_to_drop` that are not columns of `df` are ignored, and a
    missing or empty `cols_to_drop` yields an unchanged copy.
    """
    trimmed = df.copy()
    if not cols_to_drop:
        return trimmed
    present = [name for name in cols_to_drop if name in trimmed.columns]
    return trimmed.drop(columns=present)

# Recommended drops based on your EDA
cols_to_drop = ["Unnamed: 0", "bwd_URG_flag_count"]  # index + constant
# Record which requested columns are present in the working frame *before*
# dropping, so the report describes this step instead of the raw `df`.
dropped_non_signal = [c for c in cols_to_drop if c in df_fe.columns]
df_fe = drop_non_signal_columns(df_fe, cols_to_drop=cols_to_drop)

print_section("Columns Dropped")
print("Dropped:", dropped_non_signal)
print("New shape:", df_fe.shape)



Columns Dropped
Dropped: ['Unnamed: 0', 'bwd_URG_flag_count']
New shape: (123117, 84)


In [None]:
# =========================
# Guardrail - Validate Numeric Integrity (No negatives)
# =========================

def validate_numeric_integrity(df, exclude_cols=None):
    """
    Report which numeric columns of `df` contain at least one negative value.

    Columns named in `exclude_cols` are skipped. The result is a list of
    column names in the frame's column order; it is empty when no numeric
    column holds a negative.
    """
    skipped = set(exclude_cols or [])
    numeric_names = df.select_dtypes(include=["number"]).columns
    return [
        name
        for name in numeric_names
        if name not in skipped and (df[name] < 0).any()
    ]

# Report-only guardrail: offending columns are printed, but nothing here stops
# the notebook when negatives are found.
# TARGET_COL is skipped by name; the 0/1 label column is still scanned.
neg_cols = validate_numeric_integrity(df_fe, exclude_cols=[TARGET_COL])
print_section("Numeric Integrity Check")
print("Columns with negatives:", neg_cols if neg_cols else "None found")



Numeric Integrity Check
Columns with negatives: None found


In [None]:
# =========================
# Port Shortcut Control
# =========================

def drop_ports(df, port_cols=("id.orig_p", "id.resp_p")):
    """
    Return (copy of `df` without the port columns, list of columns removed).

    Names in `port_cols` that are not columns of `df` are ignored.
    """
    present = [name for name in port_cols if name in df.columns]
    without_ports = df.copy().drop(columns=present)
    return without_ports, present

def bucket_ports(df, port_cols=("id.orig_p", "id.resp_p")):
    """
    Replace raw port columns with coarse range buckets.

    For every name in `port_cols` that is a column of `df`, a categorical
    `<name>_bucket` column is added holding one of:
      - well_known: 0-1023
      - registered: 1024-49151
      - ephemeral: 49152-65535
    Missing port values are filled with 0 first, so they land in well_known.
    The original port columns are removed afterwards.

    Returns (new_df, created_bucket_columns, dropped_port_columns).
    """
    bin_edges = [-1, 1023, 49151, 65535]
    bin_labels = ["well_known", "registered", "ephemeral"]

    bucketed = df.copy()
    bucket_names = []
    for name in port_cols:
        if name not in bucketed.columns:
            continue
        ports = bucketed[name].fillna(0).astype(int)
        bucket_col = f"{name}_bucket"
        bucketed[bucket_col] = pd.cut(ports, bins=bin_edges, labels=bin_labels).astype("category")
        bucket_names.append(bucket_col)

    removed = [name for name in port_cols if name in bucketed.columns]
    return bucketed.drop(columns=removed), bucket_names, removed

# ---- Option A: DROP PORTS (Recommended for credibility/generalization)
# Removes both raw port columns from the working frame; `dropped_ports` lists
# only the names that were actually present when this cell ran.
df_fe, dropped_ports = drop_ports(df_fe, port_cols=("id.orig_p", "id.resp_p"))
print_section("Port Handling - Option A (Drop Ports)")
print("Dropped ports:", dropped_ports)
print("New shape:", df_fe.shape)

# ---- Option B: BUCKET PORTS (Comment Option A and use Option B instead)
# df_fe, created_buckets, dropped_ports = bucket_ports(df_fe, port_cols=("id.orig_p", "id.resp_p"))
# print_section("Port Handling - Option B (Bucket Ports)")
# print("Created:", created_buckets)
# print("Dropped:", dropped_ports)
# print("New shape:", df_fe.shape)



Port Handling - Option A (Drop Ports)
Dropped ports: ['id.orig_p', 'id.resp_p']
New shape: (123117, 82)


In [None]:
# =========================
# Encode Categorical Features (proto, service)
# =========================

def encode_categoricals(df, categorical_cols=("proto", "service")):
    """
    Return a copy of `df` in which the listed columns use the pandas `category` dtype.

    Only names that are actually columns of `df` are converted; everything
    else is left as-is. This is a dtype change only — no one-hot expansion
    happens here.
    """
    present = [name for name in categorical_cols if name in df.columns]
    return df.copy().astype({name: "category" for name in present})

df_fe = encode_categoricals(df_fe, categorical_cols=("proto", "service"))
print_section("Categorical Columns Types")
# Show the dtypes only when both columns are still part of the frame.
if {"proto", "service"} <= set(df_fe.columns):
    print(df_fe[["proto", "service"]].dtypes)
else:
    print("proto/service not found")



Categorical Columns Types
proto      category
service    category
dtype: object


In [None]:
# =========================
#  Add Derived Features Focused on SYN Flood Behavior
# =========================

def add_dos_derived_features(df):
    """
    Return a copy of `df` extended with SYN-flood-oriented derived columns.

    Each feature is added only when the columns it needs are present:
      - total_pkts:        fwd_pkts_tot + bwd_pkts_tot
      - total_payload:     fwd_pkts_payload.tot + bwd_pkts_payload.tot
      - payload_per_pkt:   total_payload / (total_pkts + eps)
      - syn_to_ack_ratio:  (flow_SYN_flag_count + eps) / (flow_ACK_flag_count + eps)
      - rst_present:       1 if flow_RST_flag_count > 0, else 0
      - is_zero_duration:  1 if flow_duration <= 0, else 0
    """
    out = df.copy()
    eps = 1e-9  # guards the two divisions against a zero denominator

    def has(*names):
        # True when every named column currently exists in the working copy.
        return all(name in out.columns for name in names)

    # Packet and payload totals across both directions.
    if has("fwd_pkts_tot", "bwd_pkts_tot"):
        out["total_pkts"] = out["fwd_pkts_tot"] + out["bwd_pkts_tot"]
    if has("fwd_pkts_payload.tot", "bwd_pkts_payload.tot"):
        out["total_payload"] = out["fwd_pkts_payload.tot"] + out["bwd_pkts_payload.tot"]

    # Average payload per packet; needs both totals from above.
    if has("total_payload", "total_pkts"):
        out["payload_per_pkt"] = out["total_payload"] / (out["total_pkts"] + eps)

    # SYN vs ACK balance; eps on both sides makes the 0/0 case evaluate to 1.
    if has("flow_SYN_flag_count", "flow_ACK_flag_count"):
        syn = out["flow_SYN_flag_count"] + eps
        ack = out["flow_ACK_flag_count"] + eps
        out["syn_to_ack_ratio"] = syn / ack

    # Binary indicators.
    if has("flow_RST_flag_count"):
        out["rst_present"] = out["flow_RST_flag_count"].gt(0).astype(int)
    if has("flow_duration"):
        out["is_zero_duration"] = out["flow_duration"].le(0).astype(int)

    return out

# Extend the working frame with the derived features, then report which of the
# six expected names ended up as columns.
# Note: `added` is a presence check on the result, so it would also list a
# column of that name that existed before this cell ran.
df_fe = add_dos_derived_features(df_fe)
print_section("Derived Feature Columns Added")
added = [c for c in ["total_pkts","total_payload","payload_per_pkt","syn_to_ack_ratio","rst_present","is_zero_duration"] if c in df_fe.columns]
print("Added:", added)
print("New shape:", df_fe.shape)



Derived Feature Columns Added
Added: ['total_pkts', 'total_payload', 'payload_per_pkt', 'syn_to_ack_ratio', 'rst_present', 'is_zero_duration']
New shape: (123117, 88)


In [None]:
# =========================
# Handle Heavy Tails (Log1p + Optional Clipping)
# =========================

def log1p_transform(df, cols):
    """
    Return a copy of `df` with log(1 + x) applied to each listed column.

    Values below 0 are raised to 0 before the transform so the logarithm is
    always defined. Names in `cols` that are not columns of `df` are skipped.
    """
    transformed = df.copy()
    for name in (c for c in cols if c in transformed.columns):
        floored = transformed[name].clip(lower=0)
        transformed[name] = np.log1p(floored)
    return transformed

def clip_by_quantile(df, cols, q=0.999):
    """
    Cap each listed column at its own q-quantile and floor it at 0.

    Names in `cols` that are not columns of `df` are skipped.
    Returns (clipped_copy, thresholds), where `thresholds` maps every
    processed column to the upper bound that was used, as a plain float.
    """
    clipped = df.copy()
    thresholds = {}
    for name in cols:
        if name not in clipped.columns:
            continue
        upper = clipped[name].quantile(q)
        thresholds[name] = float(upper)
        clipped[name] = clipped[name].clip(lower=0, upper=upper)
    return clipped, thresholds

# Candidate heavy-tail columns (based on your EDA)
# NOTE(review): the derived ratios `payload_per_pkt` and `syn_to_ack_ratio` are
# not listed, so they are neither clipped nor log-scaled — confirm intentional.
heavy_cols = [
    "flow_duration",
    "flow_pkts_per_sec",
    "payload_bytes_per_second",
    "fwd_pkts_tot", "bwd_pkts_tot",
    "fwd_pkts_payload.tot", "bwd_pkts_payload.tot",
    "total_pkts", "total_payload",
    "flow_iat.tot", "flow_iat.max", "flow_iat.std",
    "idle.tot", "idle.max", "idle.avg", "idle.std"
]

# 1) Optional clipping first, for stability
# The thresholds are quantiles of the whole working frame.
# NOTE(review): if a train/test split happens downstream, these bounds were
# computed with the test rows included — confirm that is acceptable.
df_fe, clip_thresholds = clip_by_quantile(df_fe, cols=heavy_cols, q=0.999)
print_section("Clipping Thresholds (p99.9)")
display(pd.Series(clip_thresholds).sort_values(ascending=False).head(20))

# 2) Log transform after clipping
# Gotcha: this cell overwrites `df_fe`, so running it a second time would clip
# and log-transform the already-transformed values.
df_fe = log1p_transform(df_fe, cols=heavy_cols)

print_section("Log Transform Applied")
print("Transformed columns (existing):", [c for c in heavy_cols if c in df_fe.columns])



Clipping Thresholds (p99.9)


flow_iat.tot                1.899466e+08
idle.tot                    1.768227e+08
payload_bytes_per_second    1.258291e+08
idle.max                    5.991468e+07
flow_iat.max                5.991468e+07
idle.avg                    5.991367e+07
flow_iat.std                1.657125e+07
idle.std                    1.320453e+07
flow_pkts_per_sec           2.097152e+06
total_payload               6.737408e+04
bwd_pkts_payload.tot        4.156350e+04
fwd_pkts_payload.tot        1.063456e+04
flow_duration               1.899465e+02
total_pkts                  1.673040e+02
bwd_pkts_tot                8.288400e+01
fwd_pkts_tot                7.400000e+01
dtype: float64


Log Transform Applied
Transformed columns (existing): ['flow_duration', 'flow_pkts_per_sec', 'payload_bytes_per_second', 'fwd_pkts_tot', 'bwd_pkts_tot', 'fwd_pkts_payload.tot', 'bwd_pkts_payload.tot', 'total_pkts', 'total_payload', 'flow_iat.tot', 'flow_iat.max', 'flow_iat.std', 'idle.tot', 'idle.max', 'idle.avg', 'idle.std']


In [None]:
# =========================
# Correlation-Based Pruning
# =========================

def prune_redundant_features(df, drop_list):
    """
    Return (copy of `df` without the listed columns, list of columns removed).

    Names in `drop_list` that are not columns of `df` are ignored.
    """
    removable = [name for name in drop_list if name in df.columns]
    pruned = df.copy().drop(columns=removable)
    return pruned, removable

# Based on your correlation output: keep flow_pkts_per_sec, drop fwd/bwd rates
redundant_drop = ["fwd_pkts_per_sec", "bwd_pkts_per_sec"]  # keep flow_pkts_per_sec
# `dropped` holds only the names that were present in the frame, so an empty
# list here means neither column existed when this cell ran.
df_fe, dropped = prune_redundant_features(df_fe, redundant_drop)

print_section("Redundancy Pruning")
print("Dropped:", dropped)
print("New shape:", df_fe.shape)



Redundancy Pruning
Dropped: ['fwd_pkts_per_sec', 'bwd_pkts_per_sec']
New shape: (123117, 86)


In [None]:
# =========================
# Final Dataset Assembly
# =========================

def build_model_ready(df, label_col="is_dos_syn_hping", drop_cols=(TARGET_COL,)):
    """
    Return the final frame: a copy of `df` minus the columns in `drop_cols`.

    Entries of `drop_cols` that are not columns of `df` are ignored. After
    dropping, the binary label `label_col` must still be present.

    Raises:
        AssertionError: if `label_col` is not a column of the result.
    """
    removable = [name for name in drop_cols if name in df.columns]
    ready = df.copy().drop(columns=removable)
    # Ensure label exists
    assert label_col in ready.columns, f"Missing label col: {label_col}"
    return ready

# Assemble the final frame: the original multi-class label is dropped and the
# binary label is kept.
df_model_ready = build_model_ready(df_fe, label_col="is_dos_syn_hping", drop_cols=(TARGET_COL,))
print_section("Final Model-Ready Dataset")
print("Shape:", df_model_ready.shape)
print("Label distribution:")
print(df_model_ready["is_dos_syn_hping"].value_counts())

# Export
# `os` is imported in the notebook's top import cell.
# Note: CSV stores plain text, so the `category` dtype set on proto/service is
# not carried by the file and has to be re-applied after reading it back.
out_path = "../data/processed/rt_iot2022_dos_syn_model_ready.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df_model_ready.to_csv(out_path, index=False)
print("Saved:", out_path)



Final Model-Ready Dataset
Shape: (123117, 85)
Label distribution:
is_dos_syn_hping
1    94659
0    28458
Name: count, dtype: int64
Saved: ../data/processed/rt_iot2022_dos_syn_model_ready.csv
