In [2]:
"""
Project: AD4IDS - Anomaly Detection for Intrusion Detection Systems
Subproject: Challenge 2
Stage: 2- Evaluation
Authors: MONNIER Killian & BAKKARI Ikrame
Date: 01/2024
"""
import ipaddress
import pandas as pd

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

port_bins = [0, 1023, 49151, 65535]
port_labels = ["WellKnownPorts", "RegisteredPorts", "DynamicPrivatePorts"]

packets_bins = [
    0,
    100,
    500,
    float("inf"),
]
packets_labels = ["Low", "Medium", "High"]

bytes_bins = [
    0,
    10000,
    50000,
    float("inf"),
]
bytes_labels = ["Small", "Medium", "Large"]


def parse_csv_to_dataframe(file_name):
    print("Parsing", file_name, "CSV file...")
    df = pd.read_csv(file_name)
    print("Done parsing CSV file in dataframe.")
    return df


def map_ip_to_interval(ip):
    try:
        ip = ipaddress.IPv4Address(ip)
    except ValueError:
        return "UnknownNetwork"
    if ip <= ipaddress.IPv4Address("128.0.0.0"):
        return "PrivateNetwork"
    elif ip <= ipaddress.IPv4Address("192.0.0.0"):
        return "PublicNetwork"
    elif ip <= ipaddress.IPv4Address("224.0.0.0"):
        return "MulticastNetwork"
    else:
        return "UnknownNetwork"


def map_port_to_interval(port):
    port = convert_to_number(port)
    return pd.cut([port], bins=port_bins, labels=port_labels, include_lowest=True)[0]


def map_packets_to_interval(packets):
    packets = convert_to_number(packets)
    return pd.cut(
        [packets], bins=packets_bins, labels=packets_labels, include_lowest=True
    )[0]


def map_bytes_to_interval(bytes):
    bytes = convert_to_number(bytes)
    return pd.cut([bytes], bins=bytes_bins, labels=bytes_labels, include_lowest=True)[0]


def parse_flags_to_list(flags_list):
    # split the flags string into a list of flags
    flags_list = [flag.strip() for flag in flags_list]
    flags_dict = {}
    for flag in flags_list:
        if flag != ".":
            flags_dict[f"flags_{flag}"] = 1
    return flags_dict


def convert_to_number(value):
    try:
        return int(value)
    except ValueError:
        # Si contient un M, multiplier par 1 000 000
        if value[-1] == "M":
            return int(float(value[:-1]) * 1000000)
        # Si contient un K, multiplier par 1 000
        elif value[-1] == "K":
            return int(float(value[:-1]) * 1000)
        else:
            # print(f"Unknown value: {value}")
            return 0


def preprocess_df(df):
    # Encodage One-Hot pour les adresses IP, en utilisant la fonction map_ip_to_interval
    df["src_ip"] = df["Src_IP_Add"].apply(map_ip_to_interval)
    df["dst_ip"] = df["Dst_IP_Add"].apply(map_ip_to_interval)
    df["src_port"] = df["Src_Pt"].apply(map_port_to_interval)
    df["dst_port"] = df["Dst_Pt"].apply(map_port_to_interval)
    df["packets"] = df["Packets"].apply(map_packets_to_interval)
    df["bytes"] = df["Bytes"].apply(map_bytes_to_interval)
    df["protocol"] = df["Protocol"]
    
    df.drop(columns=["Src_IP_Add", "Dst_IP_Add", "Src_Pt", "Dst_Pt", "Packets", "Bytes", "Protocol"], inplace=True)
    print(df.head())
    df = pd.get_dummies(df, columns=["src_ip", "dst_ip", "src_port", "dst_port", "packets", "bytes", "protocol"])

    # Pour les flags, vous pouvez les splitter et créer des colonnes séparées
    for flag in ["A", "F", "S", "R", "P"]:
        df[f"flags_{flag}"] = df["Flags"].str.contains(flag).astype(int)
    df.drop(columns=["Flags"], inplace=True)

    # Convertir les champs numériques
    # df["duration"] = (
    #     df["Duration"]
    #     .apply(lambda x: float(x) if x.replace(".", "", 1).isdigit() else 0)
    #     .astype(int)
    # )
    df["duration"] = df["Duration"].astype(float)
    df["flows"] = df["Flows"].astype(int)
    df["tos"] = df["Tos"].astype(int)
    df.drop(columns=["Duration", "Flows", "Tos", "Timestamp"], inplace=True)

    # Encodage One-Hot pour le tag
    df["tag_normal"] = (df["Tag"] == "normal").astype(int)
    df["tag_attack"] = (df["Tag"] == "attack").astype(int)
    df["tag_victim"] = (df["Tag"] == "victim").astype(int)
    df.drop(columns=["Tag"], inplace=True)
    
    return df

In [3]:
csv_file = "data/traffic_os_TRAIN.csv"
# csv_file = "challenge2/data/traffic_os_TEST.csv"
df_file = "data/df_traffic_train.pkl"
# df_file = "challenge2/data/df_traffic_test.pkl"

df = parse_csv_to_dataframe(csv_file)
df.head()

Parsing data/traffic_os_TRAIN.csv CSV file...


  df = pd.read_csv(file_name)


Done parsing CSV file in dataframe.


Unnamed: 0,Timestamp,Duration,Protocol,Src_IP_Add,Src_Pt,Dst_IP_Add,Dst_Pt,Packets,Bytes,Flows,Flags,Tos,Tag
0,2724100000.0,0.0,TCP,192.168.100.5,445,192.168.220.16,58844.0,1,108,1,.AP...,0,normal
1,2724100000.0,0.0,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,1,.AP...,0,normal
2,2724100000.0,0.004,TCP,192.168.220.15,48888,192.168.100.5,445.0,2,174,1,.AP...,0,normal
3,2724100000.0,0.004,TCP,192.168.220.16,58844,192.168.100.5,445.0,2,174,1,.AP...,0,normal
4,2724100000.0,0.0,TCP,192.168.100.5,445,192.168.220.15,48888.0,1,108,1,.AP...,0,normal


In [4]:
# Supposons que df est votre DataFrame initial
df_subset = df.head(100000).copy()  # Sélectionner les 100 000 premières lignes

# Maintenant, appliquez le prétraitement sur df_subset
df_subset = preprocess_df(df_subset)

In [None]:
df_subset.head()

Unnamed: 0,src_ip_MulticastNetwork,src_ip_UnknownNetwork,dst_ip_MulticastNetwork,src_port_DynamicPrivatePorts,src_port_RegisteredPorts,src_port_WellKnownPorts,dst_port_DynamicPrivatePorts,dst_port_RegisteredPorts,dst_port_WellKnownPorts,packets_Low,bytes_Small,protocol_TCP,flags_A,flags_F,flags_S,flags_R,flags_P,duration,flows,tos,tag_normal,tag_attack,tag_victim
0,1,0,1,0,0,1,1,0,0,1,1,1,1,0,0,0,1,0.0,1,0,1,0,0
1,1,0,1,0,0,1,0,1,0,1,1,1,1,0,0,0,1,0.0,1,0,1,0,0
2,1,0,1,0,1,0,0,0,1,1,1,1,1,0,0,0,1,0.004,1,0,1,0,0
3,1,0,1,1,0,0,0,0,1,1,1,1,1,0,0,0,1,0.004,1,0,1,0,0
4,1,0,1,0,0,1,0,1,0,1,1,1,1,0,0,0,1,0.0,1,0,1,0,0


Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Supposons que votre DataFrame s'appelle df
X = df.drop(['tag_normal', 'tag_attack', 'tag_victim'], axis=1)
y = df[['tag_normal', 'tag_attack', 'tag_victim']]
y.head()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))


NameError: name 'df' is not defined