In [1]:
import pandas as pd
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)
import math
import glob
import csv
import time
from datetime import datetime
from datetime import timedelta
import numpy as np
import os
from sys import platform
import math

In [2]:
def format_csv_for_labelling(df):
    """Add the bookkeeping columns that the labelling functions rely on.

    The frame is modified in place and is also returned.

    Columns added:
      * ``id``      - sequential row number starting at 0.
      * ``Attack``  - initialised to the placeholder "NeedManualLabel".
      * ``Flow ID`` - per-row string
                      "<src_ip>-<dst_ip>-<src_port>-<dst_port>-<protocol>".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``src_ip``, ``dst_ip``, ``src_port``,
        ``dst_port`` and ``protocol``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    df['id'] = range(df.shape[0])
    df['Attack'] = "NeedManualLabel"
    # Convert the numeric columns element-wise with astype(str). Calling the builtin
    # str() on a whole Series returns the repr of the entire column, which would then
    # be appended verbatim to every row instead of that row's own value.
    df["Flow ID"] = (
        df['src_ip'] + '-' + df['dst_ip']
        + '-' + df['src_port'].astype(str)
        + '-' + df['dst_port'].astype(str)
        + '-' + df['protocol'].astype(str)
    )
    return df

def read_csvs_from_path_and_reformat(path):
    """Load one flow CSV and prepare it for labelling.

    Reads ``path`` with pandas, passes the frame through
    ``format_csv_for_labelling``, prints the resulting "Attack" value counts and
    adds an "Attempted Category" column filled with -1.

    Returns the prepared DataFrame.
    """
    flows = format_csv_for_labelling(pd.read_csv(path))

    label_counts = flows["Attack"].value_counts()
    print("labels after pre-processing:", label_counts)

    # Every flow starts at -1; label_flows overwrites this for the rows it matches.
    flows["Attempted Category"] = -1

    return flows

In [3]:
# Main labelling function. Only used for labelling Malicious and Malicious - Attempted flows.
# Timestamps are in NANOSECONDS (!) Unix time. Note that the CSV files are in the UTC timezone.
# df = dataframe with flows. Note that labelling happens inplace on the 'df' parameter, and so this function doesn't return anything
# label = the label that will be given to flows matching the criteria specified in the function
# additional_filters = add any additional constraints that cannot be covered by the already provided function arguments
# see examples in the actual labelling logic for correct syntax
# attempted_category = please consult our website (https://intrusion-detection.distrinet-research.be/CNS2022/Tools_Documentation.html)
# for details on how the "Attempted" categories are defined.
# payload_filter = When set to true, this will automatically add a constraint ["udps.src2dst_payload_bytes"] == 0. Note that
# the Attempted label and category still need to be specified manually
def label_flows(df, label, attack_start_time_nanoseconds, attack_end_time_nanoseconds, src_ip_list=None,
                dst_ip_list= None, src_port_list=None, dst_port_list=None, additional_filters=None, attempted_category=-1, payload_filter=False):
    """Label, in place, every flow of ``df`` that matches all given criteria.

    Parameters
    ----------
    df : pandas.DataFrame
        Flow table. Must contain "bidirectional_first_seen_ms", "Attack" and
        "Attempted Category", plus whichever of "src_ip", "dst_ip", "src_port",
        "dst_port" and "udps.src2dst_payload_bytes" the chosen filters use.
    label : str
        Value written to "Attack" for the matching rows.
    attack_start_time_nanoseconds, attack_end_time_nanoseconds : int
        Inclusive time window in nanoseconds. Both bounds are converted to
        milliseconds and compared against "bidirectional_first_seen_ms".
    src_ip_list, dst_ip_list, src_port_list, dst_port_list : list, optional
        When given, the corresponding column must be one of the listed values.
    additional_filters : iterable of boolean Series, optional
        Extra row masks that are AND-ed into the selection.
    attempted_category : int
        Value written to "Attempted Category" for the matching rows.
    payload_filter : bool
        When true, additionally require "udps.src2dst_payload_bytes" == 0.

    Returns
    -------
    None
        ``df`` is modified in place.
    """
    if additional_filters is None:
        additional_filters = ()

    # Exact integer conversion ns -> ms (no float rounding near millisecond boundaries).
    window_start_ms = int(attack_start_time_nanoseconds) // 1_000_000
    window_end_ms = int(attack_end_time_nanoseconds) // 1_000_000

    # Start from an all-True row mask and narrow it down with every active criterion.
    mask = pd.Series(True, index=df.index, dtype=bool)

    first_seen_ms = df["bidirectional_first_seen_ms"]
    mask &= (first_seen_ms >= window_start_ms)
    mask &= (first_seen_ms <= window_end_ms)

    if src_ip_list is not None:
        mask &= (df["src_ip"].isin(src_ip_list))
    if dst_ip_list is not None:
        mask &= (df["dst_ip"].isin(dst_ip_list))

    if src_port_list is not None:
        mask &= (df["src_port"].isin(src_port_list))
    if dst_port_list is not None:
        mask &= (df["dst_port"].isin(dst_port_list))

    if payload_filter:
        mask &= (df["udps.src2dst_payload_bytes"] == 0)

    for extra_filter in additional_filters:
        mask &= extra_filter

    # Assign through df.loc so the write always reaches `df` itself. The previous
    # df[col].mask(..., inplace=True) form is chained assignment, which pandas warns
    # about and which does not update `df` when Copy-on-Write is active.
    df.loc[mask, "Attack"] = label
    df.loc[mask, "Attempted Category"] = attempted_category

# This function is called when all labelling of malicious flows is completed. Anything that has not yet received a label
# so far is labelled as Benign.
def label_rest_as_benign(df):
    """Finish labelling a day's flows and return the rows to keep.

    Steps:
      1. Every row still carrying the "NeedManualLabel" placeholder becomes "Benign"
         (written into ``df`` in place).
      2. Rows whose "Flow ID" equals '8.0.6.4-8.6.0.1-0-0-0' are forced to "Benign".
      3. The "Attack" and "Attempted Category" value counts are printed.
      4. Only rows with "Attempted Category" == -1 are kept, and the helper columns
         "Attempted Category" and "Flow ID" are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Attack", "Attempted Category" and "Flow ID".

    Returns
    -------
    pandas.DataFrame
        A new frame with the filtered rows and without the two helper columns.
        Rows with any other "Attempted Category" value are NOT part of the result.
    """
    # Assign through df.loc: the previous df[col].mask(..., inplace=True) form is chained
    # assignment and does not update `df` when pandas Copy-on-Write is active.
    df.loc[df["Attack"] == "NeedManualLabel", "Attack"] = "Benign"

    # Artefact flows with this Flow ID are relabelled as Benign regardless of earlier labels.
    df.loc[df["Flow ID"] == '8.0.6.4-8.6.0.1-0-0-0', "Attack"] = "Benign"

    print("label count after labelling:\r\n", df["Attack"].value_counts())
    print("Attempted Category count after labelling:\r\n", df["Attempted Category"].value_counts())

    # Keep only the flows that were never assigned an attempted category, then drop helper columns.
    kept = df[df['Attempted Category'] == -1]
    return kept.drop(['Attempted Category', 'Flow ID'], axis=1)

In [4]:
# (idle, active) flow-timeout pairs. The loop below unpacks each pair and interpolates both
# values into the "new_idle_{idle}min_active_{active}min" dataset directory name.
timeouts = [(3,30)]

In [5]:
# For every (idle, active) timeout configuration: read the five per-day CSVs, label the attack
# flows day by day with label_flows, label everything else as Benign, and concatenate the days
# into `df_all`.
# Call order matters within each day: label_flows overwrites "Attack" / "Attempted Category" on
# every row its mask matches, so a later call wins over an earlier one for overlapping rows.
# label_rest_as_benign returns only rows whose "Attempted Category" is -1, so flows given an
# "- Attempted" label with a non-default category are not part of the per-day frames kept here.
for timeout in timeouts:
    idle, active = timeout 
    DATASET_PATH = f'/home/abdelkader.elmahdaou/lustre/data_sec-um6p-st-sccs-6sevvl76uja/IDS/mahdaouy/fixed_timeouts_v2/new_idle_{idle}min_active_{active}min/CIC-IDS-2017/temp/'
    # NOTE: out_dir is only referenced by the commented-out to_csv call at the end of the loop body.
    out_dir = f'/home/abdelkader.elmahdaou/lustre/data_sec-um6p-st-sccs-6sevvl76uja/IDS/mahdaouy/fixed_timeouts_v2/new_idle_{idle}min_active_{active}min/CIC-IDS-2017'
    # Monday: no label_flows calls are made, so every flow ends up labelled Benign.
    monday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "Monday-WorkingHours.csv")
    monday_df = label_rest_as_benign(monday_df)
    #--------------------+
    # TUESDAY 04-07-2017 |
    #--------------------+

    tuesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "Tuesday-WorkingHours.csv")

    # FTP-PATATOR
    # -----------

    label_flows(tuesday_df, "FTP-Patator", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[21])

    # Default payload filter
    label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[21], payload_filter=True, attempted_category=0)

    label_flows(tuesday_df, "FTP-Patator - Attempted", 1499170672838272000, 1499174416931403000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[21], additional_filters=[(tuesday_df["src_port"] == 52108)],
                attempted_category=2)

    # SSH-Patator
    # -----------

    label_flows(tuesday_df, "SSH-Patator", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[22])

    label_flows(tuesday_df, "SSH-Patator - Attempted", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[22], payload_filter=True, attempted_category=0)

    label_flows(tuesday_df, "SSH-Patator - Attempted", 1499188141049616000, 1499195059018486000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[22], additional_filters=
                [
                    (tuesday_df["udps.src2dst_payload_bytes"] <=32) & (tuesday_df["udps.dst2src_payload_bytes"] == 0)
                ], attempted_category=3)

    tuesday_df = label_rest_as_benign(tuesday_df)
    #----------------------+
    # WEDNESDAY 05-07-2017 |
    #----------------------+

    wednesday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "Wednesday-WorkingHours.csv")

    # DoS Slowloris
    # -------------

    label_flows(wednesday_df, "DoS Slowloris", 1499258926211817000, 1499260236498684000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], additional_filters=[
            ~(wednesday_df["src_port"].isin([33358, 33360, 33362]))
        ])

    # port 33358, 33360 and 33362 contain attack teardown flows
    label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258926211817000, 1499260236498684000, ["172.16.0.1"],
                ["192.168.10.50"], src_port_list=[33358, 33360, 33362], dst_port_list=[80], attempted_category=2)

    #Payload filter (order is important, this part needs to come before Attempted category 6)
    label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258926211817000, 1499260236498684000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[
            ~(wednesday_df["src_port"].isin([33358, 33360, 33362]))
        ])

    #Target unresponsive because of DoS, no payloads in these flows
    # Note: source and destination are swapped in this call (server 192.168.10.50:80 -> 172.16.0.1).
    # NOTE(review): the threshold .199800 is compared against a column whose name ends in "_ms";
    # confirm that the threshold is expressed in the same unit as the column.
    label_flows(wednesday_df, "DoS Slowloris - Attempted", 1499258926211817000, 1499260236498684000,
                ["192.168.10.50"], ["172.16.0.1"], src_port_list=[80], attempted_category=6, additional_filters=[
            ~(wednesday_df["dst_port"].isin([33358, 33360, 33362])) & (wednesday_df["udps.dst2src_payload_bytes"] == 0)
            & (wednesday_df["bidirectional_duration_ms"] >= .199800)
        ])

    # DoS Slowhttptest
    # ----------------

    label_flows(wednesday_df, "DoS Slowhttptest", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], additional_filters=[
            ~(wednesday_df["src_port"].isin([33372]))
        ]
        )


    # Attack startup artefact
    label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
                ["192.168.10.50"], src_port_list=[33372], dst_port_list=[80], attempted_category=2)

    # Payload filter
    label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=[
            ~(wednesday_df["src_port"].isin([33372]))])

    # Retransmissions because target web server is brought down
    # NOTE(review): the threshold .199984 is compared against a column whose name ends in "_ms";
    # confirm that the threshold is expressed in the same unit as the column.
    label_flows(wednesday_df, "DoS Slowhttptest - Attempted", 1499260537936810000, 1499261869331517000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=6, additional_filters=[
            ~(wednesday_df["src_port"].isin([33372])) & (wednesday_df["udps.src2dst_payload_bytes"] == 0) &
            (wednesday_df["bidirectional_duration_ms"] >= .199984) & (wednesday_df["dst2src_packets"] == 0)
        ]
        )


    # DoS Hulk
    # --------

    label_flows(wednesday_df, "DoS Hulk", 1499262203194704000, 1499263641326171000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], additional_filters=[
            ~(wednesday_df["src_port"].isin([48678 , 43664]))
        ])

    # Payload filter
    label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], payload_filter=True, attempted_category=0, additional_filters=[
            ~(wednesday_df["src_port"].isin([48678 , 43664]))])

    #Attack artefact - likely authors checking webserver mid-attack.
    # Note: this call uses an earlier end timestamp (1499262299999999999) than the other DoS Hulk calls.
    label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499262299999999999, ["172.16.0.1"],
                ["192.168.10.50"], src_port_list=[48678 , 43664], dst_port_list=[80], attempted_category=2)

    # Artefacts caused by either attack tool or non-empty TCP appendices. Reasoning is that 282 is minimum size of malicious payload
    label_flows(wednesday_df, "DoS Hulk - Attempted", 1499262203194704000, 1499263641326171000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=3, additional_filters=[
            ~(wednesday_df["src_port"].isin([48678 , 43664])) & (wednesday_df["udps.src2dst_payload_bytes"] > 0)
            & (wednesday_df["udps.src2dst_payload_bytes"] < 282)
        ])

    # DoS GoldenEye
    # -------------

    label_flows(wednesday_df, "DoS GoldenEye", 1499263803231753000, 1499264408915718000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80])

    label_flows(wednesday_df, "DoS GoldenEye - Attempted", 1499263803231753000, 1499264408915718000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True)

    # Heartbleed
    # ----------

    label_flows(wednesday_df, "Heartbleed", 1499278335650811000, 1499279563294455000, ["172.16.0.1"],
                ["192.168.10.51"], dst_port_list=[444], additional_filters=[
            (wednesday_df["src_port"] ==  45022)
        ])

    label_flows(wednesday_df, "Heartbleed - Attempted", 1499278335650811000, 1499279563294455000, ["172.16.0.1"],
                ["192.168.10.51"], dst_port_list=[444], attempted_category=0, payload_filter=True, additional_filters=[
            (wednesday_df["src_port"] == 45022)])

    wednesday_df = label_rest_as_benign(wednesday_df)
    #---------------------+
    # THURSDAY 06-07-2017 |
    #---------------------+

    thursday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "Thursday-WorkingHours.csv")

    # Web Attack - Brute Force
    # ------------------------

    # Flows in the window preceding the brute-force window are labelled as Attempted (category 2).
    label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343354880049000, 1499343531179279000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=2)

    label_flows(thursday_df, "Web Attack - Brute Force", 1499343567660566000, 1499346011622209000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], additional_filters=
                [
                    (thursday_df["src2dst_packets"] > 20) | (thursday_df["src_port"] == 44464)
                ])

    label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], payload_filter=True, attempted_category=0,
                additional_filters=
                [~((thursday_df["src2dst_packets"] > 20) | (thursday_df["src_port"] == 44464))])

    label_flows(thursday_df, "Web Attack - Brute Force - Attempted", 1499343567660566000, 1499346011622209000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=4,
                additional_filters=
                [
                    (thursday_df["udps.src2dst_payload_bytes"] > 0) & ~(thursday_df["src_port"] == 44464) &
                    (thursday_df["src2dst_packets"] == 5) & (thursday_df["dst2src_packets"] == 5)
                ])

    # Web Attack - XSS
    # ----------------

    label_flows(thursday_df, "Web Attack - XSS", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], additional_filters=
                [
                    ~(thursday_df["src_port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                    (thursday_df["src2dst_packets"] >= 150)
                ])

    label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=0, payload_filter=True, additional_filters=
                [~(thursday_df["src_port"].isin([36180, 36182, 36184, 36186, 36188, 36190]))])

    label_flows(thursday_df, "Web Attack - XSS - Attempted", 1499346935283859000, 1499348121341704000, ["172.16.0.1"],
                ["192.168.10.50"], dst_port_list=[80], attempted_category=2, additional_filters=
                [
                    ~(thursday_df["src_port"].isin([36180, 36182, 36184, 36186, 36188, 36190])) &
                    (thursday_df["udps.src2dst_payload_bytes"] > 0) & (thursday_df["src2dst_packets"] < 150)
                ])

    # Web Attack - SQL Injection
    # --------------------------

    label_flows(thursday_df, "Web Attack - SQL Injection - Attempted", 1499348127852814000, 1499348145720612000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=2,
                additional_filters=[
                    thursday_df["src_port"].isin([36180, 36182, 36184, 36186, 36188])
                ])

    label_flows(thursday_df, "Web Attack - SQL Injection", 1499348145732950000, 1499348575320284000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80],
                additional_filters=[
                    ~(thursday_df["src_port"].isin([36180, 36182, 36184, 36186, 36188]))
                ])

    # NOTE(review): this payload-filter call reuses the time window of the first "Attempted" call
    # above (1499348127852814000 - 1499348145720612000), not the window of the
    # "Web Attack - SQL Injection" call; confirm that this is intended.
    label_flows(thursday_df, "Web Attack - SQL Injection - Attempted", 1499348127852814000, 1499348145720612000,
                ["172.16.0.1"], ["192.168.10.50"], dst_port_list=[80], attempted_category=0,
                payload_filter=True)


    # Infiltration
    # 5.1 Dropbox Download
    # ------------


    label_flows(thursday_df, "Infiltration", 1499361542547210000, 1499366769364731000, ["192.168.10.8"], ["205.174.165.73"])

    label_flows(thursday_df, "Infiltration - Attempted", 1499361542547210000, 1499366769364731000, ["192.168.10.8"],
                ["205.174.165.73"], attempted_category=0, payload_filter=True)

    label_flows(thursday_df, "Infiltration - Attempted", 1499361228830533000, 1499361301251276000 , ["192.168.10.9"],
                ["205.174.165.73"], attempted_category=2)

    # 5.2 Cooldisk Mac

    label_flows(thursday_df, "Infiltration", 1499363616453990000, 1499371339347892000, ["192.168.10.25"], ["205.174.165.73"])

    label_flows(thursday_df, "Infiltration - Attempted", 1499363616453990000, 1499371339347892000, ["192.168.10.25"],
                ["205.174.165.73"], attempted_category=0, payload_filter=True)

    # 5.3 NMAP + Portscan

    # Round 1

    label_flows(thursday_df, "Infiltration - Portscan", 1499360431706755000, 1499360445728887000, ["172.16.0.1"],
                ["192.168.10.51"], additional_filters=[
            (thursday_df["src_port"] == 50122) | (thursday_df["src_port"] == 50133)
        ])

    # Round 2

    label_flows(thursday_df, "Infiltration - Portscan", 1499362410884008000, 1499362444285175000, ["192.168.10.8"],
                ["192.168.10.5"])

    # Round 3

    # NOTE(review): the first clause of the filter below already excludes every flow with
    # dst_ip == "192.168.10.50", so the second clause (payload size in [176, 20514] AND that same
    # dst_ip) can never change the result, and "192.168.10.50" in the destination list is never
    # labelled by this call. Confirm whether the first clause is intended.
    label_flows(thursday_df, "Infiltration - Portscan", 1499364314425162000, 1499366764331875000, ["192.168.10.8"],
                ["192.168.10.5", "192.168.10.9", "192.168.10.12", "192.168.10.14", "192.168.10.15", "192.168.10.16",
                "192.168.10.17", "192.168.10.19", "192.168.10.25", "192.168.10.50", "192.168.10.51"], additional_filters= [
            ~((thursday_df["dst_ip"] == "192.168.10.50")) &
            ~((thursday_df["udps.src2dst_payload_bytes"].isin([176, 20514])) & (thursday_df["dst_ip"] == "192.168.10.50"))
        ]
    )

    thursday_df = label_rest_as_benign(thursday_df)
    #-------------------+
    # FRIDAY 07-07-2017 |
    #-------------------+

    friday_df = read_csvs_from_path_and_reformat(DATASET_PATH + "Friday-WorkingHours.csv")

    # Portscan
    # --------

    #First round
    label_flows(friday_df, "Portscan", 1499446532117090000, 1499447948582083000, ["172.16.0.1"], ["192.168.10.50"])


    #Second round
    label_flows(friday_df, "Portscan", 1499449905450532000, 1499451841699238000, ["172.16.0.1"], ["192.168.10.50"])

    # Botnet
    # ------

    label_flows(friday_df, "Botnet", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
                "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"])

    label_flows(friday_df, "Botnet - Attempted", 1499432653990571000, 1499436122903736000, ["192.168.10.15", "192.168.10.9",
                "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=0,
                payload_filter=True)

    # Flows between the same hosts in a later time window are labelled as Attempted (category 1).
    label_flows(friday_df, "Botnet - Attempted", 1499436180000000000, 1499457684606663000, ["192.168.10.15", "192.168.10.9",
                "192.168.10.14", "192.168.10.5", "192.168.10.8"], ["205.174.165.73"], attempted_category=1)


    # DDoS
    # ----

    label_flows(friday_df, "DDoS", 1499453791796937000, 1499454972216560000, ["172.16.0.1"], ["192.168.10.50"])

    label_flows(friday_df, "DDoS - Attempted", 1499453791796937000, 1499454972216560000, ["172.16.0.1"], ["192.168.10.50"],
                attempted_category=0, payload_filter=True)

    friday_df = label_rest_as_benign(friday_df)
    
    # Combine the five labelled days. `df_all` is reassigned on every iteration, so after the loop
    # it holds only the result for the last entry of `timeouts`.
    df_all = pd.concat([monday_df, tuesday_df, wednesday_df, thursday_df, friday_df])
    print('===================================================')
    print(df_all.Attack.value_counts())
    # Writing the combined CSV is currently disabled:
    #df_all = df_all.drop('Flow ID', axis=1)
    #df_all.to_csv(out_dir+"/CIC-IDS-2017.csv", index=False, header=True)

labels after pre-processing: Attack
NeedManualLabel    349571
Name: count, dtype: int64
label count after labelling:
 Attack
Benign    349571
Name: count, dtype: int64
Attempted Category count after labelling:
 Attempted Category
-1    349571
Name: count, dtype: int64
labels after pre-processing: Attack
NeedManualLabel    302811
Name: count, dtype: int64
label count after labelling:
 Attack
Benign                     295858
FTP-Patator                  3972
SSH-Patator                  2961
SSH-Patator - Attempted        19
FTP-Patator - Attempted         1
Name: count, dtype: int64
Attempted Category count after labelling:
 Attempted Category
-1    302791
 3        19
 2         1
Name: count, dtype: int64
labels after pre-processing: Attack
NeedManualLabel    331817
Name: count, dtype: int64
label count after labelling:
 Attack
Benign                          302317
DoS Hulk                         13903
DoS GoldenEye                     7470
DoS Slowhttptest - Attempted      2809
Do

In [6]:
# Raise the column display limit (set to 200 in the import cell) to 300.
pd.set_option('display.max_columns', 300)

In [7]:
# Label distribution of the combined frame; `df_all` is the variable assigned inside the timeout loop.
df_all['Attack'].value_counts()

Attack
Benign                        1496130
Portscan                       159196
DDoS                            62768
Infiltration - Portscan         60462
DoS Hulk                        13903
DoS GoldenEye                    7470
FTP-Patator                      3972
SSH-Patator                      2961
DoS Slowloris                    2243
DoS Slowhttptest                 1407
Botnet                            736
Web Attack - Brute Force           73
Web Attack - XSS                   18
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: count, dtype: int64