In [1]:
import glob
import os
import re
from math import isnan

import mapply
import numpy as np
import pandas as pd
import polars as pl
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_rows', 400)
pl.Config.set_tbl_cols(400)

from numpy import asarray
from numpy import savetxt

In [2]:
# Initialise mapply before any `DataFrame.mapply(...)` call (used in
# `load_years_dfs` to build the Flow ID row by row).
# NOTE(review): n_workers=-1 presumably means "use all available CPUs" --
# confirm against the installed mapply version.
mapply.init(n_workers=-1)

In [3]:
# Directory holding the per-day label CSVs; `load_years_dfs` reads
# f'{main_dir}/{weekday}.csv' (e.g. monday.csv) from here.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
main_dir='/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/publicCSVs/CNS22_cic17'

In [4]:
def extract_id(row):
    """Build the ``Flow ID`` join key for a single conn.log record.

    The key is ``<orig_h>-<resp_h>-<orig_p>-<resp_p>-<protocol>``.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` produced by a row-wise apply)
        Must provide ``'id.orig_h'`` and ``'id.resp_h'`` as strings, plus
        ``'id.orig_p'``, ``'id.resp_p'`` and ``'protocol'`` (any type; they are
        converted with ``str``).

    Returns
    -------
    str
        The dash-separated flow identifier.

    Raises
    ------
    TypeError
        If one of the two host fields is not a string.
    """
    def _int_text(value):
        # pandas upcasts an integer column to float as soon as it contains a
        # missing value, so 6 is rendered as '6.0'. Strip only a *trailing*
        # '.0' so the key matches the integer form, and leave every other
        # value (None, NaN, plain strings, ints) exactly as str() gives it.
        text = str(value)
        return text[:-2] if text.endswith('.0') else text

    return '-'.join([
        row['id.orig_h'],
        row['id.resp_h'],
        _int_text(row['id.orig_p']),
        _int_text(row['id.resp_p']),
        # The protocol gets the same normalisation as the ports: otherwise a
        # float-typed protocol column yields '...-6.0' and the key never
        # matches an integer-suffixed Flow ID.
        _int_text(row['protocol']),
    ])

In [5]:
# Maps the `proto` string of a conn.log record to the value that
# `load_years_dfs` stores in the `protocol` column and that `extract_id`
# appends as the last component of the Flow ID.
# `proto` values missing from this dict become NaN after `Series.map`.
# NOTE(review): only the first six entries are IP protocol numbers. The
# remaining values (port numbers, None, descriptive strings) presumably never
# produce a Flow ID that matches a label row -- confirm whether these protocols
# occur in the logs at all.
dic_proto = {
    'tcp': 6,              # Transmission Control Protocol
    'udp': 17,             # User Datagram Protocol
    'icmp': 1,             # Internet Control Message Protocol
    'igmp': 2,             # Internet Group Management Protocol
    'ipv6-icmp': 58,       # ICMP for IPv6
    'sctp': 132,           # Stream Control Transmission Protocol
    'rtp': 5004,           # Real-time Transport Protocol; NOTE(review): 5004 is its usual port, not an IP protocol number
    'rtcp': 5005,          # Real-time Transport Control Protocol; NOTE(review): 5005 is its usual port, not an IP protocol number
    'udt': None,           # UDT is an application-level protocol, not an IP protocol, so no number
    'arp': 'EtherType 0x0806',  # Address Resolution Protocol, uses EtherType 0x0806
    'lldp': 'EtherType 0x88CC', # Link Layer Discovery Protocol, uses EtherType 0x88CC
    'llc': 'Sub-layer of Data Link Layer',  # Logical Link Control is a sublayer, not a protocol number
    'man': None            # No specific IP protocol number, possibly a custom or context-specific protocol
}

In [6]:
# Capture-day prefixes: `load_years_dfs` reads '<day>_conn.log' for each entry
# and derives the label file name from the lower-cased weekday before the '-'.
# NOTE(review): 'Wednesday-workingHours' uses a lower-case 'w', presumably to
# match the file name on disk -- do not "fix" the casing without checking.
days = ['Friday-WorkingHours',  'Monday-WorkingHours',  'Thursday-WorkingHours',  'Tuesday-WorkingHours',  'Wednesday-workingHours']
# Timeout settings to process; each value selects the 'timeout{value}' input/output
# directory in the processing loop. NOTE(review): the unit is not stated here -- confirm.
timeouts = [0.5, 1, 2, 3, 4, 5, 6, 10, 30, 60]

In [15]:
def load_years_dfs(path, label_dir=None, day_names=None):
    """Load the per-day conn.log files under ``path`` and attach their labels.

    For every capture day the tab-separated log ``<day>_conn.log`` is read, a
    ``Flow ID`` key is built for each record with ``extract_id`` and the log is
    inner-joined on that key with the day's label CSV
    (``<label_dir>/<weekday>.csv``, weekday in lower case).

    Parameters
    ----------
    path : str
        Directory containing the ``<day>_conn.log`` files. A trailing path
        separator is optional.
    label_dir : str, optional
        Directory containing the per-day label CSVs. Defaults to the
        module-level ``main_dir``.
    day_names : iterable of str, optional
        Capture-day file prefixes to load. Defaults to the module-level
        ``days``.

    Returns
    -------
    pandas.DataFrame
        The labelled records of all days stacked in the order of
        ``day_names``: the conn.log columns plus ``protocol``, ``Flow ID``,
        ``Label`` and ``Attempted Category``. The per-day row index is kept
        (not reset), so index labels repeat across days.

    Notes
    -----
    Requires ``mapply.init(...)`` to have been called, because the Flow ID is
    computed with ``DataFrame.mapply``.
    """
    label_dir = main_dir if label_dir is None else label_dir
    day_names = days if day_names is None else day_names

    # Column layout of the conn.log files (header lines start with '#' and
    # are skipped through `comment='#'`).
    conn_columns = ['ts', 'uid', 'id.orig_h', 'id.orig_p',
                    'id.resp_h', 'id.resp_p', 'proto', 'service',
                    'duration', 'orig_bytes', 'resp_bytes',
                    'conn_state', 'local_orig', 'local_resp',
                    'missed_bytes', 'history', 'orig_pkts',
                    'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes',
                    'tunnel_parents']
    label_columns = ['Flow ID', 'Label', 'Attempted Category']

    dfs = []
    for day in day_names:
        # 'Monday-WorkingHours' -> 'monday': stem of the label CSV.
        dd = day.split('-')[0].lower()
        # os.path.join works whether or not `path` ends with a separator.
        file = os.path.join(path, f'{day}_conn.log')
        dfi = pd.read_csv(file, sep="\t", comment='#', names=conn_columns)

        # Protocol name -> value used in the Flow ID; unmapped names become NaN.
        dfi['protocol'] = dfi['proto'].map(dic_proto)
        dfi['Flow ID'] = dfi.mapply(extract_id, axis=1)

        # Only three columns are needed, so do not parse the rest of the CSV.
        # Re-select with the list to keep the column order fixed.
        df_labels = pd.read_csv(os.path.join(label_dir, f'{dd}.csv'),
                                usecols=label_columns)[label_columns]
        print('labels shape before dedup', df_labels.shape)
        df_labels = df_labels.drop_duplicates()
        print('labels shape after dedup', df_labels.shape)

        # Inner join: records without a labelled flow are dropped.
        # NOTE(review): the de-duplication above is over all three label
        # columns, so a Flow ID that carries several distinct
        # (Label, Attempted Category) pairs still matches several rows and the
        # record is repeated once per pair -- confirm this is intended.
        df_labeled = pd.merge(dfi, df_labels, on='Flow ID')

        df_labeled = df_labeled.drop_duplicates()

        dfs.append(df_labeled)
    df = pd.concat(dfs)
    return df

In [16]:
# For every timeout setting: label that setting's conn.logs, keep the rows whose
# 'Attempted Category' is -1, and write one CSV per setting.
for timeout in timeouts:
    print("Processing timeout ", timeout, "...")
    out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Zeek/timeout{timeout}/'
    path = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Zeek/timeout{timeout}/conn_logs/'

    # Labelled records of all days, with the label column renamed to 'Attack'.
    df = load_years_dfs(path).rename(columns={"Label": "Attack"})
    # Normalise the benign class name; every other label is kept as-is.
    df['Attack'] = df['Attack'].where(df['Attack'] != 'BENIGN', 'Benign')
    # Keep rows with 'Attempted Category' == -1, then drop the helper columns
    # that were only needed for the join / filter.
    df = (
        df[df['Attempted Category'] == -1]
        .drop(columns=['Flow ID', 'Attempted Category', 'protocol'])
    )
    print(df['Attack'].value_counts())
    df.to_csv(f'{out_dir}/CIC-IDS-2017_zeek_{timeout}.csv', index=False, header=True)
    print("______________________________________________")
    

Processing timeout  0.5 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1700398
DoS Hulk                       199162
Portscan                       159747
DoS GoldenEye                  108901
DDoS                            94884
Infiltration - Portscan         68493
DoS Slowloris                   35198
DoS Slowhttptest                21245
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Infiltration                       41
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  1 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1588174
DoS Hulk                       196001
Portscan                       159747
DoS GoldenEye                  107304
DDoS                            94826
Infiltration - Portscan         68493
DoS Slowloris                   33182
DoS Slowhttptest                20872
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Infiltration                       16
Web Attack - SQL Injection         13
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  2 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1570251
DoS Hulk                       190837
Portscan                       159747
DoS GoldenEye                  104781
DDoS                            94783
Infiltration - Portscan         68493
DoS Slowloris                   30694
DoS Slowhttptest                18992
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  3 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1563559
DoS Hulk                       190221
Portscan                       159747
DoS GoldenEye                  104772
DDoS                            94783
Infiltration - Portscan         68493
DoS Slowloris                   30087
DoS Slowhttptest                18985
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  4 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1562947
DoS Hulk                       190208
Portscan                       159747
DoS GoldenEye                  104770
DDoS                            94783
Infiltration - Portscan         68493
DoS Slowloris                   30076
DoS Slowhttptest                18983
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  5 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1557772
DoS Hulk                       190208
Portscan                       159747
DoS GoldenEye                  104770
DDoS                            94783
Infiltration - Portscan         68493
DoS Slowloris                   30076
DoS Slowhttptest                18983
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  6 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1556712
DoS Hulk                       189908
Portscan                       159747
DoS GoldenEye                  104620
DDoS                            94783
Infiltration - Portscan         68493
DoS Slowloris                   29997
DoS Slowhttptest                18833
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        7
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  10 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1553464
DoS Hulk                       189828
Portscan                       159747
DoS GoldenEye                  104618
DDoS                            94783
Infiltration - Portscan         68492
DoS Slowloris                   29947
DoS Slowhttptest                18831
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        6
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  30 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1547140
DoS Hulk                       189705
Portscan                       159747
DoS GoldenEye                  104496
DDoS                            94783
Infiltration - Portscan         68492
DoS Slowloris                   29947
DoS Slowhttptest                18705
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        6
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________
Processing timeout  60 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Benign                        1546954
DoS Hulk                       189705
Portscan                       159747
DoS GoldenEye                  104496
DDoS                            94783
Infiltration - Portscan         68492
DoS Slowloris                   29947
DoS Slowhttptest                18705
FTP-Patator                      3972
SSH-Patator                      2960
Botnet                            757
Web Attack - Brute Force           74
Web Attack - XSS                   21
Web Attack - SQL Injection         13
Infiltration                        6
Heartbleed                          1
Name: Attack, dtype: int64
______________________________________________


Processing with the default timeout parameters

In [None]:
# Same pipeline as the per-timeout loop, applied to the conn.logs stored under
# the 'default' directory.
print("Processing with default timeout parameters ...")
out_dir = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Zeek/default/'
path = f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Zeek/default/conn_logs/'

# Labelled records of all days, with the label column renamed to 'Attack'.
df = load_years_dfs(path).rename(columns={"Label": "Attack"})
# Normalise the benign class name; every other label is kept as-is.
df['Attack'] = df['Attack'].where(df['Attack'] != 'BENIGN', 'Benign')
# Keep rows with 'Attempted Category' == -1, then drop the helper columns that
# were only needed for the join / filter.
df = (
    df[df['Attempted Category'] == -1]
    .drop(columns=['Flow ID', 'Attempted Category', 'protocol'])
)
print(df['Attack'].value_counts())
df.to_csv(f'{out_dir}/CIC-IDS-2017_zeek_default.csv', index=False, header=True)
print("______________________________________________")
