In [1]:
import pandas as pd
import numpy as np
import polars as pl
import glob
import re
import mapply
from math import isnan
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_rows', 400)
pl.Config.set_tbl_cols(400)

from numpy import asarray
from numpy import savetxt

In [2]:
mapply.init(n_workers=-1)

In [3]:
main_dir='/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/publicCSVs/CNS22_cic17'

In [4]:
def extract_id(row):
    return row['id.orig_h']+'-'+row['id.resp_h']+'-'+str(row['id.orig_p']).replace('.0', '')+'-'+str(row['id.resp_p']).replace('.0', '')+'-'+str(row['protocol'])

In [5]:
dic_proto = {
    'tcp': 6,              # Transmission Control Protocol
    'udp': 17,             # User Datagram Protocol
    'icmp': 1,             # Internet Control Message Protocol
    'igmp': 2,             # Internet Group Management Protocol
    'ipv6-icmp': 58,       # ICMP for IPv6
    'sctp': 132,           # Stream Control Transmission Protocol
    'rtp': 5004,           # Real-time Transport Protocol (Typically uses port 5004/5005)
    'rtcp': 5005,          # Real-time Transport Control Protocol (Typically uses port 5005)
    'udt': None,           # UDT is an application-level protocol, not an IP protocol, so no number
    'arp': 'EtherType 0x0806',  # Address Resolution Protocol, uses EtherType 0x0806
    'lldp': 'EtherType 0x88CC', # Link Layer Discovery Protocol, uses EtherType 0x88CC
    'llc': 'Sub-layer of Data Link Layer',  # Logical Link Control is a sublayer, not a protocol number
    'man': None            # No specific IP protocol number, possibly a custom or context-specific protocol
}

In [6]:
days = ['Friday-WorkingHours',  'Monday-WorkingHours',  'Thursday-WorkingHours',  'Tuesday-WorkingHours',  'Wednesday-workingHours']    
timeouts = [0.5, 1, 2, 3, 4, 5, 6, 10, 30, 60]

# Process data extracted by Argus

The following cells are to update the dict of port mapping

In [42]:
def read_dataset_argus(out_dir):
    files = glob.glob(out_dir+"/features/*.csv")
    dfs = []
    for file in files:
        name = file.split('/')[-1].split('.')[0]
        name = name.split('_features')[0]
        dfi = pd.read_csv(file)
        dfi['name'] = name
        dfs.append(dfi)
    df = pd.concat(dfs)
    # df['Sport'] = pd.to_numeric(df['Sport'], errors='coerce', downcast='float')
    # df['Dport'] = pd.to_numeric(df['Dport'], errors='coerce', downcast='float')
    return df

In [43]:
out_dir=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Argus/timeout1/'
df = read_dataset_argus(out_dir)

In [44]:
from numpy import asarray
from numpy import savetxt

list_ports = []
def get_port_names(value):
    try:
        # Try to convert to an integer with base 10
        int_value = int(value)
        return int_value
    except ValueError:
        # If ValueError occurs, check if it's a valid hexadecimal
        if str(value).startswith("0x"):
            try:
                int_value = int(value, 16)  # Convert using base 16 for hex
                return int_value
            except ValueError:
                list_ports.append(value)
                return value
        else:
            list_ports.append(value)
            return value

In [45]:
df['Sport'].apply(get_port_names)
unique_Sports = list(set(list_ports))
Sports_data = [x for x in unique_Sports if x == x] # This is to remove nan rows
savetxt('Sport_cic17_strings.txt', Sports_data , delimiter=',', fmt='%s')

In [46]:
list_ports = []
df['Dport'].apply(get_port_names)

unique_Dports = list(set(list_ports))
Dports_data = [x for x in unique_Dports if x == x] # This is to remove nan rows
savetxt('Dport_cic17_strings.txt', Dports_data , delimiter=',', fmt='%s')

In [55]:
combines_Sports_Dports = Dports_data + list(set(Sports_data) - set(Dports_data))
lports = list(port_mapping.keys())

diff = list(set(combines_Sports_Dports) - set(lports))
diff

[]

In [7]:
port_mapping = {
    # Ports from AutomaticallyGeneratedAttacks
    'domain': 53,
    'bgpd': 179,
    'tcpmux': 1,
    'ldaps': 636,
    'kshell': 544,
    'auth': 113,
    'microsoft-ds': 445,
    'whois': 43,
    'ntp': 123,
    'ssh': 22,
    'enbd-cstatd': 2020,
    'x11-4': 6004,
    'spamd': 783,
    'xmpp-client': 5222,
    'ospfapi': 10101,
    'imaps': 993,
    'rmiregistry': 1099,
    'chargen': 19,
    'exec': 512,
    'bootps': 67,
    'pop3': 110,
    'time': 37,
    'cmip-man': 101,
    'imap2': 143,
    'sip-tls': 5061,
    'https': 443,
    'proofd': 4456,
    'xmpp-server': 5269,
    'http': 80,
    'telnets': 23,
    'bacula-fd': 9102,
    'x11-2': 6002,
    'amandaidx': 10080,
    'netbios-dgm': 138,
    'klogin': 543,
    'moira-update': 8766,
    'ripd': 520,
    'afs3-callback': 7001,
    'radmin-port': 4899,
    'nfs': 2049,
    'sane-port': 6566,
    'webmin': 10000,
    'kerberos': 88,
    'ingreslock': 1524,
    'rmtcfg': 4777,
    'font-service': 7100,
    'mdns': 5353,
    'afs3-kaserver': 7000,
    'poppassd': 106,
    'kpasswd': 464,
    'bacula-dir': 9101,
    'sunrpc': 111,
    'afs3-bos': 7002,
    'bootpc': 68,
    'shell': 514,
    'sip': 5060,
    'socks': 1080,
    'ircd': 6667,
    'radius': 1812,
    'svrloc': 427,
    'zope-ftp': 8021,
    'bacula-sd': 9103,
    'postgresql': 5432,
    'hostmon': 8080,
    'ms-sql-s': 1433,
    'isisd': 4786,
    'nrpe': 5666,
    'x11-7': 6007,
    'cisco-sccp': 2000,
    'zebra': 2601,
    'x11-5': 6005,
    'ms-wbt-server': 3389,
    'afpovertcp': 548,
    'x11-1': 6001,
    'ms-sql-m': 1434,
    'smux': 199,
    'login': 513,
    'gris': 254,
    'tacacs': 49,
    'daap': 3689,
    'cvspserver': 2401,
    'submissions': 587,
    'afs3-fileserver': 7003,
    'telnet': 23,
    'kerberos-adm': 749,
    'bgp': 179,
    'gsigatekeeper': 31112,
    'epmap': 135,
    'rootd': 1097,
    'nntps': 563,
    'ftps': 990,
    'x11': 6000,
    'ospfd': 520,
    'ftp': 21,
    'smtp': 25,
    'echo': 7,
    'ldap': 389,
    'netbios-ns': 137,
    'discard': 9,
    'daytime': 13,
    'svn': 3690,
    'nntp': 119,
    'gopher': 70,
    'printer': 515,
    'ipp': 631,
    'afs3-prserver': 7004,
    'mysql': 3306,
    'tproxy': 12345,
    'finger': 79,
    'git': 9418,
    'gsiftp': 2811,
    'iprop': 4444,
    'snmp': 161,
    'rsync': 873,
    'submission': 587,
    'nut': 3493,
    'gnutella-svc': 6346,
    'iscsi-target': 3260,
    'netbios-ssn': 139,
    'pop3s': 995,
    'cfinger': 10010,
    'snpp': 444,
    'x11-6': 6006,
    'dhcpv6-server': 547,
    'rtsp': 554,
    'http-alt': 8080,
    'lotusnote': 1352,
    'ftp-data': 20,
    'qotd': 17,
    'omniorb': 2809,
    'x11-3': 6003,
    
    'dircproxy': 12345,  # Example port number for dircproxy; adjust if needed
    'dhcpv6-client': 546,  # Standard port number for DHCPv6 client
    
    # This is the updated ports from Baselines
    'zabbix-agent': 10050,      # Zabbix agent port
    'zope': 9673,              # Zope application server port
    'xinetd': 513,             # xinetd (internet services daemon) port
    'binkp': 2201,             # BINKP (BINKP mail transfer protocol) port
    'sgi-cad': 5000,           # SGI CAD (Computer Aided Design) port
    'xmms2': 5010,            # XMMS2 media player port
    'asp': 80,                 # ASP (Active Server Pages) port (same as HTTP)
    'nbd': 10809,              # NBD (Network Block Device) port
    'dcap': 22128,             # DCAP (Distributed Component Object Model) port
    'amidxtape': 3233,         # AMIDXTAPE (Amiga MIDI tape) port
    'wnn6': 2040,              # WNN6 (Japanese input method) port
    'db-lsp': 9999,            # DB-LSP (Database Language Service Protocol) port
    'zabbix-trapper': 10051,   # Zabbix trapper port
    'gnutella-rtr': 6346,      # Gnutella router port
    'tfido': 6010,             # TFIDO (T-FIDO mail transfer protocol) port
    'hkp': 11371,              # HKP (HTTP Keyserver Protocol) port
    'kamanda': 9101,           # Kamanda (backup software) port
    'gsidcap': 6500,           # GSIDCAP (Grid Security Infrastructure) port
    'afs3-vlserver': 7005,     # AFS3 VLserver (Andrew File System version 3) port
    'xpilot': 1790,            # XPilot (networked multiplayer game) port
    'smsqp': 9010,             # SMSQP (SMS Queue Protocol) port
    'amanda': 10080,           # Amanda (backup software) port
    'clc-build-daemon': 10010, # CLC Build Daemon port
    'fido': 3600,               # Fido (FidoNet mail transfer) port
    
    
    #  the updated ports from HumanGeneratedAttackData
    'csync2': 3480,            # CSYNC2 (file synchronization) port
    'epmd': 4369,              # EPMD (Erlang Port Mapper Daemon) port
    'qmqp': 209,               # QMQP (Quick Mail Queue Protocol) port
    'skkserv': 117,            # SKK (Japanese Input Method) server port
    'gpsd': 2947,              # GPSD (GPS Daemon) port
    'afs3-errors': 7006,       # AFS3 Errors (Andrew File System version 3) port
    'silc': 706,               # SILC (Secure Internet Live Conferencing) port
    'radius-acct': 1813,       # RADIUS Accounting port
    'cfengine': 5308,          # CFEngine (configuration management) port
    'noclog': 6789,            # Noclog (network monitoring) port
    'systat': 11,              # SYSTAT (system status) port
    'support': 8081,           # SUPPORT (customer support) port
    'acr-nema': 104,           # ACR-NEMA (DICOM) port
    'iax': 4569,              # IAX (Inter-Asterisk eXchange) port
    'iso-tsap': 2000,          # ISO-TSAP (ISO Transport Service Access Point) port
    'kerberos4': 750,          # Kerberos 4 (authentication) port
    'codasrv-se': 3200,        # CODASRV-SE (CODA file system) port
    'datametrics': 15500,      # DataMetrics (data metrics) port
    'mon': 6250,               # MON (monitoring) port
    'snmp-trap': 162,          # SNMP Trap port
    'gnunet': 2086,            # GNUnet (decentralized network) port
    'dict': 2628,              # DICT (Dictionary server) port
    'mtn': 3790,               # MTN (Monotone) port
    'hylafax': 4559,           # Hylafax (fax server) port
    'amqp': 5672,             # AMQP (Advanced Message Queuing Protocol) port
    'zebrasrv': 2601,          # Zebra (routing) server port
    'afs3-update': 7007,       # AFS3 Update (Andrew File System version 3) port
    'qmtp': 209,               # QMTP (Quick Mail Transfer Protocol) port
    'puppet': 8140,            # Puppet (configuration management) port
    'venus-se': 3200,          # Venus SE (Coda file system) port
    'zserv': 3100,             # Zserv (Zebra server) port
    'amqps': 5671,             # AMQPS (AMQP Secure) port
    'uucp': 540,               # UUCP (Unix-to-Unix Copy Protocol) port
    'icpv2': 65000,            # ICPv2 (InterCluster Protocol version 2) port
    'domain-s': 53,            # DOMAIN-S (alternative port for DNS) port
    'venus': 2430,             # Venus (Coda file system) port
    'ircs-u': 6697,            # IRCS-U (secure IRC) port
    'sa-msg-port': 12000,      # SA MSG PORT (Message Port) port
    'sysrqd': 5280,            # SYSRQD (System Request Daemon) port
    'munin': 4949,             # Munin (network monitoring) port
    'isdnlog': 3306,           # ISDN Log (ISDN logging) port
    'ripngd': 521,             # RIPng (Routing Information Protocol next generation) port
    'pcrd': 3001,              # PCRD (Pervasive Computing Radio Device) port
    'bgpsim': 4500,            # BGPSIM (BGP Simulator) port
    'afmbackup': 10001,        # AFM Backup (Advanced File Manager) port
    'pawserv': 9100,           # PAWS (Printer Access and Web Services) port
    'codaauth2': 389,          # CODA Authentication 2 (Coda file system) port
    'freeciv': 5555,           # Freeciv (game server) port
    'ospf6d': 2601,            # OSPF6D (Open Shortest Path First version 6) port
    'codasrv': 3201,            # CODASRV (CODA file system) port
    
    ## Updated using CIC-IDS17 dataset
    'vboxd': 20012,            # VBoxD (VirtualBox Daemon)
    'sgi-gcd': 472,            # SGI GCD (Graphics Cluster Daemon)
    'ipsec-nat-t': 4500,       # IPsec NAT-T (Network Address Translation Traversal)
    'zephyr-clt': 2103,        # Zephyr Client
    'sgi-cmsd': 490,           # SGI CMSD (Content Management Service Daemon)
    'babel': 6696,             # Babel (routing protocol)
    'f5-globalsite': 2792,     # F5 Global Site
    'supfiledbg': 1127,        # SUPFILD DBG
    'afs3-volser': 7005,       # AFS3 Volser (Andrew File System)
    'canna': 5680,             # Canna (Japanese Input Method)
    'mandelspawn': 9359,       # Mandelspawn
    'suucp': 4031,             # SUUCP
    'distmp3': 4600,           # Distmp3 (Distributed MP3 Encoding)
    'sge-execd': 6445,         # SGE Execd (Sun Grid Engine execution daemon)
    'afs3-rmtsys': 7500,       # AFS3 RmtSys (Andrew File System)
    'f5-iquery': 4353,         # F5 iQuery
    'distcc': 3632,            # Distcc (Distributed C Compiler)
    'rplay': 5555,             # RPlay (audio player)
    'groupwise': 1677,         # GroupWise (Novell)
    'mysql-proxy': 4040,       # MySQL Proxy
    'sieve': 4190,             # Sieve (Email Filtering)
    'remctl': 4373,            # Remctl (Remote Command Execution)
    'nsca': 5667,              # NSCA (Nagios Service Check Acceptor)
    'predict': 1210,           # Predict
    'fsp': 21,                 # FSP (File Service Protocol)
    'dicom': 104,              # DICOM (Digital Imaging and Communications in Medicine)
    'syslog-tls': 6514,        # Syslog over TLS
    'kermit': 1649,            # Kermit (file transfer protocol)
    'xtelw': 1313,             # XTELW
    'sge-qmaster': 6444,       # SGE Qmaster (Sun Grid Engine master daemon)
    'sgi-crsd': 491,           # SGI CRSD (Content Repository Service Daemon)
    'mrtd': 5725,              # MRTD (Multicast Routing)
    'gds-db': 3050,            # GDS DB (Firebird)
    'fax': 4557,               # Fax (HylaFAX)
    'afbackup': 2988,          # AFBackup (Client-Server Backup System)
    'zephyr-srv': 2102,        # Zephyr Server
    'isns': 3205,              # iSNS (Internet Storage Name Service)
    'enbd-sstatd': 4910,       # ENBD SSTATD (Enhanced Network Block Device)
    'openvpn': 1194,           # OpenVPN
    'xtel': 1313,              # XTEL
    'zephyr-hm': 2104,         # Zephyr HostManager
    'rtcm-sc104': 2101,        # RTCM-SC104 (Real-Time Correction Messages)
    'l2f': 1701,               # L2F (Layer 2 Forwarding)
    'snap': 2030               # SNAP (Simple Network Audio Protocol)
    
}

Now we have an updated dict, we can continue ...

In [8]:
def extract_id(row):
    return row['SrcAddr']+'-'+row['DstAddr']+'-'+str(row['Sport_number']).replace('.0', '')+'-'+str(row['Dport_number']).replace('.0', '')+'-'+str(row['protocol'])

In [9]:
# Function to check if the value is a string, decimal number, or hexadecimal number
def get_port_numbers(value):
    if value in port_mapping:
        return port_mapping[value]
    
    try:
        # Try to convert to an integer with base 10
        int_value = int(value)
        return int_value
    except ValueError:
        # If ValueError occurs, check if it's a valid hexadecimal
        if str(value).startswith("0x"):
            try:
                int_value = int(value, 16)  # Convert using base 16 for hex
                return int_value
            except ValueError:
                return value
        else:
            return value


In [10]:
def load_years_dfs(path):
    dfs = []
    for day in days:
        dd = day.split('-')[0].lower()
        file = path+f'{day}_features.csv'
        dfi = pd.read_csv(file)
        
        dfi['protocol'] = dfi['Proto'].map(dic_proto)
        dfi['Dport_number'] = dfi['Dport'].apply(lambda x: pd.Series(get_port_numbers(x)))
        dfi['Sport_number'] = dfi['Sport'].apply(lambda x: pd.Series(get_port_numbers(x)))
        dfi['Dport_number'] = dfi['Dport_number'].astype('Int64')
        dfi['Sport_number'] = dfi['Sport_number'].astype('Int64')
        dfi = dfi.drop(columns=['Label']) # Drop the column 'Label' generated by Argus which is empty (NaN), and this to prevent problems when merging (Label_y)
        dfi['Flow ID'] = dfi.mapply(extract_id, axis=1)
        
        df_labels = pd.read_csv(f'{main_dir}/{dd}.csv')
        df_labels = df_labels[['Flow ID', 'Label', 'Attempted Category']]
        print('labels shape before dedup', df_labels.shape)
        df_labels = df_labels.drop_duplicates()
        print('labels shape after dedup', df_labels.shape)
        df_labeled = pd.merge(dfi, df_labels, on='Flow ID')
        
        df_labeled = df_labeled.drop_duplicates()

        dfs.append(df_labeled)
    df = pd.concat(dfs)  
    return df

In [11]:
for timeout in timeouts:
    print("Processing timeout ", timeout, "...")
    #out_dir = f'/home/abdelkader.elmahdaou/lustre/data_sec-um6p-st-sccs-6sevvl76uja/IDS/mahdaouy/fixed_timeouts_v2/new_idle_{idle}min_active_{active}min/CUPID'
    out_dir=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Argus/timeout{timeout}/'
    path=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Argus/timeout{timeout}/features/'

    df = load_years_dfs(path)
    print(df.columns)
    df = df.rename(columns={"Label": "Attack"})
    df.loc[df['Attack'] == 'BENIGN', 'Attack' ] = 'Benign'
    df = df[df['Attempted Category'] == -1]
    df = df.drop(columns=['Flow ID', 'Attempted Category', 'protocol', 'Dport_number', 'Sport_number'])
    print(df['Attack'].value_counts())
    df.to_csv(f'{out_dir}/CIC-IDS-2017_zeek_{timeout}.csv', index=False, header=True)
    print("______________________________________________")
    

Processing timeout  0.5 ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351486
DoS Hulk                       225202
Portscan                       157550
DDoS                           155487
DoS GoldenEye                  128904
Infiltration - Portscan         60812
DoS Slowloris                   45804
DoS Slowhttptest                25875
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3349757
DoS Hulk                       223064
Portscan                       157549
DDoS                           155412
DoS GoldenEye                  127964
Infiltration - Portscan         60154
DoS Slowloris                   44008
DoS Slowhttptest                25616
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3350246
DoS Hulk                       223086
Portscan                       157549
DDoS                           155412
DoS GoldenEye                  127980
Infiltration - Portscan         60208
DoS Slowloris                   44006
DoS Slowhttptest                25622
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3350889
DoS Hulk                       223086
Portscan                       157549
DDoS                           155416
DoS GoldenEye                  127980
Infiltration - Portscan         60405
DoS Slowloris                   44004
DoS Slowhttptest                25624
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351007
DoS Hulk                       223084
Portscan                       157549
DDoS                           155422
DoS GoldenEye                  127978
Infiltration - Portscan         60666
DoS Slowloris                   44002
DoS Slowhttptest                25624
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351014
DoS Hulk                       223084
Portscan                       157549
DDoS                           155435
DoS GoldenEye                  127978
Infiltration - Portscan         60669
DoS Slowloris                   44002
DoS Slowhttptest                25624
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351051
DoS Hulk                       223078
Portscan                       157549
DDoS                           155435
DoS GoldenEye                  127972
Infiltration - Portscan         60669
DoS Slowloris                   43996
DoS Slowhttptest                25624
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351190
DoS Hulk                       225138
Portscan                       157549
DDoS                           155435
DoS GoldenEye                  128883
Infiltration - Portscan         60669
DoS Slowloris                   45777
DoS Slowhttptest                25862
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351463
DoS Hulk                       225179
Portscan                       157549
DDoS                           155435
DoS GoldenEye                  128891
Infiltration - Portscan         60812
DoS Slowloris                   45803
DoS Slowhttptest                25870
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351486
DoS Hulk                       225202
Portscan                       157550
DDoS                           155487
DoS GoldenEye                  128904
Infiltration - Portscan         60812
DoS Slowloris                   45804
DoS Slowhttptest                25875
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 

Processing with default timeout parameter

In [12]:
print("Processing with default timeout parameters ...")
#out_dir = f'/home/abdelkader.elmahdaou/lustre/data_sec-um6p-st-sccs-6sevvl76uja/IDS/mahdaouy/fixed_timeouts_v2/new_idle_{idle}min_active_{active}min/CUPID'
out_dir=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Argus/default/'
path=f'/home/meryem.janati/lustre/nlp_team-um6p-st-sccs-id7fz1zvotk/IDS/janati/IDS/Datasets/cicids17/Argus/default/features/'

df = load_years_dfs(path)
print(df.columns)
df = df.rename(columns={"Label": "Attack"})
df.loc[df['Attack'] == 'BENIGN', 'Attack' ] = 'Benign'
df = df[df['Attempted Category'] == -1]
df = df.drop(columns=['Flow ID', 'Attempted Category', 'protocol', 'Dport_number', 'Sport_number'])
print(df['Attack'].value_counts())
df.to_csv(f'{out_dir}/CIC-IDS-2017_zeek_default.csv', index=False, header=True)
print("______________________________________________")


Processing with default timeout parameters ...


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (547557, 3)
labels shape after dedup (396761, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (371624, 3)
labels shape after dedup (249126, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (362076, 3)
labels shape after dedup (258728, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (322078, 3)
labels shape after dedup (211729, 3)


  0%|          | 0/232 [00:00<?, ?it/s]

labels shape before dedup (496641, 3)
labels shape after dedup (243486, 3)
Index(['SrcId', 'Rank', 'StartTime', 'LastTime', 'Trans', 'Flgs', 'Seq', 'Dur',
       'RunTime', 'IdleTime',
       ...
       'TcpRtt', 'SynAck', 'AckDat', 'TcpOpt', 'protocol', 'Dport_number',
       'Sport_number', 'Flow ID', 'Label', 'Attempted Category'],
      dtype='object', length=114)
Benign                        3351486
DoS Hulk                       225202
Portscan                       157550
DDoS                           155487
DoS GoldenEye                  128904
Infiltration - Portscan         60812
DoS Slowloris                   45804
DoS Slowhttptest                25875
SSH-Patator                      8717
FTP-Patator                      7941
Botnet                            742
Web Attack - Brute Force          504
Infiltration                      288
Web Attack - XSS                  253
Heartbleed                        246
Web Attack - SQL Injection         26
Name: Attack, dtype: 