### 00. Initial imports and PATH dirs definition

In [10]:
import random
from pathlib import Path
import polars as pl
import time
import os
from shutil import rmtree
from prerequisites import attack_update, label_casting

# Working directories, both resolved against the current working directory.
MERGED_PATH = Path.cwd() / "02_attacks_merged"
SHUFFLED_PATH = Path.cwd() / "03_attacks_shuffled"

# The merged-input directory is created only when missing; existing content is kept.
os.makedirs(MERGED_PATH, exist_ok=True)

# The shuffled-output directory is wiped and recreated empty on every run of this cell.
if SHUFFLED_PATH.exists():
    rmtree(SHUFFLED_PATH)
os.makedirs(SHUFFLED_PATH)

# Column schema, later passed to polars as `schema_overrides` when reading the CSVs.
DS_SCHEMA = label_casting("00_type_cast_data/type_list.txt")

### 01. Appending source files

In [11]:
def whitelist_check(attack_name: str, whitelist: list, blacklist: list = None) -> bool:
    """Decide whether a file named `attack_name` should be processed.

    Rules, applied in this order:
      1. names containing "benign" (case-insensitive) are always accepted;
      2. names containing any `blacklist` fragment are rejected;
      3. an empty `whitelist` accepts everything that is left;
      4. otherwise the name must contain at least one `whitelist` fragment.

    Fragment matching for the white/blacklist is case-sensitive substring search.

    Args:
        attack_name: name to test (the callers pass the file stem).
        whitelist: fragments to accept; an empty list means "accept all".
        blacklist: fragments to reject; `None` (the default) means no blacklist.

    Returns:
        True if the name passes the filter, False otherwise.
    """
    # benign traffic is always needed, regardless of the filters
    if "benign" in attack_name.lower():
        return True

    # `None` default instead of a shared mutable `[]` default
    if any(fragment in attack_name for fragment in (blacklist or ())):
        return False

    # no whitelist configured - everything not blacklisted passes
    if not whitelist:
        return True

    # explicit bool: a non-matching name yields False rather than an implicit None
    return any(fragment in attack_name for fragment in whitelist)

# Name fragments used to filter the merged files,
# e.g. whitelist = ["DoS", "Mirai"], blacklist = ["DDoS"]. Empty lists disable filtering.
whitelist = []
blacklist = []

# Parallel lists: attack_names[i] is the stem of attack_paths[i].
attack_names = []
attack_paths = []

for path_attack in MERGED_PATH.iterdir():
    stem = path_attack.stem
    if whitelist_check(stem, whitelist, blacklist):
        # A file with "Benign" in its name is placed at index 0; all others are appended.
        position = 0 if "Benign" in stem else len(attack_paths)
        attack_names.insert(position, stem)
        attack_paths.insert(position, path_attack)

In [12]:
print(attack_names)

['00_Benign_Final', 'Backdoor_Malware', 'BrowserHijacking', 'CommandInjection', 'DDoS-ACK_Fragmentation', 'DDoS-HTTP_Flood', 'DDoS-ICMP_Flood', 'DDoS-ICMP_Fragmentation', 'DDoS-SlowLoris', 'DDoS-TCP_Flood', 'DDoS-UDP_Flood', 'DDoS-UDP_Fragmentation', 'DictionaryBruteForce', 'DNS_Spoofing', 'DoS-HTTP_Flood', 'DoS-TCP_Flood', 'DoS-UDP_Flood', 'Mirai-greeth_flood', 'Mirai-greip_flood', 'Mirai-udpplain', 'MITM-ArpSpoofing', 'Recon-HostDiscovery', 'Recon-OSScan', 'Recon-PingSweep', 'Recon-PortScan', 'SqlInjection', 'Uploading_Attack', 'VulnerabilityScan', 'XSS']


### 02. Attack shuffling

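Define `shuffle_attack`, which interleaves the rows of one attack file with the rows of the benign traffic file and writes the result to `SHUFFLED_PATH`.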

In [13]:
def _newline_terminated(line: str) -> str:
    """Return `line` unchanged if it ends with a newline, otherwise with one appended."""
    return line if line.endswith("\n") else line + "\n"


def shuffle_attack(attack_path: Path, benign_path: Path, att_name: str, it: int, att_list_len: int) -> None:
    """Interleave the rows of one attack CSV with the rows of the benign CSV.

    The result is written to `SHUFFLED_PATH / "<att_name>-shuffled.csv"`. The header
    is taken from the benign file; the first line of the attack file is skipped.
    Rows keep their relative order inside each source; only the interleaving is random.

    On every step a number from 1..10 is drawn: 4..10 takes the next benign row,
    1..3 takes the next attack row. When the benign file is exhausted, every
    following step takes an attack row until the attack file ends. When the attack
    file is exhausted first, the loop ends on the first draw of 1..3, so the
    remaining benign rows are not written.
    NOTE(review): this keeps the mix close to 70/30 for small attack files, but
    confirm that dropping the leftover benign rows is intended.

    Args:
        attack_path: CSV file with the attack rows.
        benign_path: CSV file with the benign rows.
        att_name: attack name, used for the output file name and the log lines.
        it: 1-based position of this attack, used only in the log line.
        att_list_len: total number of attacks, used only in the log line.
    """
    log = "SHUFFLING attack {}/{}: {}".format(it, att_list_len, att_name)
    attack_update(log)
    print(log)

    if_benign_finished = False
    if_attack_finished = False

    attack_num_counter = 0
    benign_num_counter = 0

    shuffled_path = Path(SHUFFLED_PATH, "{}-shuffled.csv".format(att_name))

    # All three files are managed by `with`, so they are closed even if a read or write fails.
    with open(shuffled_path, "w") as file_shuffled, \
            open(benign_path, "r") as benign_file, \
            open(attack_path, "r") as attack_file:
        header_benign = benign_file.readline()
        attack_file.readline()  # skip the attack header; the benign header is used

        if header_benign != "":
            file_shuffled.write(_newline_terminated(header_benign))

        time_processing_start = time.time()

        while True:
            if random.randint(1, 10) > 3 and not if_benign_finished:
                line = benign_file.readline()
                # if benign file reaches EOF
                if line == "":
                    if_benign_finished = True
                    continue
                benign_num_counter += 1
                # a last row without a trailing newline must not merge with the next row
                file_shuffled.write(_newline_terminated(line))
            elif not if_attack_finished:
                line = attack_file.readline()
                # if attack file reaches EOF
                if line == "":
                    if_attack_finished = True
                    continue
                attack_num_counter += 1
                file_shuffled.write(_newline_terminated(line))
            else:
                break

        time_processing_stop = time.time()

    INFO_end = "INFO: dataset creation of an attack {} completed with\n attacks: {}\n benign: {}\nTIME ELAPSED: {}s".format(att_name, attack_num_counter, benign_num_counter, round(time_processing_stop-time_processing_start, 2))

    print(INFO_end)

Iterate over every attack file in `attack_paths` and shuffle each one separately with the benign traffic, which is stored at index 0 of `attack_paths`.

In [14]:
# Index 0 holds the benign file, so the attacks are entries 1..end.
attack_total = len(attack_paths) - 1
attack_entries = zip(attack_names[1:], attack_paths[1:])

for it, (current_name, current_path) in enumerate(attack_entries, start=1):
    shuffle_attack(
        attack_path=current_path,
        benign_path=attack_paths[0],
        att_name=current_name,
        it=it,
        att_list_len=attack_total,
    )

SHUFFLING attack 1/28: Backdoor_Malware
INFO: dataset creation of an attack Backdoor_Malware completed with
 attacks: 622
 benign: 1278
TIME ELAPSED: 0.05s
SHUFFLING attack 2/28: BrowserHijacking
INFO: dataset creation of an attack BrowserHijacking completed with
 attacks: 933
 benign: 2084
TIME ELAPSED: 0.08s
SHUFFLING attack 3/28: CommandInjection
INFO: dataset creation of an attack CommandInjection completed with
 attacks: 847
 benign: 2100
TIME ELAPSED: 0.08s
SHUFFLING attack 4/28: DDoS-ACK_Fragmentation
INFO: dataset creation of an attack DDoS-ACK_Fragmentation completed with
 attacks: 8821164
 benign: 135740
TIME ELAPSED: 208.25s
SHUFFLING attack 5/28: DDoS-HTTP_Flood
INFO: dataset creation of an attack DDoS-HTTP_Flood completed with
 attacks: 591468
 benign: 135740
TIME ELAPSED: 19.78s
SHUFFLING attack 6/28: DDoS-ICMP_Flood
INFO: dataset creation of an attack DDoS-ICMP_Flood completed with
 attacks: 71242
 benign: 135740
TIME ELAPSED: 4.92s
SHUFFLING attack 7/28: DDoS-ICMP_Fragm

### 03. Test/Train division

In [15]:
import numpy as np

# Root directory of the train/test split and its two sub-directories.
INPUT_PATH = Path.cwd() / "03.2_attacks_shuffled_divided"
DIVIDED_TRAIN_PATH = INPUT_PATH / "train"
DIVIDED_TEST_PATH = INPUT_PATH / "test"

# Start from an empty tree on every run: remove a previous result, then recreate all three directories.
if INPUT_PATH.exists():
    rmtree(INPUT_PATH)
for directory in (INPUT_PATH, DIVIDED_TRAIN_PATH, DIVIDED_TEST_PATH):
    os.makedirs(directory)

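Define `divide_test_train`, which splits one shuffled file row by row into a train part and a test part (about 20% of the rows go to test) and prints the label balance of both parts.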

In [16]:
def _label_balance_report(title: str, label_column) -> str:
    """Build the "<title> dataset:" text with one "name - count (percent%)" line per label."""
    names, counts = np.unique(label_column, return_counts=True)
    total = counts.sum()
    rows = [
        "\t{} - {} ({}%)".format(name, count, round((count / total) * 100, 2))
        for name, count in zip(names, counts)
    ]
    return "\n".join(["{} dataset:".format(title)] + rows)


def divide_test_train(att_path: Path, att_name: str, it: int, att_list_len: int) -> None:
    """Split one shuffled CSV into a train file and a test file and print the label balance.

    The header line is copied to both outputs. For every following row a number
    from 1..10 is drawn; 9 and 10 send the row to the test file, everything else
    to the train file. Outputs are `DIVIDED_TRAIN_PATH / "train_<att_name>.csv"`
    and `DIVIDED_TEST_PATH / "test_<att_name>.csv"`.

    Both outputs are then read back with polars and the count and percentage of
    every value in the `label` column is printed.

    Args:
        att_path: shuffled CSV file to split.
        att_name: name used for the output file names and the printed title.
        it: 1-based position of this file, used only in the printed title.
        att_list_len: total number of files, used only in the printed title.
    """
    DIVIDED_TRAIN_ATT_PATH = Path(DIVIDED_TRAIN_PATH, "train_{}.csv".format(att_name))
    DIVIDED_TEST_ATT_PATH = Path(DIVIDED_TEST_PATH, "test_{}.csv".format(att_name))

    # `with` closes all three files even if a read or write fails part-way through
    with open(DIVIDED_TRAIN_ATT_PATH, "w") as divided_train, \
            open(DIVIDED_TEST_ATT_PATH, "w") as divided_test, \
            open(att_path, "r") as attack_source:
        # creating header
        header = attack_source.readline()
        divided_test.write(header)
        divided_train.write(header)

        for line in attack_source:
            if random.randint(1, 10) > 8:
                divided_test.write(line)
            else:
                divided_train.write(line)

    # quick check of attack-to-benign ratio
    # --------------------------------------
    train_df = pl.read_csv(DIVIDED_TRAIN_ATT_PATH, schema_overrides=DS_SCHEMA)
    test_df = pl.read_csv(DIVIDED_TEST_ATT_PATH, schema_overrides=DS_SCHEMA)

    print("{}/{} - {}\n======================".format(it, att_list_len, att_name))

    # Every label found is listed, so a split holding fewer (or more) than two
    # labels is reported instead of failing on a fixed [0]/[1] index.
    print(_label_balance_report("TRAIN", train_df.select('label').to_numpy()))
    print(_label_balance_report("TEST", test_df.select('label').to_numpy()) + "\n\n")
    # --------------------------------------
    # --------------------------------------

Divide every shuffled file in `SHUFFLED_PATH` into its train and test parts.

In [17]:
# Regular files found in the shuffled directory; sub-directories are ignored.
shuffled_files = [entry for entry in SHUFFLED_PATH.iterdir() if entry.is_file()]
ATT_LIST_LEN = len(shuffled_files)

# `counter` is the 1-based position shown in the progress title of each split.
counter = 1
for el in shuffled_files:
    divide_test_train(el, el.stem, counter, ATT_LIST_LEN)
    counter += 1

1/28 - Backdoor_Malware-shuffled
TRAIN dataset:
	Backdoor_Malware - 509 (33.38%)
	BenignTraffic - 1016 (66.62%)
TEST dataset:
	Backdoor_Malware - 113 (30.13%)
	BenignTraffic - 262 (69.87%)


2/28 - BrowserHijacking-shuffled
TRAIN dataset:
	BenignTraffic - 1684 (69.24%)
	BrowserHijacking - 748 (30.76%)
TEST dataset:
	BenignTraffic - 400 (68.38%)
	BrowserHijacking - 185 (31.62%)


3/28 - CommandInjection-shuffled
TRAIN dataset:
	BenignTraffic - 1668 (71.56%)
	CommandInjection - 663 (28.44%)
TEST dataset:
	BenignTraffic - 432 (70.13%)
	CommandInjection - 184 (29.87%)


4/28 - DDoS-ACK_Fragmentation-shuffled
TRAIN dataset:
	BenignTraffic - 108874 (1.52%)
	DDoS-ACK_Fragmentation - 7057223 (98.48%)
TEST dataset:
	BenignTraffic - 26866 (1.5%)
	DDoS-ACK_Fragmentation - 1763941 (98.5%)


5/28 - DDoS-HTTP_Flood-shuffled
TRAIN dataset:
	BenignTraffic - 108642 (18.68%)
	DDoS-HTTP_Flood- - 472952 (81.32%)
TEST dataset:
	BenignTraffic - 27098 (18.61%)
	DDoS-HTTP_Flood- - 118516 (81.39%)


6/28 - DDo