### 00. Initial imports and PATH dirs definition

In [9]:
import random
from pathlib import Path
import polars as pl
import time
import statistics
import os
from shutil import rmtree
import concurrent.futures
from prerequisites import *

MERGED_PATH = Path(Path.cwd(), "02_attacks_merged")
SHUFFLED_PATH = Path(Path.cwd(), "03_attacks_shuffled")

os.makedirs(MERGED_PATH, exist_ok=True)
try:
    os.makedirs(SHUFFLED_PATH)
except FileExistsError:
    rmtree(SHUFFLED_PATH)
    os.makedirs(SHUFFLED_PATH)
    
DS_SCHEMA = label_casting("00_type_cast_data/type_list.txt")

### 01. Appending source files and reading to polars dataframe

In [11]:
def whitelist_check(attack_name: list, whitelist: list, blacklist: list = []):   
    # if there is a benign traffic - pass it
    if attack_name.lower().find("benign") != -1:
        return True
    # if there is blacklisted element
    for el in blacklist:
        if attack_name.find(el) != -1:
            return False
    # if given element is on the whitelist
    for el in whitelist:
        if attack_name.find(el) != -1:
            return True
        
    # if there is no element on "whitelist" - everything is passed
    if len(whitelist) == 0:
        return True

dfs = []
attack_names = []

whitelist = []#["Sql", "Uploading", "Vulnerability", "XSS", "Dict"]
blacklist = []

for path_attack in MERGED_PATH.iterdir():
    if not whitelist_check(path_attack.stem, whitelist):
        continue
    if path_attack.stem.find("Benign") != -1:
        attack_names.insert(0, path_attack.stem)
        dfs.insert(0, pl.read_csv(path_attack, schema_overrides = DS_SCHEMA))
        continue
    
    attack_names.append(path_attack.stem)
    dfs.append(pl.read_csv(path_attack, schema_overrides = DS_SCHEMA))

### 02. Attack shuffling

In [12]:
def shuffle_attack(attack: pl.DataFrame, benign: pl.DataFrame, it: int, skip_concat = True, rechunking = True):
    # INIT INFO
    curr_data = "SHUFFLING attack {}/{}: {}".format(it, len(dfs)-1, attack_names[it])
    attack_update(curr_data)
    print("\nSHUFFLING ATTACK: {}\n".format(attack_names[it]))

    merged_dataframe = attack.clear()

    benign_rows = len(benign)
    attack_rows = len(attack)

    benign_row_counter = 0
    attack_row_counter = 0

    # TIME STATS
    mean_time_10k = []
    mean_time_100k = []
    start_benign_t = time.time()
    start_att_t = time.time()

    start_chunk = time.time()

    while(True):
        if random.randint(1, 10) > 3 and benign_row_counter < benign_rows:
            merged_dataframe = pl.concat([merged_dataframe, benign[benign_row_counter]], rechunk = rechunking)
            benign_row_counter += 1
            if benign_row_counter % 10000 == 0:
                curr_benign_t = time.time()
                mean_time_10k.append(round(curr_benign_t-start_benign_t, 2))

                attack_current_state(attack_names[it], round((benign_row_counter/benign_rows), 2), round((attack_row_counter/attack_rows), 2))

                print(f"{attack_names[it]} ## BENIGN: {benign_row_counter} | ATTACK: {attack_row_counter} || MEAN_T: {statistics.mean(mean_time_10k)}s, CURR_T: {curr_benign_t-start_benign_t}s")
                start_benign_t = time.time()
        elif attack_row_counter < attack_rows:
            # if there are no benign traffic left - just copy the rest of the attack flows
            if benign_row_counter >= benign_rows:
                merged_dataframe = pl.concat([merged_dataframe, attack[attack_row_counter:]], rechunk = rechunking)
                attack_row_counter = attack_rows
                break
            merged_dataframe = pl.concat([merged_dataframe, attack[attack_row_counter]], rechunk = rechunking)
            attack_row_counter += 1
            if attack_row_counter % 10000 == 0:
                curr_att_t = time.time()
                mean_time_100k.append(round(curr_att_t-start_att_t, 2))

                attack_current_state(attack_names[it], round((benign_row_counter/benign_rows), 2), round((attack_row_counter/attack_rows), 2))

                print(f"{attack_names[it]} ## BENIGN: {benign_row_counter} | ATTACK: {attack_row_counter} || MEAN_T: {statistics.mean(mean_time_100k)}s, CURR_T:{curr_att_t-start_att_t}s")
                start_att_t = time.time()
        else:
            break
    
    stop_chunk = time.time()

    INFO_end = "INFO: dataframe creation of an attack %s completed with\n attacks: %d\n benign: %d\nTIME ELAPSED: %d"%(attack_names[it], attack_row_counter, benign_row_counter, stop_chunk-start_chunk)
    print(INFO_end)

    # sending update to ntfy
    attack_update(INFO_end)

    merged_dataframe.write_csv(Path(SHUFFLED_PATH, "{}-shuffled.csv".format(attack_names[it])))

List attack dataframes (names and shapes)

In [13]:
print(attack_names)

for it in range(len(dfs)):
    print("{iter} | {name} - {shape}".format(iter=it, name=attack_names[it], shape = dfs[it].shape))

['BenignTraffic', 'DictionaryBruteForce', 'SqlInjection', 'Uploading_Attack', 'VulnerabilityScan', 'XSS']
0 | BenignTraffic - (1098195, 47)
1 | DictionaryBruteForce - (13064, 47)
2 | SqlInjection - (5245, 47)
3 | Uploading_Attack - (1252, 47)
4 | VulnerabilityScan - (37382, 47)
5 | XSS - (3846, 47)


### 03. Shuffling attacks with concurrency

In [20]:
WORKERS = 4

In [21]:
try:
    os.makedirs(SHUFFLED_PATH)
except FileExistsError:
    rmtree(SHUFFLED_PATH)
    os.makedirs(SHUFFLED_PATH)

df_benign = dfs[0]

work_start = time.time()

with concurrent.futures.ThreadPoolExecutor(max_workers=WORKERS) as executor:
    futures = {executor.submit(shuffle_attack, dfs[it], df_benign, it) for it in range(1, len(dfs))}
    for future in concurrent.futures.as_completed(futures):
        try:
            processed_name = future.result()

            if processed_name == None:
                continue
        except Exception as e:
            print(e)
            
work_end = time.time()

print("FOR WORKERS {} - full time: {}s".format(WORKERS, round(work_end-work_start, 2)))


SHUFFLING ATTACK: DictionaryBruteForce


SHUFFLING ATTACK: SqlInjection


SHUFFLING ATTACK: Uploading_Attack


SHUFFLING ATTACK: VulnerabilityScan

INFO: dataframe creation of an attack Uploading_Attack completed with
 attacks: 1252
 benign: 2891
TIME ELAPSED: 2

SHUFFLING ATTACK: XSS

SqlInjection ## BENIGN: 10000 | ATTACK: 4247 || MEAN_T: 17.33s, CURR_T: 17.333845615386963s
DictionaryBruteForce ## BENIGN: 10000 | ATTACK: 4242 || MEAN_T: 17.35s, CURR_T: 17.34684920310974s
VulnerabilityScan ## BENIGN: 10000 | ATTACK: 4164 || MEAN_T: 17.38s, CURR_T: 17.378854036331177s
INFO: dataframe creation of an attack XSS completed with
 attacks: 3846
 benign: 9181
TIME ELAPSED: 15
INFO: dataframe creation of an attack SqlInjection completed with
 attacks: 5245
 benign: 12365
TIME ELAPSED: 24
DictionaryBruteForce ## BENIGN: 20000 | ATTACK: 8386 || MEAN_T: 23.345s, CURR_T: 29.338913917541504s
VulnerabilityScan ## BENIGN: 20000 | ATTACK: 8461 || MEAN_T: 23.555s, CURR_T: 29.73350191116333s
Vulnerabil