## Produce Datasets in Our Format, DeepCoFFEA Format, and MixCorr Format from Raw `exp06` Dataset

In [1]:
from datetime import datetime
from glob import glob
from os import makedirs
from os.path import getsize, join
from shutil import copyfileobj, rmtree

import numpy as np

from lib_analyze_exps import parse_run, walk_one_level, create_mixcorr_dataset

In [2]:
### MODIFY BEGIN ###

# Directory that contains the raw experimental results directory. All dataset
# directories used by this notebook are located directly below it.
RAW_EXP_DIR_PARENT = "/PRIVATE_PATH/"

# Name of the experiment whose raw results are to be processed.
EXP_NAME = "exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay"

###  MODIFY END  ###

In [3]:
# Derive every dataset location from the settings above. All of them live directly
# below RAW_EXP_DIR_PARENT and share the name prefix "dataset_<EXP_NAME>":
#   * RAW_EXP_DIR          => raw experimental result files repository,
#   * PROC_EXP_DIR_OUR     => dataset in our flowpair format, produced by this notebook,
#   * PROC_EXP_DIR_DCOFFEA => dataset in DeepCoFFEA flowpair format, produced by this notebook,
#   * PROC_EXP_DIR_MIXCORR => dataset in MixCorr flowpair format, produced by this notebook.
#
# CAUTION: The filesystem locations PROC_EXP_DIR_* will be deleted if they already exist.
dataset_prefix = join(RAW_EXP_DIR_PARENT, f"dataset_{EXP_NAME}")

RAW_EXP_DIR = dataset_prefix + "_raw"
PROC_EXP_DIR_OUR = dataset_prefix + "_filtered-to-start-end"
PROC_EXP_DIR_DCOFFEA = dataset_prefix + "_format-deepcoffea_filtered-to-start-end"
PROC_EXP_DIR_MIXCORR = dataset_prefix + "_format-mixcorr_filtered-to-start-end"

In [None]:
# Start from a clean slate: remove any previous version of the three processed dataset
# directories, then create them again (empty, with mode 0o755).

processed_dataset_dirs = (PROC_EXP_DIR_OUR, PROC_EXP_DIR_DCOFFEA, PROC_EXP_DIR_MIXCORR)

# A directory that does not exist yet is not an error, hence ignore_errors.
for processed_dataset_dir in processed_dataset_dirs:
    rmtree(processed_dataset_dir, ignore_errors=True)

for processed_dataset_dir in processed_dataset_dirs:
    makedirs(processed_dataset_dir, mode=0o755)

In [None]:
# Collect the runs to process: every path named SUCCEEDED that sits inside an
# "exp06_curl_run_*" directory, two levels below the raw dataset directory.
succeeded_pattern = join(RAW_EXP_DIR, "*", "exp06_curl_run_*", "SUCCEEDED")
runs_succeeded = glob(succeeded_pattern)
runs_succeeded.sort()

# For exp06, this should give: 35,801.
print(f"{len(runs_succeeded):,}")

In [None]:
# Special case for exp06: We had to 'split' the gateway-mixnodes log files as they were
# too big to push to GitHub. Thus, before we run the production steps below, we reverse
# the split step by concatenating each two pieces into one unified log file again.
#
# The merge and its verification are done in pure Python rather than by shelling out to
# cat/wc/head: paths are never interpreted by a shell (so spaces or special characters
# in them are harmless), the checks do not depend on the output format of the local
# 'wc' implementation, and a missing piece fails right away with FileNotFoundError.

exp_folders = walk_one_level(RAW_EXP_DIR, True)

print("Unifying split gateway-mixnodes log files into a single log file again...\n")

# idx is the 1-based position of the experiment folder, used only for the progress output.
for idx, exp_folder in enumerate(exp_folders, start=1):

    gw_mix_log_one = join(exp_folder, "logs_docker-run_gateway-mixnodes.log.aa")
    gw_mix_log_two = join(exp_folder, "logs_docker-run_gateway-mixnodes.log.ab")
    gw_mix_log = join(exp_folder, "logs_docker-run_gateway-mixnodes.log")

    # Concatenate log one and log two (in this order) into the original log file.
    # Binary mode copies the bytes unchanged; an existing merged file is overwritten.
    with open(gw_mix_log, "wb") as merged_file:
        for gw_mix_log_piece in (gw_mix_log_one, gw_mix_log_two):
            with open(gw_mix_log_piece, "rb") as piece_file:
                copyfileobj(piece_file, merged_file)

    # The merged file must contain exactly as many bytes as both pieces together.
    size_pieces = getsize(gw_mix_log_one) + getsize(gw_mix_log_two)
    size_merged = getsize(gw_mix_log)

    assert size_pieces == size_merged, \
        f"Merged log has {size_merged:,} bytes, but its pieces sum up to {size_pieces:,} bytes: {gw_mix_log}"

    # Compare the first 10 lines of log one and of the merged log to ensure correct order.
    with open(gw_mix_log_one, "rb") as piece_file, open(gw_mix_log, "rb") as merged_file:
        first_ten_lines_one = [piece_file.readline() for _ in range(10)]
        first_ten_lines_merged = [merged_file.readline() for _ in range(10)]

    assert first_ten_lines_one == first_ten_lines_merged, \
        f"Merged log does not start with the content of its first piece: {gw_mix_log}"

    print(f"\t{idx}) Successfully merged split gateway-mixnodes log files into:\n\t{gw_mix_log}\n")

In [None]:
# Convert every succeeded run of the raw dataset via parse_run, which is handed the
# output directories of our format and of the DeepCoFFEA format, the 1-based number
# of the run, and the path discovered for the run.

# Timestamp layout shared by the begin/end progress messages.
timestamp_format = '%Y-%m-%d_%H:%M:%S.%f'

print(f"----- [{datetime.now().strftime(timestamp_format)}] Beginning to process raw exp06 dataset -----\n")

# enumerate() keeps the run counter in lockstep with the loop (first run is number 1).
for run_ctr, run_succeeded in enumerate(runs_succeeded, start=1):

    parse_run(
        PROC_EXP_DIR_OUR,
        PROC_EXP_DIR_DCOFFEA,
        run_ctr,
        run_succeeded)

print(f"----- [{datetime.now().strftime(timestamp_format)}] Finished processing raw exp06 dataset -----")

In [4]:
# Stop-gap until the entire notebook is run again from the top: reset only the
# MixCorr output directory (remove it if present, then create it empty).
rmtree(PROC_EXP_DIR_MIXCORR, ignore_errors=True)
makedirs(PROC_EXP_DIR_MIXCORR, mode=0o755)

In [5]:
# Layout of the timestamps in the begin/end progress messages.
mixcorr_ts_format = '%Y-%m-%d_%H:%M:%S.%f'

started_at = datetime.now().strftime(mixcorr_ts_format)
print(f"----- [{started_at}] Beginning to produce MixCorr-formatted dataset for exp06 -----\n")

# The MixCorr-formatted dataset is derived from the already existing dataset in our format.
create_mixcorr_dataset(PROC_EXP_DIR_OUR, PROC_EXP_DIR_MIXCORR)

finished_at = datetime.now().strftime(mixcorr_ts_format)
print(f"----- [{finished_at}] Finished producing MixCorr-formatted dataset for exp06 -----")

----- [2023-02-23_16:50:39.129113] Beginning to produce MixCorr-formatted dataset for exp06 -----

Considering flowpairs of dataset split 'train' now...
Done with flowpairs of dataset split 'train'

Considering flowpairs of dataset split 'val' now...
Done with flowpairs of dataset split 'val'

Considering flowpairs of dataset split 'test' now...
Done with flowpairs of dataset split 'test'

----- [2023-02-23_17:16:42.837233] Finished producing MixCorr-formatted dataset for exp06 -----


In [6]:
# Split files in newly produced MixCorr-formatted dataset of at least 90 MiB into
# multiple files in order to be able to commit to GitHub.

files_too_large = ! find {PROC_EXP_DIR_MIXCORR} -type f -size +90M

for file_too_large in files_too_large:
    
    ! split --verbose --lines=5000 --numeric-suffixes=01 {file_too_large} {file_too_large}.
    ! cat {file_too_large}.?? > {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large
    ! cmp --verbose --print-bytes {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large {file_too_large}
    ! rm {file_too_large} {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large

creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.01'
creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.02'
creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.03'
creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.04'
creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.05'
creating file '/PRIVATE_PATH/dataset_exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay_format-mixcorr_filtered-to-sta