## Produce Datasets in Our Format, DeepCoFFEA Format, and MixCorr Format from Raw `exp01` Dataset

In [1]:
from glob import glob
from os import makedirs
from shutil import rmtree
from datetime import datetime
from os.path import join

from lib_analyze_exps import parse_run, create_mixcorr_dataset

In [2]:
### MODIFY BEGIN ###

# Specify the name of the experiment that is supposed to be processed from raw.
# The name is embedded into the directory names of the raw dataset and of all
# processed datasets ("dataset_<EXP_NAME>_...").
EXP_NAME = "exp01_nym-binaries-1.0.2_static-http-download"

# Specify parent directory of raw experimental results directory.
# The processed dataset directories are created below this directory as well.
RAW_EXP_DIR_PARENT = "/PRIVATE_PATH/"

###  MODIFY END  ###

In [3]:
# Derive every dataset location from RAW_EXP_DIR_PARENT and EXP_NAME:
#   * RAW_EXP_DIR          => raw experimental result files repository (input),
#   * PROC_EXP_DIR_OUR     => dataset in our flowpair format (output),
#   * PROC_EXP_DIR_DCOFFEA => dataset in DeepCoFFEA flowpair format (output),
#   * PROC_EXP_DIR_MIXCORR => dataset in MixCorr flowpair format (output).
#
# CAUTION: The filesystem locations PROC_EXP_DIR_* will be deleted if they already exist.

# All four directories are siblings sharing the same name prefix; the three
# processed variants additionally share the same name suffix.
_dataset_prefix = join(RAW_EXP_DIR_PARENT, f"dataset_{EXP_NAME}")
_filtered_suffix = "filtered-to-start-end"

RAW_EXP_DIR = f"{_dataset_prefix}_raw"
PROC_EXP_DIR_OUR = f"{_dataset_prefix}_{_filtered_suffix}"
PROC_EXP_DIR_DCOFFEA = f"{_dataset_prefix}_format-deepcoffea_{_filtered_suffix}"
PROC_EXP_DIR_MIXCORR = f"{_dataset_prefix}_format-mixcorr_{_filtered_suffix}"

In [None]:
# Start from a clean slate: remove any previous version of the three processed
# dataset directories, then create them anew (empty, mode 0o755).

processed_dirs = (PROC_EXP_DIR_OUR, PROC_EXP_DIR_DCOFFEA, PROC_EXP_DIR_MIXCORR)

# Directories that do not exist yet are fine, hence ignore_errors.
for processed_dir in processed_dirs:
    rmtree(processed_dir, ignore_errors=True)

for processed_dir in processed_dirs:
    makedirs(processed_dir, mode=0o755)

In [None]:
# Collect, in sorted order, the path of every 'SUCCEEDED' marker found in a
# run directory ('exp01_curl_run_*') one level below the raw dataset directory.
succeeded_marker_pattern = join(RAW_EXP_DIR, "*", "exp01_curl_run_*", "SUCCEEDED")
runs_succeeded = sorted(glob(succeeded_marker_pattern))

# Sanity check of the number of runs found. For exp01, this should give: 35,266.
print(format(len(runs_succeeded), ","))

In [None]:
# Process every succeeded raw run, handing parse_run the two output dataset
# directories (our format and DeepCoFFEA format), the run's number, and the
# path of its SUCCEEDED marker. Timestamps are printed before and after so
# that the duration of the processing can be read off the cell output.
print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Beginning to process raw exp01 dataset -----\n")

# Runs are numbered consecutively starting at 1, in the (sorted) order of
# runs_succeeded.
for run_ctr, run_succeeded in enumerate(runs_succeeded, start=1):
    
    parse_run(
        PROC_EXP_DIR_OUR,
        PROC_EXP_DIR_DCOFFEA,
        run_ctr,
        run_succeeded)

print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Finished processing raw exp01 dataset -----")

In [4]:
# Temporarily keep these lines until we run the entire Notebook again.
# They repeat the delete-and-recreate step for the MixCorr dataset directory
# only, so that the MixCorr dataset can be regenerated into an empty directory
# without rerunning the cells that reset and rebuild the other datasets.
rmtree(PROC_EXP_DIR_MIXCORR, ignore_errors = True)
makedirs(PROC_EXP_DIR_MIXCORR, mode=0o755)

In [5]:
# Log a start timestamp so that the duration of the conversion can be read off the cell output.
print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Beginning to produce MixCorr-formatted dataset for exp01 -----\n")

# Create dataset in MixCorr format based on existing dataset in our format.
# PROC_EXP_DIR_OUR must therefore already be populated when this cell runs.
create_mixcorr_dataset(PROC_EXP_DIR_OUR, PROC_EXP_DIR_MIXCORR)

# Log the matching end timestamp.
print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Finished producing MixCorr-formatted dataset for exp01 -----")

----- [2023-02-23_13:48:55.820226] Beginning to produce MixCorr-formatted dataset for exp01 -----

Considering flowpairs of dataset split 'train' now...
Done with flowpairs of dataset split 'train'

Considering flowpairs of dataset split 'val' now...
Done with flowpairs of dataset split 'val'

Considering flowpairs of dataset split 'test' now...
Done with flowpairs of dataset split 'test'

----- [2023-02-23_14:12:48.861306] Finished producing MixCorr-formatted dataset for exp01 -----


In [6]:
# Split files in newly produced MixCorr-formatted dataset of at least 90 MiB into
# multiple files in order to be able to commit to GitHub.

files_too_large = ! find {PROC_EXP_DIR_MIXCORR} -type f -size +90M

for file_too_large in files_too_large:
    
    ! split --verbose --lines=5000 --numeric-suffixes=01 {file_too_large} {file_too_large}.
    ! cat {file_too_large}.?? > {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large
    ! cmp --verbose --print-bytes {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large {file_too_large}
    ! rm {file_too_large} {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large

creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.01'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.02'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.03'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.04'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_ack.05'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_to_gateway_data.01'
creating file '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0