## Produce Datasets in Our Format, DeepCoFFEA Format, and MixCorr Format from Raw `exp08` Dataset

In [1]:
from glob import glob
from os import makedirs
from shutil import rmtree
from datetime import datetime
from os.path import join

from lib_analyze_exps import parse_run, create_mixcorr_dataset

In [None]:
# We expect Notebook 'exp08_1_check-raw-dataset.ipynb' to have been run _before_ this Notebook is run.
# That Notebook reassembles large files from their `split` parts, and this Notebook
# expects the reassembled large files to be present.

In [2]:
### MODIFY BEGIN ###
# User-tunable settings: the raw and processed dataset paths of this Notebook
# are constructed from the two values in this cell.

# Specify the name of the experiment that is supposed to be processed from raw.
# It is embedded into the directory names of the raw and all processed datasets.
EXP_NAME = "exp08_nym-binaries-v1.1.13_static-http-download"

# Specify parent directory of raw experimental results directory.
# The processed dataset directories are placed below this same parent directory.
RAW_EXP_DIR_PARENT = "/PRIVATE_PATH/"

###  MODIFY END  ###

In [3]:
# Derive every dataset location from the experiment name. All locations are
# sibling directories below RAW_EXP_DIR_PARENT:
#   * RAW_EXP_DIR          => raw experimental result files repository,
#   * PROC_EXP_DIR_OUR     => dataset in our flowpair format (produced by this notebook),
#   * PROC_EXP_DIR_DCOFFEA => dataset in DeepCoFFEA flowpair format (produced by this notebook),
#   * PROC_EXP_DIR_MIXCORR => dataset in MixCorr flowpair format (produced by this notebook).
#
# CAUTION: The filesystem locations PROC_EXP_DIR_* will be deleted if they already exist.
dataset_stem = f"dataset_{EXP_NAME}"
filtered_suffix = "filtered-to-start-end"

RAW_EXP_DIR = join(RAW_EXP_DIR_PARENT, dataset_stem + "_raw")
PROC_EXP_DIR_OUR = join(RAW_EXP_DIR_PARENT, dataset_stem + "_" + filtered_suffix)
PROC_EXP_DIR_DCOFFEA = join(RAW_EXP_DIR_PARENT, dataset_stem + "_format-deepcoffea_" + filtered_suffix)
PROC_EXP_DIR_MIXCORR = join(RAW_EXP_DIR_PARENT, dataset_stem + "_format-mixcorr_" + filtered_suffix)

In [4]:
# Start from a clean slate: first remove any previous version of every final
# dataset directory, then create each of them anew.
processed_dirs = (PROC_EXP_DIR_OUR, PROC_EXP_DIR_DCOFFEA, PROC_EXP_DIR_MIXCORR)

# Removal ignores errors (e.g., a directory that does not exist yet).
for processed_dir in processed_dirs:
    rmtree(processed_dir, ignore_errors=True)

# makedirs() raises if a directory is still present at this point.
for processed_dir in processed_dirs:
    makedirs(processed_dir, mode=0o755)

In [5]:
# Discover all succeeded runs to process. Each match is the path of a
# 'SUCCEEDED' entry inside an 'exp08_curl_run_*' directory, one level below
# RAW_EXP_DIR. Sorting makes the processing order deterministic.
runs_succeeded = sorted(glob(join(RAW_EXP_DIR, "*", "exp08_curl_run_*", "SUCCEEDED")))

# glob() silently returns an empty list if RAW_EXP_DIR is wrong or the raw
# dataset is not in place. Stop here instead of letting the cells below
# "successfully" process nothing.
if not runs_succeeded:
    raise FileNotFoundError(f"No succeeded exp08 runs found below '{RAW_EXP_DIR}'")

# For exp08, this should give: 35,611.
print(f"{len(runs_succeeded):,}")

35,611


In [6]:
print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Beginning to process raw exp08 dataset -----\n")

# Hand every succeeded run to parse_run(), together with both output dataset
# directories (our format and DeepCoFFEA format) and the run's 1-based
# position in the sorted list of runs.
for run_ctr, run_succeeded in enumerate(runs_succeeded, start=1):

    parse_run(
        PROC_EXP_DIR_OUR,
        PROC_EXP_DIR_DCOFFEA,
        run_ctr,
        run_succeeded)

print(f"----- [{datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')}] Finished processing raw exp08 dataset -----")

----- [2023-04-24_16:37:40.372685] Beginning to process raw exp08 dataset -----

[0250] Processing /PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_raw/2023-04-14_16-23-39_mixcorr-nym-endpoints-ccx22-exp08-one_exp08_nym-binaries-v1.1.13_static-http-download/exp08_curl_run_00250...

[0500] Processing /PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_raw/2023-04-14_16-23-39_mixcorr-nym-endpoints-ccx22-exp08-one_exp08_nym-binaries-v1.1.13_static-http-download/exp08_curl_run_00500...

[0750] Processing /PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_raw/2023-04-14_16-23-39_mixcorr-nym-endpoints-ccx22-exp08-one_exp08_nym-binaries-v1.1.13_static-http-download/exp08_curl_run_00750...

[1000] Processing /PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_raw/2023-04-14_16-23-39_mixcorr-nym-endpoints-ccx22-exp08-one_exp08_nym-binaries-v1.1.13_static-http-download/exp08_curl_run_01000...

[1250] Processing /PRIVATE_

In [4]:
# Re-entry point for producing the MixCorr-formatted version of this dataset.
#
# In-between the cell above and this one, Jupyter Notebook
# 'dataset-subsets_train-val-test.ipynb' needs to have been run on the newly
# produced dataset above.
#
# When resuming here: first run cells 1, 2, and 3 at the top, then return to
# this cell, run it, and continue downwards from here.

# Reset only the MixCorr output directory; the other dataset directories are
# left untouched by this cell.
rmtree(PROC_EXP_DIR_MIXCORR, ignore_errors=True)
makedirs(PROC_EXP_DIR_MIXCORR, mode=0o755)

In [5]:
# Timestamped banner marking the start of the conversion.
started_at = datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')
print(f"----- [{started_at}] Beginning to produce MixCorr-formatted dataset for exp08 -----\n")

# Derive the MixCorr-formatted dataset from the existing dataset in our format.
create_mixcorr_dataset(PROC_EXP_DIR_OUR, PROC_EXP_DIR_MIXCORR)

# Timestamped banner marking the end of the conversion.
finished_at = datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')
print(f"----- [{finished_at}] Finished producing MixCorr-formatted dataset for exp08 -----")

----- [2023-04-25_10:11:24.873769] Beginning to produce MixCorr-formatted dataset for exp08 -----

Considering flowpairs of dataset split 'train' now...
Done with flowpairs of dataset split 'train'

Considering flowpairs of dataset split 'val' now...
Done with flowpairs of dataset split 'val'

Considering flowpairs of dataset split 'test' now...
Done with flowpairs of dataset split 'test'

----- [2023-04-25_10:23:10.900708] Finished producing MixCorr-formatted dataset for exp08 -----


In [6]:
# Split files in newly produced MixCorr-formatted dataset of at least 95 MiB into
# multiple files in order to be able to upload to GitHub.

files_too_large = ! find {PROC_EXP_DIR_MIXCORR} -type f -size +95M

for file_too_large in files_too_large:
    
    ! split --verbose --lines=5000 --numeric-suffixes=01 {file_too_large} {file_too_large}.
    ! cat {file_too_large}.?? > {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large
    ! cmp --verbose --print-bytes {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large {file_too_large}
    ! rm {file_too_large} {PROC_EXP_DIR_MIXCORR}/tmp_file_too_large

creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_data.01'
creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_data.02'
creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_data.03'
creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_data.04'
creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_initiator_from_gateway_data.05'
creating file '/PRIVATE_PATH/dataset_exp08_nym-binaries-v1.1.13_static-http-download_format-mixcorr_filtered-to-start-end/train_responder_to_gateway_data.01'
creating file '/PRIVATE_PATH/dataset_exp08