## Produce TRAIN, VAL, TEST Subsets for all Experiment Datasets

In [1]:
from glob import glob
from os.path import join
from os.path import basename, dirname
from json import dump as json_dump

In [2]:
### MODIFY BEGIN ###

# Names of all experiments whose datasets should be split into TRAIN, VAL, and TEST.
# Each name is expanded further below to the two dataset folders
#   dataset_<name>_filtered-to-start-end
#   dataset_<name>_format-deepcoffea_filtered-to-start-end
# which are both looked up directly inside DATASET_DIRS_PARENT.
EXP_NAMES = [
    "exp01_nym-binaries-1.0.2_static-http-download",
    "exp02_nym-binaries-1.0.2_static-http-download_no-client-cover-traffic",
    "exp05_nym-binaries-1.0.2_static-http-download_shorter-mix-delay",
    "exp06_nym-binaries-1.0.2_static-http-download_longer-mix-delay",
    "exp07_nym-binaries-1.0.2_static-http-download_network-delay",
    "exp08_nym-binaries-v1.1.13_static-http-download",
]

# Base path under which the processed dataset folders of the above experiments reside.
DATASET_DIRS_PARENT = "/PRIVATE_PATH/"

# Target dataset size in number of flowpairs (TRAIN + VAL + TEST combined).
# Pick a number that can be cleanly split according to 70%-15%-15%.
# A dataset with fewer flowpairs than this fails an assertion during the split.
DATASET_TARGET_SIZE = 35000

###  MODIFY END  ###

In [3]:
# Determine the number of flowpairs in each subset (70% TRAIN, 15% VAL, 15% TEST).
#
# The shares are computed with exact integer arithmetic. Truncating a float product
# can come out one short for a perfectly splittable size: 180 * 0.7 evaluates to
# 125.99999999999999, so int(180 * 0.7) == 125 and the check below would wrongly fail.
DATASET_TRAIN_SIZE = DATASET_TARGET_SIZE * 70 // 100
DATASET_VAL_SIZE = DATASET_TARGET_SIZE * 15 // 100
DATASET_TEST_SIZE = DATASET_TARGET_SIZE * 15 // 100

# The floor divisions above only add up to the target if it splits without remainder.
assert DATASET_TRAIN_SIZE + DATASET_VAL_SIZE + DATASET_TEST_SIZE == DATASET_TARGET_SIZE, \
    f"Dataset subset sizes ({DATASET_TRAIN_SIZE} + {DATASET_VAL_SIZE} + {DATASET_TEST_SIZE}) don't add up to {DATASET_TARGET_SIZE}"

In [4]:
def determine_subsets(dataset_dir: str, dataset_deepcoffea_dir: str, flowpairs_unsorted: list) -> None:
    """
    Splits a given flowpairs dataset into 3 non-overlapping subsets: TRAIN, VAL, and TEST.

    Each flowpair is identified by the name of the folder that directly contains its
    'flowpair.json'. The identifiers are sorted with Python's default string ordering; the
    first DATASET_TRAIN_SIZE of them form TRAIN, the next DATASET_VAL_SIZE form VAL, and the
    next DATASET_TEST_SIZE form TEST. Flowpairs beyond that are not part of any subset.

    The subsets are written as JSON lists to 'flowpairs_train.json', 'flowpairs_val.json',
    and 'flowpairs_test.json' in both given directories, overwriting existing files.

    Reads the module-level constants DATASET_TARGET_SIZE, DATASET_TRAIN_SIZE,
    DATASET_VAL_SIZE, and DATASET_TEST_SIZE.

    Args:
        dataset_dir: Directory that receives the three subset files; also used in log output.
        dataset_deepcoffea_dir: Second directory that receives identical copies of the files.
        flowpairs_unsorted: Paths to the 'flowpair.json' of every flowpair, in any order.

    Raises:
        AssertionError: If fewer than DATASET_TARGET_SIZE flowpairs are given, if a path has
            no parent folder, if folder names are not unique, or if the subsets overlap or
            do not add up to DATASET_TARGET_SIZE.
    """

    print(f"---\nDetermining TRAIN, VAL, TEST subsets for dataset '{dataset_dir}'...\n")
    print(f"[@] {len(flowpairs_unsorted)=:,}\n")

    # Ensure dataset is large enough.
    assert len(flowpairs_unsorted) >= DATASET_TARGET_SIZE, f"Dataset needs to have at least {DATASET_TARGET_SIZE:,} flowpairs, only got: {len(flowpairs_unsorted)=:,}"

    # Reduce each path to the name of the folder holding its 'flowpair.json' and sort
    # the resulting IDs using Python's sorted default behavior. Going through os.path
    # (instead of splitting on "/") also handles relative paths with a single separator,
    # doubled separators, and the platform's own separator.
    flowpairs = sorted(basename(dirname(path)) for path in flowpairs_unsorted)

    # A path without a parent folder would yield an empty, meaningless identifier.
    assert all(flowpairs), "Every flowpair path needs a parent folder that serves as its identifier"

    # Ensure that indeed every flowpair folder is unique and thus an identifier.
    flowpairs_unique = set(flowpairs)
    assert len(flowpairs) == len(flowpairs_unique), f"Some flowpair folder names occurr multiple times: {len(flowpairs)=} != {len(flowpairs_unique)=}"

    # Split dataset according to 70%-15%-15% rule (TRAIN, VAL, TEST).
    train_end = DATASET_TRAIN_SIZE
    val_end = train_end + DATASET_VAL_SIZE
    test_end = val_end + DATASET_TEST_SIZE

    flowpairs_train = flowpairs[:train_end]
    flowpairs_val = flowpairs[train_end:val_end]
    flowpairs_test = flowpairs[val_end:test_end]

    assert len(flowpairs_train) + len(flowpairs_val) + len(flowpairs_test) == DATASET_TARGET_SIZE
    assert set(flowpairs_train).isdisjoint(flowpairs_val), "Unexpected overlap between TRAIN and VAL dataset subset"
    assert set(flowpairs_train).isdisjoint(flowpairs_test), "Unexpected overlap between TRAIN and TEST dataset subset"
    assert set(flowpairs_val).isdisjoint(flowpairs_test), "Unexpected overlap between VAL and TEST dataset subset"

    subsets = (
        ("train", flowpairs_train),
        ("val", flowpairs_val),
        ("test", flowpairs_test),
    )

    # Write every subset to both dataset repositories.
    for subset_name, subset in subsets:
        for target_dir in (dataset_dir, dataset_deepcoffea_dir):
            with open(join(target_dir, f"flowpairs_{subset_name}.json"), "w", encoding = "utf-8") as subset_fp:
                json_dump(subset, subset_fp, indent = 4)
                subset_fp.write("\n")

    # Print boundary flowpair elements for comparison purposes.
    for subset_name, subset in subsets:
        print(f"[@] {subset_name.upper()}:\n    => number of flowpairs: {len(subset):,}\n    => first element: {subset[0]}\n    => last element: {subset[-1]}\n")

    print("Done!\n---\n")

In [5]:
# Run the TRAIN, VAL, TEST split once for every experiment listed in EXP_NAMES.

for exp_name in EXP_NAMES:

    # Every experiment's dataset exists in two formats stored in sibling folders.
    # Flowpairs are discovered in the first folder only; the DeepCoFFEA-format
    # folder merely receives a copy of the final flowpair lists.
    dataset_dir = join(DATASET_DIRS_PARENT, f"dataset_{exp_name}_filtered-to-start-end")
    dataset_deepcoffea_dir = join(DATASET_DIRS_PARENT, f"dataset_{exp_name}_format-deepcoffea_filtered-to-start-end")

    # Each direct subfolder that holds a 'flowpair.json' counts as one flowpair.
    flowpair_pattern = join(dataset_dir, "*", "flowpair.json")
    flowpairs = glob(flowpair_pattern)

    # Compute the split and write it into both dataset folders.
    determine_subsets(
        dataset_dir = dataset_dir,
        dataset_deepcoffea_dir = dataset_deepcoffea_dir,
        flowpairs_unsorted = flowpairs,
    )

---
Determining TRAIN, VAL, TEST subsets for dataset '/PRIVATE_PATH/dataset_exp01_nym-binaries-1.0.2_static-http-download_filtered-to-start-end'...

[@] len(flowpairs_unsorted)=35,266

[@] TRAIN:
    => number of flowpairs: 24,500
    => first element: 11TgjQ9kVbUxCUTbrkhW7mUfSU4PFVtqce1qoahhuXa_FFMoBNCmrNXo6JBMFnTTcadAAVbGJqjhaC4p2beH5HaF
    => last element: DiE5kfjw2jFPZgM8Eno13eW3hE53Q65A479vw3FBksLa_4EyCsvcNPQQPCmurVgyLnixb3cStGQUMi2EzP9mFpFPG

[@] VAL:
    => number of flowpairs: 5,250
    => first element: DiHW4d2zBHUkZT3euLnyfbE6jyzvHnNfHN8rKBhhMVcx_3QgYx1QZCBymzcuF2b3K7VYXAUzdFHXBuQJxxX76auA3
    => last element: GE1j8uRUteQB4JoCtswuXoc3CN5FZCsc5vBXNzjCtWVB_ADmXmkHYmX24hsqkG2xpcK16GUi8Z793zDzDE3f5cAdb

[@] TEST:
    => number of flowpairs: 5,250
    => first element: GE2m6XfAsCm2iMzPYFFjfXEF8Na8DNxkadvFXrNNFips_HoLCd9zqncRTMW2EuLcEDHQpFB87EURAE6CSgSNnNjq
    => last element: sMT66cxno4AiBHjd1SSPMALMMiusVJLft4E2eucUT1k_CksiyLbajLCaFYJwGnP7GuyAiVwTxJHiKXw33VpzPnW2

Done!
---

--