# Train Eval Test Split

Leverage torch datasets to concatenate all folder-based datasets so they can be split into train/eval/test sets.

Per decision log entry "Data Analysis Training Split", we prefer a static split: the indices are generated once from a fixed seed and saved to disk.

In [1]:
from pathlib import Path
import json

import torch
from torch.utils.data import (
    ConcatDataset,
    Dataset,
    random_split,
)

from blocktrain.datasets.ir_folder_image_file_system_dataset import (
    FolderImageFileSystemDataset,
)
import blocktrain

In [2]:
# Folder that is searched for the per-dataset sub-folders.
DATASET_PATH: Path = Path("/mnt/d/drone_data/training-data")

# Fractions handed to random_split; the order is train, eval, test.
SPLITS: list[float] = [
    0.60,
    0.20,
    0.20,
]

# Seed for the torch generator that drives the random split.
SEED: int = 123

In [3]:
def _single_metadata_json(dataset_folder: Path) -> Path:
    """Return the one ``*.json`` file located directly inside ``dataset_folder``.

    Each dataset folder is expected to hold exactly one JSON file. Failing
    loudly here avoids silently picking an arbitrary file when that
    expectation is violated.

    Args:
        dataset_folder: folder to search (non-recursive).

    Returns:
        Path of the single JSON file found.

    Raises:
        ValueError: if the folder holds zero or more than one ``*.json`` file.
    """
    json_files = sorted(dataset_folder.glob("*.json"))
    if len(json_files) != 1:
        raise ValueError(
            f"expected exactly one *.json file in {dataset_folder}, "
            f"found {len(json_files)}"
        )
    return json_files[0]


# glob yields entries in filesystem order, which is not guaranteed to be the
# same between runs or machines. Sorting pins the order in which the folder
# datasets are concatenated, so a saved index into `concat_dataset` keeps
# pointing at the same folder dataset.
folder_datasets: list[Dataset] = [
    FolderImageFileSystemDataset(_single_metadata_json(dataset_folder), dataset_folder)
    for dataset_folder in sorted(DATASET_PATH.glob("*_1"))
]

# concatenate all datasets into a single dataset with all training examples
concat_dataset = ConcatDataset(folder_datasets)

In [4]:
# Number of examples in the concatenated dataset, i.e. everything available to split.
# NOTE(review): "plitting" in the message is a typo for "splitting"; it is kept
# unchanged here so the text still matches the recorded cell output.
total_examples = len(concat_dataset)
print(f"There are {total_examples} training examples in the entire dataset before plitting")

There are 13502 training examples in the entire dataset before plitting


In [5]:
# Dedicated generator seeded with SEED, so the shuffle does not depend on
# torch's global RNG state.
generator = torch.Generator().manual_seed(SEED)

# The split is taken over the index range rather than the dataset itself, so
# only integers need to be persisted. random_split returns Subset objects;
# each one is materialised into a plain list of ints so the values actually
# match the declared `list[int]` type.
DATASET_INDEX_LIST: dict[str, list[int]] = {
    split_name: list(subset)
    for split_name, subset in zip(
        ["train_indicies", "eval_indicies", "test_indicies"],
        random_split(range(len(concat_dataset)), SPLITS, generator=generator),
    )
}

Save dataset indices in a stable way

In [6]:
# Output folder: "data", reached by going three `.parent` hops up from
# blocktrain.__file__. The folder must already exist.
data_folder = Path(blocktrain.__file__).parent.parent.parent/"data"

# One JSON file per split, named after its key (e.g. train_indicies.json).
for key, indices in DATASET_INDEX_LIST.items():
    # `with` closes the file as soon as the dump finishes, so the contents are
    # flushed to disk deterministically instead of whenever the handle is
    # garbage collected.
    with (data_folder/f"{key}.json").open('w') as index_file:
        json.dump(list(indices), index_file)

In [7]:
# The lengths are looked up outside the f-string: reusing double quotes inside
# a double-quoted f-string is a SyntaxError on Python versions before 3.12.
train_count = len(DATASET_INDEX_LIST["train_indicies"])
eval_count = len(DATASET_INDEX_LIST["eval_indicies"])
test_count = len(DATASET_INDEX_LIST["test_indicies"])
print(f"the train/eval/test datasets have {train_count}/{eval_count}/{test_count} respectively images")

the train/eval/test datasets have 8102/2700/2700 respectively images
