# **Dataset splitter**
Given a JSON label file and a folder of images, split the dataset into train, val, and test subsets, producing one image folder and one JSON file per subset.

In [1]:
import os
import json
import shutil
import random
from pathlib import Path

# ======================

def load_json(path):
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed object."""
    with open(path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_json(data, path):
    """Serialize ``data`` as JSON to ``path``, overwriting any existing file.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are emitted as-is rather than as ``\\uXXXX`` escapes.
    """
    with open(path, 'w', encoding='utf-8') as out_file:
        json.dump(data, fp=out_file, ensure_ascii=False, indent=2)

def split_data(entries, split_ratio):
    """Randomly partition ``entries`` into train, validation and test lists.

    Args:
        entries: Items to split. The input is copied before shuffling, so
            the caller's sequence is left untouched.
        split_ratio: Percentages for (train, val, test). Only the first two
            are used to place the boundaries; the test split receives
            whatever remains.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(entries)
    random.shuffle(shuffled)

    total = len(shuffled)
    # Multiply before dividing so integer percentages stay exact. The float
    # form int(29 / 100 * 100) evaluates to 28 because 0.29 * 100 is
    # 28.999999999999996.
    train_end = int(split_ratio[0] * total // 100)
    val_end = train_end + int(split_ratio[1] * total // 100)
    return shuffled[:train_end], shuffled[train_end:val_end], shuffled[val_end:]

def prepare_output_dirs(base_dir):
    """Create the ``train``, ``val`` and ``test`` folders under ``base_dir``.

    Missing parent folders are created as well; folders that already exist
    are left as they are.
    """
    split_dirs = [os.path.join(base_dir, subset) for subset in ('train', 'val', 'test')]
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

def process_split(filenames, name, image_dir, output_dir, data):
    """Copy one split's images and write its JSON file.

    Images are copied into ``output_dir/name``; the matching entries of
    ``data`` are written to ``output_dir/name.json``.

    Args:
        filenames: Image file names belonging to this split; each must be a
            key of ``data``.
        name: Split name, used for both the sub-folder and the JSON file.
        image_dir: Folder the source images are read from.
        output_dir: Root folder that receives the split folder and JSON file.
        data: Mapping from image file name to its label entry.
    """
    split_dir = os.path.join(output_dir, name)
    # Create the destination here as well, so copying does not fail when the
    # function is used without a prior prepare_output_dirs() call.
    os.makedirs(split_dir, exist_ok=True)

    split_json = {}
    # sorted() gives lexicographic order, which keeps the JSON key order
    # deterministic regardless of how the split was shuffled.
    for fname in sorted(filenames):
        src_path = os.path.join(image_dir, fname)
        if not os.path.isfile(src_path):
            # Entries without an image are left out of the split's JSON too.
            print(f"[Warning] Image not found: {fname}")
            continue
        shutil.copy2(src_path, os.path.join(split_dir, fname))
        split_json[fname] = data[fname]

    # The JSON goes in the main output directory, next to the split folder.
    save_json(split_json, os.path.join(output_dir, f"{name}.json"))

In [2]:
# === CONFIGURATION ===
# Edit these values before running; main() reads them as module-level globals.
json_path = "/home/moliveros/Datasets/faustLabels.json"       # JSON file whose top-level keys are image file names
image_dir = "/home/moliveros/Datasets/faust"                 # Folder containing the images named by those keys
output_dir = "/home/moliveros/Datasets/faustSplit"              # Root folder for train/, val/, test/ and the per-split JSON files
split_ratio = [80, 10, 10]           # Percentages for train, val, test; main() requires them to sum to 100

def main():
    """Split the configured dataset into train/val/test folders and JSON files.

    Reads the module-level ``json_path``, ``image_dir``, ``output_dir`` and
    ``split_ratio`` settings. The counts printed at the end are the sizes of
    the splits as drawn, not the number of images actually copied.

    Raises:
        ValueError: If the entries of ``split_ratio`` do not sum to 100.
    """
    # Validate the configuration before touching the file system.
    if sum(split_ratio) != 100:
        raise ValueError("Split ratios must sum to 100.")

    data = load_json(json_path)
    train, val, test = split_data(list(data.keys()), split_ratio)

    prepare_output_dirs(output_dir)

    # Same order as the folders are named: train, then val, then test.
    for split_name, split_files in (("train", train), ("val", val), ("test", test)):
        process_split(split_files, split_name, image_dir, output_dir, data)

    print(f"Done: {len(train)} train, {len(val)} val, {len(test)} test samples.")


if __name__ == "__main__":
    main()

Done: 10199 train, 1274 val, 1276 test samples.
