In [None]:
# ------------------------------------------------------------
# Split Penn-Action tokens in output/train/ into test / eval
# (keeps whole videos together)
# ------------------------------------------------------------

import random, shutil, sys
from pathlib import Path

# ---------- user-editable parameters ----------
out_root   = Path("output")   # root that already contains train/
test_frac  = 0.15             # 15 % of videos → test
eval_frac  = 0.15             # 15 % of videos → eval
seed       = 42               # make split reproducible
move_files = False            # True → move, False → copy
# ---------------------------------------------

# Per-video sub-directories that must travel together with each video ID.
modalities = ["tok_rgb", "tok_pose", "coords", "captions"]

# 0) reject fractions that cannot describe a valid partition; otherwise the
#    slices below would silently overlap, wrap around or come out empty
if test_frac < 0 or eval_frac < 0 or test_frac + eval_frac > 1:
    raise ValueError(
        f"test_frac ({test_frac}) and eval_frac ({eval_frac}) must be "
        "non-negative and sum to at most 1."
    )

# 1) list all video IDs present under train/tok_rgb/
#    (a missing directory is treated like an empty one so that a wrong
#    out_root produces the explanatory error below instead of a bare
#    FileNotFoundError from iterdir())
train_rgb_dir = out_root / "train" / "tok_rgb"
videos = (
    sorted(d.name for d in train_rgb_dir.iterdir() if d.is_dir())
    if train_rgb_dir.is_dir()
    else []
)
if not videos:
    raise RuntimeError("No videos found in train/tok_rgb/. Check your path.")

# 1b) refuse to re-split a train/ directory that an earlier run already
#     moved videos out of: the listing above would no longer be the full
#     dataset, so the seeded shuffle would yield a different split and
#     another share of the remaining videos would leave train/.
#     Videos that exist in both places (earlier copy run) are fine.
#     Hidden entries are ignored so tool scratch directories do not count.
relocated = set()
for split in ["test", "eval"]:
    split_rgb_dir = out_root / split / "tok_rgb"
    if split_rgb_dir.is_dir():
        relocated.update(
            d.name
            for d in split_rgb_dir.iterdir()
            if d.is_dir() and not d.name.startswith(".")
        )
relocated.difference_update(videos)
if relocated:
    raise RuntimeError(
        f"{len(relocated)} video(s) in test/ or eval/ are no longer present in "
        f"train/tok_rgb/ (e.g. {min(relocated)}). A previous run appears to "
        "have moved them, so re-splitting train/ would produce a different "
        "split. Move them back into train/ or start from a fresh output root."
    )

# 2) shuffle once with a fixed seed, then slice into splits
#    (the list is sorted first, so the result depends only on the set of
#    video IDs and the seed, not on filesystem listing order)
rng = random.Random(seed)
rng.shuffle(videos)

n_total = len(videos)
n_test  = int(n_total * test_frac)   # int() truncates: sizes round down
n_eval  = int(n_total * eval_frac)

test_videos = set(videos[:n_test])
eval_videos = set(videos[n_test:n_test + n_eval])

print(f"Total videos: {n_total}")
print(f"  test : {len(test_videos)} ({test_frac:.0%})")
print(f"  eval : {len(eval_videos)} ({eval_frac:.0%})")
print(f"  train: {n_total - len(test_videos) - len(eval_videos)}")

# 3) make sure destination dirs exist
for split in ["test", "eval"]:
    for m in modalities:
        (out_root / split / m).mkdir(parents=True, exist_ok=True)

# helper
def copy_or_move(src: Path, dst: Path):
    """Transfer the directory tree *src* to *dst*.

    Whether the tree is moved or copied is decided by the module-level
    ``move_files`` flag.

    In copy mode the tree is first copied into a hidden sibling staging
    directory (``.<name>.partial``) and then renamed to *dst*. Because
    ``shutil.copytree`` is not atomic, copying straight into *dst* would
    leave a half-filled directory behind when interrupted, and a caller
    that skips existing destinations would then treat it as complete.

    Args:
        src: Existing source directory.
        dst: Destination path; must not exist yet in copy mode.

    Raises:
        FileExistsError: In copy mode, if *dst* already exists.
        FileNotFoundError: If *src* does not exist.
        shutil.Error: If ``shutil.copytree`` could not copy some entries.
    """
    if move_files:
        # Same-filesystem moves are a plain rename; left exactly as before.
        shutil.move(src, dst)
        return

    if dst.exists():
        # copytree() itself would refuse an existing target; keep that
        # contract now that the copy goes to the staging path instead.
        raise FileExistsError(f"Destination already exists: {dst}")

    staging = dst.with_name(f".{dst.name}.partial")
    if staging.exists():
        # Leftover from an attempt that was killed before it could clean up.
        # Safe to drop: in copy mode the source is never modified.
        shutil.rmtree(staging)

    try:
        shutil.copytree(src, staging)
    except BaseException:
        # Also covers KeyboardInterrupt (e.g. an interrupted notebook cell);
        # the exception is always re-raised after the cleanup.
        shutil.rmtree(staging, ignore_errors=True)
        raise

    # Staging and destination share a parent directory, so this is a
    # single rename rather than another copy.
    staging.rename(dst)

# 4) copy / move the directories
#    Work is planned first and executed afterwards, so a missing source
#    directory is reported before anything on disk has been touched instead
#    of aborting the run half-way through.
split_of = {vid: "eval" for vid in eval_videos}
split_of.update({vid: "test" for vid in test_videos})  # test takes priority

pending = []    # (src, dst) pairs that still have to be transferred
missing = []    # sources that exist neither in train/ nor at the destination
leftovers = []  # move mode only: destination exists but source is still in train/
for vid in videos:
    split = split_of.get(vid)
    if split is None:
        continue  # stays in train

    for m in modalities:
        src = out_root / "train" / m / vid
        dst = out_root / split  / m / vid
        if dst.exists():        # resume-safe
            if move_files and src.exists():
                leftovers.append(src)
            continue
        if not src.exists():
            missing.append(src)
            continue
        pending.append((src, dst))

if missing:
    raise FileNotFoundError(
        f"{len(missing)} source director(ies) are missing under train/ "
        f"(e.g. {missing[0]}). Nothing was copied or moved."
    )

for src, dst in pending:
    copy_or_move(src, dst)

print("✅  Split complete.")
if leftovers:
    # Happens e.g. when a copy run is followed by a move run: the existing
    # destination is skipped, so the video would stay in the training set.
    # Nothing is deleted automatically because the destination may be stale.
    print(
        f"WARNING: {len(leftovers)} director(ies) already existed in test/ or "
        f"eval/ and were NOT removed from train/ (e.g. {leftovers[0]}). "
        "Delete them from train/ manually to avoid train/test overlap.",
        file=sys.stderr,
    )
if not move_files:
    print("Original data were copied; set move_files=True to relocate instead.")

Total videos: 2326
  test : 348 (15%)
  eval : 348 (15%)
  train: 1630
