In [2]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm


In [3]:

# ---------------------------------------------------------------------------
# Split configuration — every value a reader may want to tune lives here.
# ---------------------------------------------------------------------------

# Inputs: root folder of the processed data and the per-frame metadata CSV
# that the loading cell reads with pd.read_csv.
SOURCE_ROOT = "../data/processed"
METADATA_PATH = "../data/processed/processed_frames_metadata.csv"

# Output: the <split>/<category>/ folders and split_summary.csv are written here.
OUTPUT_ROOT = "../data/split_processed"

# Split parameters. RANDOM_SEED is passed as random_state to both
# train_test_split calls so the split is reproducible.
RANDOM_SEED = 42
# Both sizes are fractions of the TOTAL number of unique original files;
# VAL_SIZE is rescaled to the train+val remainder before the second split.
TEST_SIZE = 0.2
VAL_SIZE = 0.1



In [4]:
# Load the per-frame metadata table (each row describes one frame).
df = pd.read_csv(METADATA_PATH)

# Validate the schema BEFORE any column is accessed, so a malformed CSV fails
# with this explicit message instead of a bare pandas KeyError further down.
# 'frame_path' is required too: the copy helper reads it for every row.
required_columns = ['original_file', 'category', 'frame_path']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Metadata must contain {required_columns} columns; missing: {missing_columns}"
    )

print(f"Loaded metadata with {len(df):,} frames and {df['category'].nunique()} categories")

# --- Step 1: Collapse to unique original files ---
# One row per (original_file, category) pair; these rows are the unit that is
# split, so all frames of one file end up in the same split.
file_groups = df[['original_file', 'category']].drop_duplicates()

# Ensure unique mapping: a file name listed under several categories produces
# several rows above, and those rows can be assigned to different splits.
ambiguous_files = file_groups.loc[
    file_groups['original_file'].duplicated(keep=False), 'original_file'
].nunique()
if ambiguous_files:
    print(
        f"WARNING: {ambiguous_files:,} original file name(s) appear under more than one "
        "category; they are split per (original_file, category) pair and may span splits."
    )

# --- Step 2: Stratified split by category on original files ---
train_val_files, test_files = train_test_split(
    file_groups,
    test_size=TEST_SIZE,
    stratify=file_groups['category'],
    random_state=RANDOM_SEED
)

# Now split train_val into train and validation.
# VAL_SIZE is a fraction of the whole dataset, so rescale it to the share of
# the train+val remainder that is left after the test split.
relative_val_size = VAL_SIZE / (1 - TEST_SIZE)  # adjust proportionally
train_files, val_files = train_test_split(
    train_val_files,
    test_size=relative_val_size,
    stratify=train_val_files['category'],
    random_state=RANDOM_SEED
)

# --- Step 3: Create output folders ---
# NOTE(review): exist_ok=True keeps whatever is already inside OUTPUT_ROOT.
# Files left by an earlier run (e.g. with a different seed or split size) are
# not removed here — clear the folder first when re-splitting.
splits = ['train', 'val', 'test']
for split in splits:
    for category in df['category'].unique():
        os.makedirs(os.path.join(OUTPUT_ROOT, split, category), exist_ok=True)

# --- Step 4: Helper to copy frames belonging to given original files ---
def copy_frames(file_df, split_name):
    """Copy all frames of the given original files into one split folder.

    Frames are looked up in the notebook-level ``df`` metadata table and copied
    to ``OUTPUT_ROOT/<split_name>/<category>/<basename of frame_path>``.

    Args:
        file_df: DataFrame with 'original_file' and 'category' columns, one row
            per original file assigned to this split.
        split_name: Name of the split sub-folder under OUTPUT_ROOT (e.g. "train").

    Returns:
        None. Copy failures are printed and tallied, not raised.
    """
    split_dir = os.path.join(OUTPUT_ROOT, split_name)

    # Select frames on the (original_file, category) pair — the same key the
    # split was made on. Matching on the file name alone would also pull in
    # frames of a same-named file from another category (and thus another split).
    split_keys = ['original_file', 'category']
    frame_df = df.merge(file_df[split_keys].drop_duplicates(), on=split_keys, how='inner')
    print(f"\n[{split_name.upper()}] Copying {len(frame_df):,} frames ({len(file_df):,} unique files)")

    # Plain column iteration instead of iterrows(): no per-row Series is built.
    sources = frame_df['frame_path'].tolist()
    categories = frame_df['category'].tolist()
    destinations = [
        os.path.join(split_dir, category, os.path.basename(src))
        for src, category in zip(sources, categories)
    ]

    # Only the base name of frame_path survives in the destination, so two
    # frames sharing a base name within one category overwrite each other.
    clashes = len(destinations) - len(set(destinations))
    if clashes:
        print(f"WARNING: {clashes:,} frames share a destination file name and will be overwritten")

    # Make sure the target folders exist even if the folder-creation step was skipped.
    for folder in {os.path.dirname(dst) for dst in destinations}:
        os.makedirs(folder, exist_ok=True)

    failed = 0
    for src, dst in tqdm(zip(sources, destinations), total=len(sources), desc=f"Copying {split_name}"):
        try:
            shutil.copy2(src, dst)
        except Exception as e:
            # Best effort: one unreadable frame must not abort the whole copy,
            # but every failure is counted so the loss is visible afterwards.
            failed += 1
            print(f"Error copying {src}: {e}")

    if failed:
        print(f"[{split_name.upper()}] {failed:,} of {len(sources):,} frames could not be copied")

# --- Step 5: Copy files ---
# Insertion order (train, val, test) drives both the copy order and the
# column order of the summary CSV below.
files_by_split = {"train": train_files, "val": val_files, "test": test_files}
for split_label, split_files in files_by_split.items():
    copy_frames(split_files, split_label)

# --- Step 6: Save split summaries ---
# One-row table: unique-file count per split, then the overall totals.
split_summary = {label: len(files) for label, files in files_by_split.items()}
split_summary["total_unique_files"] = len(file_groups)
split_summary["total_frames"] = len(df)

summary_path = os.path.join(OUTPUT_ROOT, "split_summary.csv")
pd.DataFrame([split_summary]).to_csv(summary_path, index=False)

print("\nDataset successfully split and saved to:", OUTPUT_ROOT)
print(f"Unique original files → Train: {len(train_files)}, Val: {len(val_files)}, Test: {len(test_files)}")


Loaded metadata with 8,496 frames and 6 categories

[TRAIN] Copying 5,932 frames (666 unique files)


Copying train: 100%|██████████| 5932/5932 [00:28<00:00, 211.16it/s]



[VAL] Copying 824 frames (96 unique files)


Copying val: 100%|██████████| 824/824 [00:03<00:00, 209.52it/s]



[TEST] Copying 1,740 frames (191 unique files)


Copying test: 100%|██████████| 1740/1740 [00:08<00:00, 215.34it/s]


Dataset successfully split and saved to: ../data/split_processed
Unique original files → Train: 666, Val: 96, Test: 191





In [5]:
# Report, per split, how many original files and how many frames it holds.
print("\n=== FINAL SPLIT SUMMARY ===")
# Count frames on the (original_file, category) pair the split was made on.
# Matching on the file name alone would count a frame in more than one split
# whenever two categories contain a file with the same name.
split_keys = ['original_file', 'category']
for split, files in [("train", train_files), ("val", val_files), ("test", test_files)]:
    frame_count = len(df.merge(files[split_keys].drop_duplicates(), on=split_keys, how='inner'))
    print(f"{split.capitalize():5} → {len(files):,} videos, {frame_count:,} frames")


=== FINAL SPLIT SUMMARY ===
Train → 666 videos, 5,932 frames
Val   → 96 videos, 824 frames
Test  → 191 videos, 1,740 frames
