# Dataset Organization: Train / Test / Eval Splits

This notebook provides reusable functions to organize class-separated image datasets into standard train/test/eval directory splits for reproducible training and evaluation.

In [None]:
import os
import shutil
from sklearn.model_selection import train_test_split

In [None]:
def create_directory_structure(dest_dir, classes):
    """
    Build the ``<dest_dir>/<split>/<class>`` folder tree for all three splits.

    Folders that already exist are left untouched, so calling this
    repeatedly on the same destination is safe.

    Args:
        dest_dir: Root directory under which the split folders are created
        classes: List of class names; each one gets a folder in every split
    """
    for split_name in ('train', 'test', 'eval'):
        for class_name in classes:
            target = os.path.join(dest_dir, split_name, class_name)
            os.makedirs(target, exist_ok=True)

    print(f"Created directory structure in '{dest_dir}/'")

In [None]:
def _split_pair(items, fraction, seed):
    """
    Split ``items`` in two, sending roughly ``fraction`` of them to the first part.

    Whenever scikit-learn's ``train_test_split`` can produce a non-empty first
    part it is used unchanged. The degenerate cases it rejects (no items, a
    fraction of 0 or 1, or too few items for the fraction) are handled here
    instead of raising.

    Args:
        items: Sequence of items to split
        fraction: Share of the items that goes to the first part
        seed: Random seed for reproducibility

    Returns:
        Tuple of (first_part, second_part)
    """
    count = len(items)
    if count == 0 or fraction <= 0:
        return [], list(items)
    if fraction >= 1:
        return list(items), []

    if int(fraction * count) >= 1:
        first, second = train_test_split(items, train_size=fraction, random_state=seed)
        return first, second

    # fraction * count < 1: train_test_split would raise because the first
    # part would be empty. A non-zero share was requested, so give the first
    # part one randomly chosen item and the rest to the second part.
    import random

    shuffled = list(items)
    random.Random(seed).shuffle(shuffled)
    return shuffled[:1], shuffled[1:]


def split_files(files, train_ratio=0.7, test_ratio=0.15, eval_ratio=0.15, seed=42):
    """
    Split file list into train, test, and eval sets.

    The list is split twice: first into train vs. the rest, then the rest
    into test vs. eval. Very small lists and ratios of 0 are supported; when
    there are too few files to honor every ratio, train is filled first,
    then test, then eval.

    Args:
        files: List of file names
        train_ratio: Proportion for training (default 0.7)
        test_ratio: Proportion for testing (default 0.15)
        eval_ratio: Proportion for evaluation (default 0.15)
        seed: Random seed for reproducibility

    Returns:
        Tuple of (train_files, test_files, eval_files)

    Raises:
        ValueError: If the ratios do not sum to 1 or any ratio is negative
    """
    # Explicit raise instead of assert: asserts are stripped under `python -O`.
    if not abs(train_ratio + test_ratio + eval_ratio - 1.0) < 1e-6:
        raise ValueError("Ratios must sum to 1")
    if min(train_ratio, test_ratio, eval_ratio) < 0:
        raise ValueError("Ratios must be non-negative")

    train_files, remaining = _split_pair(files, train_ratio, seed)

    # Share of the non-train remainder that belongs to test; guarded so that
    # train_ratio == 1 does not divide by zero.
    held_out = test_ratio + eval_ratio
    relative_test = test_ratio / held_out if held_out > 0 else 0.0
    test_files, eval_files = _split_pair(remaining, relative_test, seed)

    return train_files, test_files, eval_files

In [None]:
def copy_files_to_split(files, source_cls_path, dest_dir, split, cls):
    """
    Copy files to the appropriate split directory.

    Each file is copied from ``source_cls_path`` to
    ``<dest_dir>/<split>/<cls>/`` with ``shutil.copy2``. The target directory
    is created if it does not exist yet, so the function also works when the
    directory tree has not been prepared beforehand. Source files are left in
    place.

    Args:
        files: List of file names to copy
        source_cls_path: Source class directory path
        dest_dir: Destination base directory
        split: Split name ('train', 'test', or 'eval')
        cls: Class name

    Raises:
        FileNotFoundError: If one of the named files is missing from
            ``source_cls_path``
    """
    # The target directory is the same for every file, so resolve and create
    # it once up front rather than per file.
    target_dir = os.path.join(dest_dir, split, cls)
    os.makedirs(target_dir, exist_ok=True)

    for name in files:
        shutil.copy2(os.path.join(source_cls_path, name), os.path.join(target_dir, name))

In [None]:
def organize_dataset(source_dir, dest_dir='dataset', train_ratio=0.7, test_ratio=0.15, eval_ratio=0.15, seed=42):
    """
    Organize class-separated images into train/test/eval splits.

    Every sub-directory of ``source_dir`` is treated as one class. The files
    directly inside each class directory are split per class with the given
    ratios and copied into ``<dest_dir>/<split>/<class>/``. Classes without
    files are reported with a warning and skipped. A summary of the file
    counts per split is printed at the end.

    Args:
        source_dir: Path to source directory containing class folders
        dest_dir: Path to destination dataset directory (default 'dataset')
        train_ratio: Proportion for training set (default 0.7)
        test_ratio: Proportion for test set (default 0.15)
        eval_ratio: Proportion for eval set (default 0.15)
        seed: Random seed for reproducibility (default 42)

    Returns:
        Path to the organized dataset directory

    Raises:
        ValueError: If ``source_dir`` contains no class directories
    """
    # os.listdir returns entries in arbitrary order. Sort them so that the
    # same seed yields the same split on every machine and filesystem.
    classes = sorted(
        d for d in os.listdir(source_dir)
        if os.path.isdir(os.path.join(source_dir, d))
    )

    if not classes:
        raise ValueError(f"No class directories found in '{source_dir}'")

    # Create directory structure
    create_directory_structure(dest_dir, classes)

    # Process each class
    stats = {'train': 0, 'test': 0, 'eval': 0}

    for cls in classes:
        cls_path = os.path.join(source_dir, cls)
        # Sorted for the same reason as the class list: the seeded shuffle is
        # only reproducible when its input order is.
        files = sorted(
            f for f in os.listdir(cls_path)
            if os.path.isfile(os.path.join(cls_path, f))
        )

        if not files:
            print(f"Warning: No files found in class '{cls}'")
            continue

        # Split files
        train_files, test_files, eval_files = split_files(
            files, train_ratio, test_ratio, eval_ratio, seed
        )

        # Copy to respective directories and tally the counts
        for split, split_files_list in (
            ('train', train_files),
            ('test', test_files),
            ('eval', eval_files),
        ):
            copy_files_to_split(split_files_list, cls_path, dest_dir, split, cls)
            stats[split] += len(split_files_list)

    print("\nDataset organized successfully!")
    print(f"Classes: {len(classes)}")
    print(f"Train: {stats['train']} files")
    print(f"Test: {stats['test']} files")
    print(f"Eval: {stats['eval']} files")

    return dest_dir

## Usage Example

```python
# Organize dataset with default 70/15/15 split
organize_dataset('path/to/source/images')

# Custom split ratios
organize_dataset('path/to/source/images', dest_dir='my_dataset', 
                 train_ratio=0.8, test_ratio=0.1, eval_ratio=0.1)
```

In [None]:
# Uncomment and modify the path to run
# organize_dataset('path/to/your/source/images')