In [None]:
import os
import shutil
import yaml
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

DATASET_PATH = Path("/workspace/yolo_dataset_4_dec")
OUTPUT_PATH = Path("/workspace/cv_folds_5fold") 
NUM_FOLDS = 5
SEED = 42

CLASS_NAMES = ['knife', 'gun', 'rifle', 'baseball_bat']
BACKGROUND_CLASS_IDX = 4
STRATIFY_NAMES = CLASS_NAMES + ['background']

In [None]:
def get_image_label_pairs(dataset_path):
    image_dirs = ['train', 'valid', 'test']
    pairs = []
    
    for split in image_dirs:
        img_dir = dataset_path / 'images' / split
        lbl_dir = dataset_path / 'labels' / split
        
        if not img_dir.exists():
            continue
            
        for img_path in img_dir.iterdir():
            if img_path.suffix.lower() in ['.jpg', '.jpeg', '.png']:
                lbl_path = lbl_dir / f"{img_path.stem}.txt"
                if lbl_path.exists():
                    pairs.append({
                        'image_path': str(img_path),
                        'label_path': str(lbl_path),
                        'image_name': img_path.name,
                        'original_split': split
                    })
    
    return pairs

pairs = get_image_label_pairs(DATASET_PATH)
print(f"Total image-label pairs: {len(pairs)}")

In [None]:
def get_primary_class(label_path):
    classes = []
    if os.path.exists(label_path):
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if parts:
                    classes.append(int(parts[0]))
    if not classes:
        return BACKGROUND_CLASS_IDX
    return max(set(classes), key=classes.count)

def get_all_classes(label_path):
    classes = set()
    if os.path.exists(label_path):
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.strip().split()
                if parts:
                    classes.add(int(parts[0]))
    return classes

for pair in tqdm(pairs, desc="Extracting classes"):
    pair['primary_class'] = get_primary_class(pair['label_path'])
    pair['all_classes'] = get_all_classes(pair['label_path'])
    pair['is_background'] = pair['primary_class'] == BACKGROUND_CLASS_IDX

df = pd.DataFrame(pairs)
print(f"Total samples (including hard negatives): {len(df)}")
print(f"\nClass distribution (primary class):")
print(df['primary_class'].value_counts().sort_index().rename(index=dict(enumerate(STRATIFY_NAMES))))
print(f"\nImages with objects: {(~df['is_background']).sum()}")
print(f"Hard negatives (background): {df['is_background'].sum()}")

In [None]:
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

df['fold'] = -1
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(df, df['primary_class'])):
    df.loc[df.index[val_idx], 'fold'] = fold_idx

print("Samples per fold:")
print(df['fold'].value_counts().sort_index())

print("\nClass distribution per fold (including background/hard negatives):")
for fold in range(NUM_FOLDS):
    fold_df = df[df['fold'] == fold]
    print(f"\nFold {fold}:")
    print(fold_df['primary_class'].value_counts().sort_index().rename(index=dict(enumerate(STRATIFY_NAMES))))

In [None]:
def create_fold_directories(output_path, num_folds, df):
    output_path = Path(output_path)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True)
    
    for fold in range(num_folds):
        fold_dir = output_path / f"fold_{fold}"
        
        for split in ['train', 'val']:
            (fold_dir / 'images' / split).mkdir(parents=True, exist_ok=True)
            (fold_dir / 'labels' / split).mkdir(parents=True, exist_ok=True)
        
        val_mask = df['fold'] == fold
        train_mask = ~val_mask
        
        for _, row in tqdm(df[train_mask].iterrows(), 
                          desc=f"Fold {fold} - train", 
                          total=train_mask.sum()):
            img_src = Path(row['image_path'])
            lbl_src = Path(row['label_path'])
            
            img_dst = fold_dir / 'images' / 'train' / img_src.name
            lbl_dst = fold_dir / 'labels' / 'train' / lbl_src.name
            
            shutil.copy2(img_src, img_dst)
            shutil.copy2(lbl_src, lbl_dst)
        
        for _, row in tqdm(df[val_mask].iterrows(), 
                          desc=f"Fold {fold} - val", 
                          total=val_mask.sum()):
            img_src = Path(row['image_path'])
            lbl_src = Path(row['label_path'])
            
            img_dst = fold_dir / 'images' / 'val' / img_src.name
            lbl_dst = fold_dir / 'labels' / 'val' / lbl_src.name
            
            shutil.copy2(img_src, img_dst)
            shutil.copy2(lbl_src, lbl_dst)
        
        data_yaml = {
            'path': str(fold_dir.absolute()),
            'train': 'images/train',
            'val': 'images/val',
            'nc': len(CLASS_NAMES),
            'names': CLASS_NAMES
        }
        
        with open(fold_dir / 'data.yaml', 'w') as f:
            yaml.dump(data_yaml, f, default_flow_style=False)
        
        print(f"Fold {fold}: train={train_mask.sum()}, val={val_mask.sum()}")

create_fold_directories(OUTPUT_PATH, NUM_FOLDS, df)

In [None]:
df['image_id'] = df['image_path'].apply(lambda x: Path(x).stem)
fold_mapping = df[['image_id', 'image_path', 'label_path', 'fold', 'primary_class', 'is_background']].copy()
fold_mapping['all_classes'] = df['all_classes'].apply(lambda x: list(x))
fold_mapping.to_csv(OUTPUT_PATH / 'fold_mapping.csv', index=False)
fold_mapping.to_pickle(OUTPUT_PATH / 'fold_mapping.pkl')

print(f"Fold mapping saved to {OUTPUT_PATH / 'fold_mapping.csv'}")
print(f"Total samples: {len(fold_mapping)}")
print(f"  - With objects: {(~fold_mapping['is_background']).sum()}")
print(f"  - Hard negatives: {fold_mapping['is_background'].sum()}")

### Prepare classification

If you need actual copies (e.g., for portability or uploading to cloud), just replace:  
os.symlink(src_path.absolute(), dst_path)  
with:  
shutil.copy2(src_path, dst_path)  

In [1]:
import shutil
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
import numpy as np
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def copy_file(idx, all_images, all_labels, classes, fold_dir, split):
    src_path = all_images[idx]
    class_name = classes[all_labels[idx]]
    dst_path = fold_dir / split / class_name / src_path.name
    shutil.copy2(src_path, dst_path)

dataset_path = Path('/workspace/yolo_dataset_cls_cropped')
output_path = Path('/workspace/yolo_dataset_cls_5fold')

classes = ['baseball_bat', 'gun', 'knife', 'rifle']
all_images = []
all_labels = []

print("Collecting all images...")
for class_idx, class_name in enumerate(tqdm(classes, desc="Classes")):
    for split in ['train', 'valid', 'test']:
        class_dir = dataset_path / split / class_name
        if class_dir.exists():
            for img_path in class_dir.glob('*.jpg'):
                all_images.append(img_path)
                all_labels.append(class_idx)

all_images = np.array(all_images)
all_labels = np.array(all_labels)

print(f"Total images: {len(all_images)}")
print(f"Class distribution: {np.bincount(all_labels)}")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold_idx, (train_idx, val_idx) in enumerate(skf.split(all_images, all_labels)):
    print(f"\nProcessing Fold {fold_idx}...")
    fold_dir = output_path / f'fold_{fold_idx}'
    
    for split in ['train', 'val']:
        for class_name in classes:
            (fold_dir / split / class_name).mkdir(parents=True, exist_ok=True)
    
    with ProcessPoolExecutor(max_workers=8) as executor:
        copy_func = partial(copy_file, all_images=all_images, all_labels=all_labels, 
                           classes=classes, fold_dir=fold_dir, split='train')
        list(tqdm(executor.map(copy_func, train_idx), total=len(train_idx), 
                  desc=f"Fold {fold_idx} - Train", unit="img"))
        
        copy_func = partial(copy_file, all_images=all_images, all_labels=all_labels, 
                           classes=classes, fold_dir=fold_dir, split='val')
        list(tqdm(executor.map(copy_func, val_idx), total=len(val_idx), 
                  desc=f"Fold {fold_idx} - Val", unit="img"))
    
    print(f"Fold {fold_idx}: train={len(train_idx)}, val={len(val_idx)}")

print(f"\n5-fold dataset created at: {output_path}")

Collecting all images...


Classes: 100%|██████████| 4/4 [00:00<00:00,  8.11it/s]


Total images: 15918
Class distribution: [1532 6423 3362 4601]

Processing Fold 0...


Fold 0 - Train:   3%|▎         | 399/12734 [01:08<42:01,  4.89img/s]Process ForkProcess-5:
Process ForkProcess-3:
Process ForkProcess-6:
Fold 0 - Train:   3%|▎         | 401/12734 [01:09<35:25,  5.80img/s]Process ForkProcess-7:
Process ForkProcess-8:
Process ForkProcess-1:
Process ForkProcess-2:
Process ForkProcess-4:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/root/.local/share/uv/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/.local/share/uv/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/root/.local/share/uv/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/multiproc

KeyboardInterrupt: 