In [None]:
import os
import shutil
import yaml
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# --- Paths ---------------------------------------------------------------
# Root of the source dataset (read from images/<split> and labels/<split>).
DATASET_PATH = Path("/workspace/yolo_dataset_4_dec")
# Root under which the per-fold datasets are written.
OUTPUT_PATH = Path("/workspace/cv_folds_5fold")

# --- Cross-validation settings ---------------------------------------------
NUM_FOLDS = 5
SEED = 42

# --- Classes ---------------------------------------------------------------
CLASS_NAMES = ['knife', 'gun', 'rifle', 'baseball_bat']
# Pseudo-class used for images without annotations: the first index after
# the real classes, so it lines up with the last entry of STRATIFY_NAMES.
BACKGROUND_CLASS_IDX = len(CLASS_NAMES)
STRATIFY_NAMES = [*CLASS_NAMES, 'background']

In [None]:
def get_image_label_pairs(dataset_path):
    """Collect every image that has a matching label file.

    Looks in ``images/<split>`` and ``labels/<split>`` under ``dataset_path``
    for the splits ``train``, ``valid`` and ``test``. A split whose image
    directory does not exist is skipped. An image (``.jpg``, ``.jpeg`` or
    ``.png``, case-insensitive) is kept only when ``<stem>.txt`` exists in
    the corresponding labels directory; images without a label file are
    silently left out.

    Directory entries are visited in sorted order so that the returned list,
    and anything derived from its row order such as a seeded fold split, is
    identical on every run and every filesystem.

    Args:
        dataset_path: Dataset root, as a ``pathlib.Path`` or a string.

    Returns:
        list[dict]: One dict per pair with the keys ``image_path``,
        ``label_path`` (both strings), ``image_name`` and ``original_split``.
    """
    dataset_path = Path(dataset_path)
    image_dirs = ['train', 'valid', 'test']
    image_suffixes = {'.jpg', '.jpeg', '.png'}
    pairs = []

    for split in image_dirs:
        img_dir = dataset_path / 'images' / split
        lbl_dir = dataset_path / 'labels' / split

        if not img_dir.exists():
            continue

        # iterdir() yields entries in arbitrary order; sort for reproducibility.
        for img_path in sorted(img_dir.iterdir()):
            if img_path.suffix.lower() not in image_suffixes:
                continue
            lbl_path = lbl_dir / f"{img_path.stem}.txt"
            if not lbl_path.exists():
                continue
            pairs.append({
                'image_path': str(img_path),
                'label_path': str(lbl_path),
                'image_name': img_path.name,
                'original_split': split
            })

    return pairs

# Scan DATASET_PATH once; `pairs` (a list of dicts) is what the following
# cells annotate and turn into the DataFrame.
pairs = get_image_label_pairs(DATASET_PATH)
print(f"Total image-label pairs: {len(pairs)}")

In [None]:
def get_primary_class(label_path):
    """Return the class id that occurs most often in a label file.

    The first whitespace-separated token of every non-blank line is read as
    an integer class id. If the file does not exist or contains no such
    lines, ``BACKGROUND_CLASS_IDX`` is returned.

    When several classes share the highest count, the smallest class id
    wins. The tie-break is explicit so the result does not depend on set or
    dict iteration order.

    Args:
        label_path: Path to the label ``.txt`` file.

    Returns:
        int: The most frequent class id, or ``BACKGROUND_CLASS_IDX``.

    Raises:
        ValueError: If the first token of a line is not an integer.
    """
    counts = defaultdict(int)
    if os.path.exists(label_path):
        with open(label_path, 'r') as f:
            for line in f:
                parts = line.split()
                if parts:
                    counts[int(parts[0])] += 1
    if not counts:
        return BACKGROUND_CLASS_IDX
    # Highest count first; among equals, the smallest class id.
    return min(counts, key=lambda class_id: (-counts[class_id], class_id))

def get_all_classes(label_path):
    """Return the set of distinct class ids found in a label file.

    The first whitespace-separated token of every non-blank line is read as
    an integer class id. A missing file or a file without such lines gives
    an empty set.
    """
    if not os.path.exists(label_path):
        return set()
    with open(label_path, 'r') as handle:
        tokenized = (raw.strip().split() for raw in handle)
        return {int(tokens[0]) for tokens in tokenized if tokens}

# Annotate every pair in place with its primary class, the set of classes it
# contains, and whether that primary class is the background pseudo-class.
for pair in tqdm(pairs, desc="Extracting classes"):
    pair.update(primary_class=get_primary_class(pair['label_path']))
    pair.update(
        all_classes=get_all_classes(pair['label_path']),
        is_background=pair['primary_class'] == BACKGROUND_CLASS_IDX,
    )

# One row per image/label pair; the columns are the dict keys built above.
df = pd.DataFrame(pairs)
print(f"Total samples (including hard negatives): {len(df)}")
print(f"\nClass distribution (primary class):")
print(
    df['primary_class']
    .value_counts()
    .sort_index()
    .rename(index=dict(enumerate(STRATIFY_NAMES)))  # class id -> readable name
)
print(f"\nImages with objects: {(~df['is_background']).sum()}")
print(f"Hard negatives (background): {df['is_background'].sum()}")

In [None]:
# Shuffled, seeded stratified splitter: NUM_FOLDS splits, reproducible via SEED.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# -1 marks "not assigned yet"; each row then receives the index of the split
# whose validation indices contain it. Stratification uses only the primary
# class of each image, not the full set of classes it contains.
df['fold'] = -1
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(df, df['primary_class'])):
    # val_idx holds positional indices, so translate them through df.index for .loc.
    df.loc[df.index[val_idx], 'fold'] = fold_idx

print("Samples per fold:")
print(df['fold'].value_counts().sort_index())

# Sanity check: show the primary-class counts of every fold, with class ids
# replaced by their names from STRATIFY_NAMES.
print("\nClass distribution per fold (including background/hard negatives):")
for fold in range(NUM_FOLDS):
    fold_df = df[df['fold'] == fold]
    print(f"\nFold {fold}:")
    print(fold_df['primary_class'].value_counts().sort_index().rename(index=dict(enumerate(STRATIFY_NAMES))))

In [None]:
def _find_name_collisions(paths):
    """Return the sorted file names that more than one distinct path in ``paths`` ends with."""
    sources_by_name = defaultdict(set)
    for path in paths:
        sources_by_name[Path(path).name].add(str(path))
    return sorted(name for name, sources in sources_by_name.items() if len(sources) > 1)


def _copy_pairs(subset, images_dir, labels_dir, desc):
    """Copy the image and label file of every row in ``subset`` into the given directories."""
    sources = zip(subset['image_path'], subset['label_path'])
    for image_path, label_path in tqdm(sources, desc=desc, total=len(subset)):
        img_src = Path(image_path)
        lbl_src = Path(label_path)
        shutil.copy2(img_src, images_dir / img_src.name)
        shutil.copy2(lbl_src, labels_dir / lbl_src.name)


def create_fold_directories(output_path, num_folds, df):
    """Materialise one train/val dataset directory per fold.

    For each fold ``k`` in ``range(num_folds)`` this creates
    ``<output_path>/fold_<k>/{images,labels}/{train,val}``, copies the rows
    with ``df['fold'] == k`` into ``val`` and all other rows into ``train``,
    and writes a ``data.yaml`` with the keys ``path``, ``train``, ``val``,
    ``nc`` and ``names`` (the last two taken from ``CLASS_NAMES``).

    WARNING: an existing ``output_path`` is deleted recursively first.

    Args:
        output_path: Destination root (``str`` or ``pathlib.Path``).
        num_folds: Number of fold directories to create.
        df: DataFrame with the columns ``image_path``, ``label_path`` and
            ``fold``.

    Raises:
        ValueError: If two different source files share a file name. All
            files of a split are copied into one flat directory, so such
            files would silently overwrite each other. The check runs
            before anything is deleted or written.
    """
    output_path = Path(output_path)

    # Fail early: flat per-split folders cannot hold two sources with the same name.
    collisions = _find_name_collisions(df['image_path']) + _find_name_collisions(df['label_path'])
    if collisions:
        preview = ', '.join(collisions[:5])
        raise ValueError(
            f"{len(collisions)} file name(s) occur in more than one source directory "
            f"and would overwrite each other in the fold folders (e.g. {preview}); "
            f"rename them before building folds."
        )

    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True)

    for fold in range(num_folds):
        fold_dir = output_path / f"fold_{fold}"

        val_mask = df['fold'] == fold
        # Insertion order matters: train is copied before val.
        subsets = {'train': df[~val_mask], 'val': df[val_mask]}

        for split, subset in subsets.items():
            images_dir = fold_dir / 'images' / split
            labels_dir = fold_dir / 'labels' / split
            images_dir.mkdir(parents=True, exist_ok=True)
            labels_dir.mkdir(parents=True, exist_ok=True)
            _copy_pairs(subset, images_dir, labels_dir, desc=f"Fold {fold} - {split}")

        data_yaml = {
            'path': str(fold_dir.absolute()),
            'train': 'images/train',
            'val': 'images/val',
            'nc': len(CLASS_NAMES),
            'names': CLASS_NAMES
        }

        with open(fold_dir / 'data.yaml', 'w') as f:
            yaml.dump(data_yaml, f, default_flow_style=False)

        print(f"Fold {fold}: train={len(subsets['train'])}, val={len(subsets['val'])}")

# Write all fold datasets to disk. NOTE: create_fold_directories removes any
# existing OUTPUT_PATH tree (shutil.rmtree) before it starts copying.
create_fold_directories(OUTPUT_PATH, NUM_FOLDS, df)

In [None]:
# Persist the fold assignment (CSV for inspection, pickle to keep the list
# column as real lists) next to the generated fold folders.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)  # lets this cell run on its own

df['image_id'] = df['image_path'].apply(lambda x: Path(x).stem)
fold_mapping = df[['image_id', 'image_path', 'label_path', 'fold', 'primary_class', 'is_background']].copy()
# Sets have no guaranteed iteration order; sort so the saved files are canonical.
fold_mapping['all_classes'] = df['all_classes'].apply(lambda x: sorted(x))
fold_mapping.to_csv(OUTPUT_PATH / 'fold_mapping.csv', index=False)
fold_mapping.to_pickle(OUTPUT_PATH / 'fold_mapping.pkl')

print(f"Fold mapping saved to {OUTPUT_PATH / 'fold_mapping.csv'}")
print(f"Total samples: {len(fold_mapping)}")
print(f"  - With objects: {(~fold_mapping['is_background']).sum()}")
print(f"  - Hard negatives: {fold_mapping['is_background'].sum()}")