In [2]:
import os
import shutil
from pathlib import Path
import re

cls_data_path = Path("/workspace/yolo_dataset_cls_5fold")
yolo_data_path = Path("/workspace/yolo_dataset_4_dec")

In [3]:
to_remove_both = [
    "dangerous_train_000706",
    "dangerous_train_003229",
    "dangerous_test_000487",
    "dangerous_train_002894",
    "voc_train_005904",
    "youtube_test_001853",
    "youtube_train_008184",
    "youtube_train_008620",
    "youtube_train_008931",
    "youtube_train_010154",
    "youtube_valid_001774",
    "crowdhuman_train_011771",
    "dangerous_test_000168",
    "youtube_train_008275",
    "youtube_test_001725",
    "youtube_test_001915",
    "youtube_train_010819",
]

to_remove_cls_only = [
    "mp_94",
    "voc_test_001274",
    "voc_train_007628",
    "dangerous_train_003404",
    "dangerous_train_000424",
    "youtube_test_001833",
    "dangerous_train_003287",
]

print(f"To remove from BOTH (cls + yolo): {len(to_remove_both)} images")
print(f"To remove from CLS only: {len(to_remove_cls_only)} images")


To remove from BOTH (cls + yolo): 17 images
To remove from CLS only: 7 images


In [4]:
def count_cls_images(path):
    """Count total images in cls dataset."""
    count = 0
    for root, dirs, files in os.walk(path):
        count += sum(1 for f in files if f.endswith(('.jpg', '.jpeg', '.png')))
    return count

print(f"CLS dataset before: {count_cls_images(cls_data_path)} images")


CLS dataset before: 99062 images


In [5]:
all_to_remove_cls = to_remove_both + to_remove_cls_only

removed_cls = []
not_found_cls = []

for base_name in all_to_remove_cls:
    found = False
    for root, dirs, files in os.walk(cls_data_path):
        for file in files:
            if file.endswith(('.jpg', '.jpeg', '.png')):
                if file.startswith(base_name):
                    file_path = Path(root) / file
                    print(f"Removing from CLS: {file_path}")
                    file_path.unlink()
                    removed_cls.append(str(file_path))
                    found = True
    if not found:
        not_found_cls.append(base_name)

print(f"\nRemoved from CLS: {len(removed_cls)} files")
print(f"Not found in CLS: {len(not_found_cls)} files")
if not_found_cls:
    print(f"  Missing: {not_found_cls[:5]}")


Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_4/train/baseball_bat/dangerous_train_000706.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_3/train/baseball_bat/dangerous_train_000706.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_2/train/baseball_bat/dangerous_train_000706.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_1/train/baseball_bat/dangerous_train_000706.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_0/val/baseball_bat/dangerous_train_000706.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_4/train/baseball_bat/dangerous_train_003229.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_3/val/baseball_bat/dangerous_train_003229.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_2/train/baseball_bat/dangerous_train_003229.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fold/fold_1/train/baseball_bat/dangerous_train_003229.jpg
Removing from CLS: /workspace/yolo_dataset_cls_5fo

In [6]:
print(f"CLS dataset after: {count_cls_images(cls_data_path)} images")

CLS dataset after: 98697 images


In [7]:
def count_yolo_images(path):
    """Count images in YOLO dataset (train/test/valid folders)."""
    count = 0
    for split in ['train', 'test', 'valid']:
        img_dir = path / 'images' / split
        if img_dir.exists():
            count += len(list(img_dir.glob('*.jpg'))) + len(list(img_dir.glob('*.png')))
    return count

print(f"YOLO dataset before: {count_yolo_images(yolo_data_path)} images")


YOLO dataset before: 17411 images


In [9]:
removed_yolo = []
not_found_yolo = []

for base_name in to_remove_both:
    found = False
    for split in ['train', 'test', 'valid']:
        for folder in ['images', 'labels']:
            base_dir = yolo_data_path / folder / split
            if not base_dir.exists():
                continue
            for ext in ['.jpg', '.png', '.txt']:
                file_path = base_dir / f"{base_name}{ext}"
                if file_path.exists():
                    print(f"Removing from YOLO: {file_path}")
                    file_path.unlink()
                    removed_yolo.append(str(file_path))
                    found = True
    if not found:
        not_found_yolo.append(base_name)

print(f"\nRemoved from YOLO: {len(removed_yolo)} files")
print(f"Not found in YOLO: {len(not_found_yolo)} files")
if not_found_yolo:
    print(f"  Missing: {not_found_yolo[:5]}")



Removed from YOLO: 0 files
Not found in YOLO: 17 files
  Missing: ['dangerous_train_000706', 'dangerous_train_003229', 'dangerous_test_000487', 'dangerous_train_002894', 'voc_train_005904']


In [10]:
print(f"YOLO dataset after: {count_yolo_images(yolo_data_path)} images")


YOLO dataset after: 17394 images


In [11]:
label_changes = {
    "voc_train_004772": {"from_class": 1, "to_class": 2},
    "voc_train_007833": {"from_class": 1, "to_class": 2},
    "youtube_test_001881": {"from_class": 1, "to_class": 2},
    "youtube_train_008286": {"from_class": 1, "to_class": 2},
    "youtube_train_008302": {"from_class": 1, "to_class": 2},
    "youtube_train_008881": {"from_class": 1, "to_class": 2},
    "youtube_train_009064": {"from_class": 1, "to_class": 2},
    "youtube_train_009088": {"from_class": 1, "to_class": 2},
    "dangerous_train_003450": {"from_class": 2, "to_class": 1},
    "youtube_train_008524": {"from_class": 2, "to_class": 1},
    "youtube_train_008698": {"from_class": 2, "to_class": 1},
    "youtube_train_008729": {"from_class": 2, "to_class": 1},
    "youtube_train_009112": {"from_class": 2, "to_class": 1},
    "dangerous_valid_000576": {"from_class": 2, "to_class": 0},
}

remove_tiny_labels = [
    "youtube_train_008184",
    "youtube_train_008275",
    "youtube_test_001915",
    "youtube_train_008745",
    "youtube_train_009729",
    "youtube_train_010396",
    "youtube_train_010466",
    "youtube_train_010730",
    "youtube_train_009838",
]

print(f"Label changes (class swap): {len(label_changes)}")
print(f"Remove tiny labels: {len(remove_tiny_labels)}")


Label changes (class swap): 14
Remove tiny labels: 9


In [12]:
def change_label_class(label_path, from_class, to_class):
    """Change class ID in YOLO label file."""
    if not label_path.exists():
        return False
    
    lines = label_path.read_text().strip().split('\n')
    new_lines = []
    changed = False
    
    for line in lines:
        if line.strip():
            parts = line.split()
            if int(parts[0]) == from_class:
                parts[0] = str(to_class)
                changed = True
            new_lines.append(' '.join(parts))
    
    if changed:
        label_path.write_text('\n'.join(new_lines) + '\n')
    return changed

changed_count = 0
for base_name, change in label_changes.items():
    for split in ['train', 'test', 'valid']:
        label_path = yolo_data_path / 'labels' / split / f"{base_name}.txt"
        if change_label_class(label_path, change['from_class'], change['to_class']):
            print(f"Changed {base_name}: class {change['from_class']} -> {change['to_class']}")
            changed_count += 1
            break

print(f"\nTotal label class changes: {changed_count}")


Changed voc_train_004772: class 1 -> 2
Changed voc_train_007833: class 1 -> 2
Changed youtube_test_001881: class 1 -> 2
Changed youtube_train_008286: class 1 -> 2
Changed youtube_train_008302: class 1 -> 2
Changed youtube_train_008881: class 1 -> 2
Changed youtube_train_009064: class 1 -> 2
Changed youtube_train_009088: class 1 -> 2
Changed dangerous_train_003450: class 2 -> 1
Changed youtube_train_008524: class 2 -> 1
Changed youtube_train_008698: class 2 -> 1
Changed youtube_train_008729: class 2 -> 1
Changed youtube_train_009112: class 2 -> 1
Changed dangerous_valid_000576: class 2 -> 0

Total label class changes: 14


In [13]:
def remove_smallest_bbox(label_path):
    """Remove the smallest bbox from YOLO label file (by area)."""
    if not label_path.exists():
        return False
    
    lines = label_path.read_text().strip().split('\n')
    if len(lines) <= 1:
        return False
    
    bboxes = []
    for line in lines:
        if line.strip():
            parts = line.split()
            if len(parts) >= 5:
                w, h = float(parts[3]), float(parts[4])
                area = w * h
                bboxes.append((area, line))
    
    if len(bboxes) <= 1:
        return False
    
    bboxes.sort(key=lambda x: x[0])
    kept_lines = [bbox[1] for bbox in bboxes[1:]]
    
    label_path.write_text('\n'.join(kept_lines) + '\n')
    print(f"  Removed smallest bbox (area={bboxes[0][0]:.4f}) from {label_path.name}")
    return True

removed_tiny_count = 0
for base_name in remove_tiny_labels:
    for split in ['train', 'test', 'valid']:
        label_path = yolo_data_path / 'labels' / split / f"{base_name}.txt"
        if remove_smallest_bbox(label_path):
            removed_tiny_count += 1
            break

print(f"\nTotal tiny labels removed: {removed_tiny_count}")


  Removed smallest bbox (area=0.0042) from youtube_train_008745.txt
  Removed smallest bbox (area=0.0020) from youtube_train_009729.txt
  Removed smallest bbox (area=0.0050) from youtube_train_010396.txt
  Removed smallest bbox (area=0.0280) from youtube_train_010466.txt
  Removed smallest bbox (area=0.0002) from youtube_train_010730.txt
  Removed smallest bbox (area=0.0104) from youtube_train_009838.txt

Total tiny labels removed: 6


In [14]:
print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"\nCLS Dataset:")
print(f"  - Removed: {len(removed_cls)} files")
print(f"  - Not found: {len(not_found_cls)}")

print(f"\nYOLO Dataset:")
print(f"  - Removed: {len(removed_yolo)} files")
print(f"  - Not found: {len(not_found_yolo)}")
print(f"  - Label class changes: {changed_count}")
print(f"  - Tiny labels removed: {removed_tiny_count}")

print(f"\nAll removed image names (base):")
for name in sorted(set(to_remove_both + to_remove_cls_only)):
    print(f"  - {name}")


SUMMARY

CLS Dataset:
  - Removed: 365 files
  - Not found: 0

YOLO Dataset:
  - Removed: 0 files
  - Not found: 17
  - Label class changes: 14
  - Tiny labels removed: 6

All removed image names (base):
  - crowdhuman_train_011771
  - dangerous_test_000168
  - dangerous_test_000487
  - dangerous_train_000424
  - dangerous_train_000706
  - dangerous_train_002894
  - dangerous_train_003229
  - dangerous_train_003287
  - dangerous_train_003404
  - mp_94
  - voc_test_001274
  - voc_train_005904
  - voc_train_007628
  - youtube_test_001725
  - youtube_test_001833
  - youtube_test_001853
  - youtube_test_001915
  - youtube_train_008184
  - youtube_train_008275
  - youtube_train_008620
  - youtube_train_008931
  - youtube_train_010154
  - youtube_train_010819
  - youtube_valid_001774
