In [None]:
import os
import shutil
from pathlib import Path
import re

# Root of the CLS dataset; the CLS cells below walk this tree recursively.
cls_data_path = Path("/workspace/yolo_dataset_cls_5fold")
# Root of the YOLO dataset; the YOLO cells below look in
# <root>/images/<split> and <root>/labels/<split> for train/test/valid.
yolo_data_path = Path("/workspace/yolo_dataset_4_dec")

In [None]:
# Build `to_remove`: the unique stems (file name without directory or
# extension) of every image listed in the cleanup file.
# Only lines that end in ".jpg" after stripping whitespace are considered.
# NOTE(review): the whole stripped line is treated as a path, so a markdown
# bullet prefix such as "- name.jpg" would end up inside the stem -- confirm
# the cleanup file lists bare file names/paths.
cleanup_md = Path("/workspace/yolo_dangerous_weapons/outliers/cleanup.md")

# Explicit encoding so parsing does not depend on the kernel's locale.
with open(cleanup_md, 'r', encoding='utf-8') as f:
    flagged_stems = [
        Path(entry).stem
        for entry in (raw_line.strip() for raw_line in f)
        if entry.endswith('.jpg')
    ]

# dict.fromkeys drops duplicates while keeping first-seen order.
to_remove = list(dict.fromkeys(flagged_stems))

print(f"Images to remove: {len(to_remove)}")
print("\nList of images:")
for img in sorted(to_remove):
    print(f"  - {img}")


In [None]:
def count_cls_images(path):
    """Return the number of image files found anywhere under ``path``.

    The tree is traversed recursively with ``os.walk``. A file counts as an
    image when its name ends in ``.jpg``, ``.jpeg`` or ``.png``
    (case-sensitive comparison).
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    total = 0
    for _root, _dirs, file_names in os.walk(path):
        for file_name in file_names:
            if file_name.endswith(image_suffixes):
                total += 1
    return total

# Baseline CLS image count; run before the removal cell so it can be compared
# with the "after" count.
print(f"CLS dataset before: {count_cls_images(cls_data_path)} images")


In [None]:
# Delete from the CLS tree every image whose file name starts with one of the
# stems in `to_remove`.
#   removed_cls   -- string paths of the files that were deleted
#   not_found_cls -- stems for which no file was matched
removed_cls = []
not_found_cls = []

# Walk the tree once up front instead of once per stem.
cls_image_paths = [
    Path(root) / file_name
    for root, _dirs, file_names in os.walk(cls_data_path)
    for file_name in file_names
    if file_name.endswith(('.jpg', '.jpeg', '.png'))
]

for base_name in to_remove:
    # NOTE(review): this is a prefix match, so stem "img_1" also matches
    # "img_10.jpg". Presumably intentional (files derived from one source
    # image sharing its stem as a prefix) -- confirm against the naming
    # scheme of the CLS dataset.
    matches = [p for p in cls_image_paths if p.name.startswith(base_name)]
    for file_path in matches:
        print(f"Removing from CLS: {file_path}")
        file_path.unlink()
        removed_cls.append(str(file_path))
    if matches:
        # Forget deleted paths so a later stem cannot try to unlink them again.
        deleted = set(matches)
        cls_image_paths = [p for p in cls_image_paths if p not in deleted]
    else:
        not_found_cls.append(base_name)

print(f"\nRemoved from CLS: {len(removed_cls)} files")
print(f"Not found in CLS: {len(not_found_cls)} files")
if not_found_cls:
    print(f"  Missing: {not_found_cls}")


In [None]:
# Recount the CLS tree after the removal cell to confirm how many images remain.
print(f"CLS dataset after: {count_cls_images(cls_data_path)} images")


In [None]:
def count_yolo_images(path):
    """Count the images in a YOLO dataset laid out as ``images/<split>``.

    Looks directly inside ``<path>/images/train``, ``.../test`` and
    ``.../valid`` (no recursion); splits whose directory is missing are
    skipped. An entry counts as an image when its name ends in ``.jpg``,
    ``.jpeg`` or ``.png`` -- the same extensions ``count_cls_images`` uses.

    Args:
        path: Dataset root, as a ``str`` or ``Path``.

    Returns:
        Total number of image entries across the three splits.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    dataset_root = Path(path)
    count = 0
    for split in ['train', 'test', 'valid']:
        img_dir = dataset_root / 'images' / split
        if img_dir.is_dir():
            # Single pass over the directory; no intermediate lists.
            count += sum(
                1 for entry in img_dir.iterdir()
                if entry.name.endswith(image_suffixes)
            )
    return count

# Baseline YOLO image count; run before the removal cell so it can be compared
# with the "after" count.
print(f"YOLO dataset before: {count_yolo_images(yolo_data_path)} images")


In [None]:
# Delete from the YOLO dataset every file named <stem><ext> for each stem in
# `to_remove`, in both the images/ and labels/ folders of every split.
#   removed_yolo   -- string paths of the files that were deleted
#   not_found_yolo -- stems for which neither an image nor a label was found
removed_yolo = []
not_found_yolo = []

# Resolve the directories once; splits or folders that do not exist are skipped.
yolo_dirs = [
    yolo_data_path / folder / split
    for split in ['train', 'test', 'valid']
    for folder in ['images', 'labels']
]
yolo_dirs = [base_dir for base_dir in yolo_dirs if base_dir.is_dir()]

# '.jpeg' is included so the removal covers the same image extensions as the
# CLS removal; '.txt' is tried alongside the image extensions in every folder.
yolo_exts = ['.jpg', '.jpeg', '.png', '.txt']

for base_name in to_remove:
    found = False
    for base_dir in yolo_dirs:
        for ext in yolo_exts:
            file_path = base_dir / f"{base_name}{ext}"
            if file_path.exists():
                print(f"Removing from YOLO: {file_path}")
                file_path.unlink()
                removed_yolo.append(str(file_path))
                found = True
    if not found:
        not_found_yolo.append(base_name)

print(f"\nRemoved from YOLO: {len(removed_yolo)} files")
print(f"Not found in YOLO: {len(not_found_yolo)} files")
if not_found_yolo:
    print(f"  Missing: {not_found_yolo}")


In [None]:
# Recount the YOLO dataset after the removal cell to confirm how many images remain.
print(f"YOLO dataset after: {count_yolo_images(yolo_data_path)} images")

In [None]:
# Final report: totals from both removal cells plus the list of flagged stems.
banner = "=" * 60
print(banner)
print("SUMMARY")
print(banner)
print(f"\nImages to remove: {len(to_remove)}")

# Per-dataset totals collected by the CLS removal cell.
cls_removed_total = len(removed_cls)
cls_missing_total = len(not_found_cls)
print(f"\nCLS Dataset:")
print(f"  - Files removed: {cls_removed_total}")
print(f"  - Not found: {cls_missing_total}")

# Per-dataset totals collected by the YOLO removal cell.
yolo_removed_total = len(removed_yolo)
yolo_missing_total = len(not_found_yolo)
print(f"\nYOLO Dataset:")
print(f"  - Files removed: {yolo_removed_total}")
print(f"  - Not found: {yolo_missing_total}")

# Stems are listed alphabetically, one per line.
print(f"\nRemoved image names (base):")
for image_stem in sorted(to_remove):
    print(f"  - {image_stem}")
