In [None]:
# Location of the JSON cleanup report (removed tiny crops) that the next cell loads.
json_path = (
    "/workspace/yolo_dataset_cls_5fold/cleanup_reports/removed_tiny_crops_20251220_091435.json"
)

In [None]:
import json

# Load the cleanup report. The encoding is pinned to UTF-8 (the standard JSON
# encoding) so the result does not depend on the platform's locale default.
with open(json_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Show a compact per-key overview instead of dumping the whole report, which
# floods the cell output when many images are listed.
print(f"Loaded cleanup report: {json_path}")
for key, value in data.items():
    if isinstance(value, (dict, list)):
        print(f"  {key}: {type(value).__name__} with {len(value)} entries")
    else:
        print(f"  {key}: {value}")

In [None]:
# The keys of the report's 'by_source_image' mapping are the source image
# names that the cells below iterate over.
all_img_path = list(data['by_source_image'])

In [None]:
import cv2
import numpy as np
from pathlib import Path
import shutil

# Class names and box colours, both indexed by the class id read from a label
# line. Colours are (B, G, R) tuples because they are passed to OpenCV drawing
# calls.
class_names = ['knife', 'gun', 'rifle', 'baseball_bat']
colors = [
    (255, 0, 255),  # knife - Magenta
    (0, 0, 255),    # gun - Red
    (0, 255, 255),  # rifle - Yellow
    (255, 0, 0)     # baseball_bat - Blue
]

# Dataset that is inspected, and the folder that receives the annotated images.
yolo_dataset = Path("/workspace/yolo_dataset_4_dec")
output_dir = Path("/workspace/small_crops_visualization")

# Recreate the output folder from scratch so images written by an earlier run
# are not mixed with the ones produced now.
if output_dir.exists():
    shutil.rmtree(output_dir)
output_dir.mkdir(parents=True)

print(f"Output directory: {output_dir}")
print(f"Total unique source images to process: {len(all_img_path)}")


In [None]:
def extract_real_image_name(source_name, dataset_path=None):
    """Map a report ``source_image`` entry to a dataset image base name.

    Names that do not start with ``crop`` are returned with their image
    extension removed.  Names of the form ``crop_<id>...`` only carry the
    image id, so the base name is rebuilt as ``<prefix>_<split>_<id>`` from
    the known prefixes and splits.

    Previously the nested prefix/split loops returned on their very first
    iteration, so the remaining candidates were unreachable.  The candidates
    are now built explicitly and, when ``dataset_path`` is supplied, checked
    against the files on disk.

    Args:
        source_name: Key taken from the report, e.g. ``crop_000123_0.jpg`` or
            ``voc_valid_000010.png``.
        dataset_path: Optional dataset root containing ``images/<split>/``.
            When given, the first candidate whose image file exists in any
            split folder is returned.  When omitted, or when no candidate
            exists on disk, the first candidate (``dangerous_train_<id>``) is
            returned, which is what this function always returned before.

    Returns:
        The base name (without extension) to look up in the dataset.
    """
    # '.jpeg' is included because the dataset lookup accepts it as well.
    image_exts = ('.jpg', '.png', '.jpeg')

    def strip_ext(name):
        # Substring removal, as in the original implementation.
        for ext in image_exts:
            name = name.replace(ext, '')
        return name

    if source_name.startswith('crop'):
        parts = source_name.split('_')
        if len(parts) >= 2:
            img_id = strip_ext(parts[1])
            prefixes = ['dangerous', 'youtube', 'voc', 'crowdhuman', 'cellphone']
            splits = ['train', 'test', 'valid']
            candidates = [
                f"{prefix}_{split}_{img_id}"
                for prefix in prefixes
                for split in splits
            ]

            if dataset_path is not None:
                images_root = Path(dataset_path) / 'images'
                for candidate in candidates:
                    # The split in the name need not match the folder the file
                    # lives in, so every split folder is searched.
                    for split in splits:
                        for ext in image_exts:
                            if (images_root / split / f"{candidate}{ext}").exists():
                                return candidate

            # Backward-compatible default: the first candidate.
            return candidates[0]

    return strip_ext(source_name)

def find_image_and_label(base_name, dataset_path):
    """Locate ``base_name`` inside an ``images``/``labels`` dataset tree.

    Splits are searched in the order train, test, valid and, inside a split,
    extensions in the order .jpg, .png, .jpeg; the first existing image wins.

    Returns:
        ``(image_path, label_path)`` for the first match, with ``label_path``
        set to ``None`` when ``labels/<split>/<base_name>.txt`` is missing, or
        ``(None, None)`` when no image exists in any split.
    """
    split_names = ('train', 'test', 'valid')
    suffixes = ('.jpg', '.png', '.jpeg')

    for split_name in split_names:
        images_folder = dataset_path / 'images' / split_name
        labels_folder = dataset_path / 'labels' / split_name

        for suffix in suffixes:
            image_file = images_folder / f"{base_name}{suffix}"
            if not image_file.exists():
                continue

            # The label must sit in the same split as the image.
            label_file = labels_folder / f"{base_name}.txt"
            if label_file.exists():
                return image_file, label_file
            return image_file, None

    return None, None

# Smoke-test the two helpers on the first report entry before the full run.
# An empty report is reported instead of raising IndexError on all_img_path[0].
if all_img_path:
    test_name = all_img_path[0]
    real_name = extract_real_image_name(test_name)
    print("Test extraction:")
    print(f"  Source: {test_name}")
    print(f"  Extracted: {real_name}")

    img_path, label_path = find_image_and_label(real_name, yolo_dataset)
    print(f"  Image: {img_path}")
    print(f"  Label: {label_path}")
else:
    print("Test extraction skipped: the report lists no source images")


In [None]:
def draw_bboxes(image_path, label_path, output_path):
    """Draw the labelled boxes of one image and save the annotated copy.

    Every label line is read as ``class_id x_center y_center width height``
    where the four geometry values are fractions of the image size; lines with
    fewer than five fields are skipped.  Each box is drawn with its class name
    above it and its pixel size (``WxH``) below it.  When no box was drawn the
    image is stamped with "HARD NEGATIVE".

    Args:
        image_path: Image file to read.
        label_path: Label file for the image, or ``None`` when there is none.
        output_path: Where the annotated image is written.

    Returns:
        ``True`` when the annotated image was written, ``False`` when the
        image could not be read or the result could not be written.
    """
    img = cv2.imread(str(image_path))
    if img is None:
        print(f"Failed to read image: {image_path}")
        return False

    h, w = img.shape[:2]
    has_boxes = False

    if label_path and label_path.exists():
        with open(label_path, 'r') as f:
            lines = f.readlines()

        for line in lines:
            parts = line.strip().split()
            if len(parts) < 5:
                continue

            has_boxes = True
            class_id = int(parts[0])
            x_center, y_center, width, height = map(float, parts[1:5])

            # Convert the centre/size fractions to pixel corner coordinates.
            x1 = int((x_center - width/2) * w)
            y1 = int((y_center - height/2) * h)
            x2 = int((x_center + width/2) * w)
            y2 = int((y_center + height/2) * h)

            color = colors[class_id % len(colors)]
            cv2.rectangle(img, (x1, y1), (x2, y2), color, 2)

            # The colour lookup wraps around for unknown ids, so the name
            # lookup must not raise IndexError for them either.
            if 0 <= class_id < len(class_names):
                label_text = f"{class_names[class_id]}"
            else:
                label_text = f"class_{class_id}"
            cv2.putText(img, label_text, (x1, y1-5),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

            bbox_w = x2 - x1
            bbox_h = y2 - y1
            size_text = f"{bbox_w}x{bbox_h}"
            cv2.putText(img, size_text, (x1, y2+15),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.4, color, 1)

    if not has_boxes:
        cv2.putText(img, "HARD NEGATIVE", (10, 30),
                   cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 3)

    # cv2.imwrite signals failure through its return value; propagate it so
    # the caller does not count an unwritten image as processed.
    if not cv2.imwrite(str(output_path), img):
        print(f"Failed to write image: {output_path}")
        return False
    return True

print("Functions defined successfully")


In [None]:
# Annotate every image listed in the report and tally the outcomes.
processed = 0
not_found = []  # (report name, derived base name) pairs with no image in the dataset
failed = []     # base names for which draw_bboxes returned False

for source_name in all_img_path:
    base_name = extract_real_image_name(source_name)
    img_path, label_path = find_image_and_label(base_name, yolo_dataset)

    if img_path is None:
        not_found.append((source_name, base_name))
        continue

    output_path = output_dir / f"{base_name}.jpg"
    if not draw_bboxes(img_path, label_path, output_path):
        failed.append(base_name)
        continue

    processed += 1
    # Progress line every ten successfully written images.
    if processed % 10 == 0:
        print(f"Processed {processed}/{len(all_img_path)} images...")

print(f"\n{'='*60}")
print(f"Processing complete!")
print(f"{'='*60}")
print(f"Total images: {len(all_img_path)}")
print(f"Successfully processed: {processed}")
print(f"Not found: {len(not_found)}")
print(f"Failed to process: {len(failed)}")
print(f"\nOutput directory: {output_dir}")

# List at most the first ten unresolved names, then summarise the rest.
if len(not_found) > 0:
    print(f"\nNot found ({len(not_found)} images):")
    for src, base in not_found[:10]:
        print(f"  {src} -> {base}")
    if len(not_found) > 10:
        print(f"  ... and {len(not_found) - 10} more")


In [None]:
# Inventory of the annotated images written to the output folder.
saved_images = sorted(output_dir.glob('*.jpg'))

print(f"{'='*60}")
print(f"SAVED IMAGES SUMMARY")
print(f"{'='*60}")
print(f"\nTotal saved images: {len(saved_images)}")
print(f"Location: {output_dir}")
print(f"\nFirst 10 saved images:")
for saved_path in saved_images[:10]:
    # File size on disk, converted from bytes to kilobytes.
    size_kb = saved_path.stat().st_size / 1024
    print(f"  {saved_path.name} ({size_kb:.1f} KB)")

if len(saved_images) > 10:
    print(f"  ... and {len(saved_images) - 10} more images")

print(f"\n{'='*60}")
print(f"LEGEND")
print(f"{'='*60}")
# One legend row per class: id, name and the colour tuple used for its boxes.
for idx, (label, bgr) in enumerate(zip(class_names, colors)):
    print(f"  {idx}: {label} - {bgr}")

print(f"\nVisualization:")
print(f"  - Bounding boxes show class name at top")
print(f"  - Bbox dimensions (WxH) shown at bottom")
print(f"  - Images with 'HARD NEGATIVE' label have empty label files (no objects)")


In [None]:
def count_yolo_images(path, extensions=('.jpg', '.png', '.jpeg')):
    """Count the image files under ``path/images/{train,test,valid}``.

    The default extensions match the ones used when images are looked up and
    removed elsewhere in this notebook; previously ``.jpeg`` files were left
    out of the count.

    Args:
        path: Dataset root (``Path`` or string).
        extensions: File suffixes, including the dot, that count as images.

    Returns:
        Total number of matching files over the split folders that exist.
    """
    root = Path(path)
    count = 0
    for split in ['train', 'test', 'valid']:
        img_dir = root / 'images' / split
        if not img_dir.exists():
            continue
        for ext in extensions:
            count += sum(1 for _ in img_dir.glob(f'*{ext}'))
    return count

# Image count of the dataset at this point, i.e. before the removal cell runs.
images_before_removal = count_yolo_images(yolo_dataset)
print(f"YOLO dataset before removal: {images_before_removal} images")


In [None]:
# Base names that are never deleted, even when the report lists them.
exclude_from_removal = [
    'dangerous_train_000059',
    'dangerous_train_000107',
    'dangerous_train_000156',
    'dangerous_train_000165',
    'dangerous_train_000404',
    'dangerous_train_000513',
    'dangerous_train_000797',
    'dangerous_train_001086',
    'dangerous_train_001543',
    'dangerous_train_003563'
]
# Set copy for constant-time membership tests; the list itself is left as is.
excluded_lookup = set(exclude_from_removal)

images_to_remove = []
excluded_images = []
seen_base_names = set()

for source_name in all_img_path:
    base_name = extract_real_image_name(source_name)

    # Several report entries can resolve to the same dataset image (crop names
    # only keep the image id). Handle each base name once so the counts are
    # not inflated and the removal step does not look for a file twice.
    if base_name in seen_base_names:
        continue
    seen_base_names.add(base_name)

    img_path, label_path = find_image_and_label(base_name, yolo_dataset)

    # Entries without an image in the dataset are neither removed nor excluded.
    if img_path is None:
        continue

    if base_name in excluded_lookup:
        excluded_images.append(base_name)
        print(f"Excluding from removal: {base_name}")
    else:
        images_to_remove.append(base_name)

print(f"\n{'='*60}")
print(f"Images to remove from YOLO dataset: {len(images_to_remove)}")
print(f"Images excluded from removal: {len(excluded_images)}")
print(f"\nList of images to remove:")
for img_name in sorted(images_to_remove):
    print(f"  - {img_name}")


In [None]:
# Delete the image and label files of every selected base name.
removed_files = []
not_found_for_removal = []

for base_name in images_to_remove:
    found = False

    for split in ['train', 'test', 'valid']:
        # Images are handled before labels inside each split; label files only
        # ever use the .txt extension.
        for folder, exts in (('images', ['.jpg', '.png', '.jpeg']), ('labels', ['.txt'])):
            base_dir = yolo_dataset / folder / split
            if not base_dir.exists():
                continue

            for ext in exts:
                file_path = base_dir / f"{base_name}{ext}"
                if not file_path.exists():
                    continue

                print(f"Removing: {file_path}")
                file_path.unlink()
                removed_files.append(str(file_path))
                found = True

    # Nothing was deleted for this base name in any split or folder.
    if not found:
        not_found_for_removal.append(base_name)

print(f"\n{'='*60}")
print(f"REMOVAL COMPLETE")
print(f"{'='*60}")
print(f"Total files removed: {len(removed_files)}")
print(f"Images not found for removal: {len(not_found_for_removal)}")

if len(not_found_for_removal) > 0:
    print(f"\nNot found:")
    for name in not_found_for_removal:
        print(f"  - {name}")


In [None]:
# Image count after the removal cell has run.
print(f"YOLO dataset after removal: {count_yolo_images(yolo_dataset)} images")

print(f"\n{'='*60}")
print(f"SUMMARY")
print(f"{'='*60}")
print(f"Images identified with tiny crops: {len(all_img_path)}")
# Every image located in the dataset was either queued for removal or put on
# the exclusion list, so "found" is the sum of both; the previous figure left
# the excluded images out.
print(f"Images found in YOLO dataset: {len(images_to_remove) + len(excluded_images)}")
print(f"Images excluded from removal: {len(excluded_images)}")
print(f"Images selected for removal: {len(images_to_remove)}")
print(f"Total files removed (images + labels): {len(removed_files)}")
print(f"\nThese images had bounding boxes smaller than 16x16 pixels or area < 256 pixels")
print(f"Visualizations saved to: {output_dir}")
