In [1]:
import os
import shutil
import random
import numpy as np
from PIL import Image
from collections import defaultdict

# --- CONFIGURATION ---
# Every constant the later cells read is defined here, so the notebook runs
# top to bottom on a fresh kernel.

# Source dataset: images and their full label masks.
IMG_DIR = '/app/data/2026-01-19-defect_dataset/images'
LBL_DIR = '/app/data/2026-01-19-defect_dataset/labels_full'

# Number of images the greedy selection cell should pick.
# The recorded selection output reports 500 images.
TARGET_TOTAL = 500

# Destination folders used by the final copy cell.
# OUT_IMG was previously commented out although the copy cell requires it; the
# recorded copy output shows '/app/data/subset_500/images'.
OUT_IMG = f'/app/data/subset_{TARGET_TOTAL}/images'
OUT_LBL = '/app/data/2026-01-19-defect_dataset/labels_cracks'

# NOTE: MIN_PER_CLASS is deliberately not defined here. It is only referenced by
# the rare-class filter cell, which is currently disabled; set it here before
# re-enabling that cell.


In [2]:
# Scan every label mask and build two lookup tables:
#   class_to_images  : class id -> list of label filenames whose mask contains it
#   image_to_classes : label filename -> set of class ids present in its mask

class_to_images = defaultdict(list)
image_to_classes = defaultdict(set)

# Get all label files. os.listdir returns entries in arbitrary order, so the
# list is sorted to make every later step see the files in a stable order.
try:
    label_files = sorted(
        f for f in os.listdir(LBL_DIR) if f.endswith(('.png', '.jpg', '.jpeg'))
    )
except FileNotFoundError as exc:
    # Stop here with a clear message. Printing and continuing would only fail a
    # few lines later with a confusing NameError on label_files.
    raise FileNotFoundError(
        f"Error: Could not find label directory at {LBL_DIR}"
    ) from exc

print(f"Scanning {len(label_files)} labels to identify defect types...")

for f in label_files:
    # The context manager closes the underlying file as soon as the pixels
    # have been converted into an array.
    with Image.open(os.path.join(LBL_DIR, f)) as label_img:
        mask = np.array(label_img.convert('L'))

    unique_classes = np.unique(mask)
    for cls_id in unique_classes:
        if cls_id != 0:  # Ignore background for the count
            # Store plain Python ints rather than numpy scalars so the keys
            # print, sort and serialize like ordinary integers.
            cls_id = int(cls_id)
            class_to_images[cls_id].append(f)
            image_to_classes[f].add(cls_id)
                

Scanning 7286 labels to identify defect types...


In [3]:
# Report how many label masks contain each class id, ordered by class id.
all_found_ids = sorted(class_to_images)

table_width = 35
header_row = f"{'Class ID':<10} | {'Original Count'}"
# One formatted row per class: the id left-aligned in 10 columns, then the
# number of label files recorded for that class.
body_rows = [
    f"{cls_id:<10} | {len(class_to_images[cls_id])}" for cls_id in all_found_ids
]

print("\n" + "=" * table_width)
print(header_row)
print("-" * table_width)
for row in body_rows:
    print(row)
print("=" * table_width + "\n")


Class ID   | Original Count
-----------------------------------
1          | 5403
2          | 1754
3          | 1408
4          | 336
5          | 1649
6          | 921
7          | 494
8          | 244
9          | 11
10         | 274
11         | 54
12         | 40
13         | 148
14         | 1
15         | 95
16         | 13



In [4]:
# DISABLED CELL: the rare-class filter below is wrapped in a triple-quoted
# string, so none of it executes; the cell only echoes the string as its output.
#
# If the string quotes were removed, the code would:
#   1. collect every class whose image list is shorter than MIN_PER_CLASS,
#   2. remove those class ids from image_to_classes for the affected images,
#   3. delete those classes from class_to_images.
#
# NOTE(review): MIN_PER_CLASS is not defined in any visible cell, so enabling
# this code as-is raises NameError. The recorded subset distribution further
# down contains no entries for classes 9, 14 and 16, which suggests this filter
# was active when those outputs were produced - confirm before relying on them.
'''
# Remove calsses under threshold
classes_to_remove = [cls for cls, imgs in class_to_images.items() if len(imgs) < MIN_PER_CLASS]
    
print(f"Removing {len(classes_to_remove)} classes with fewer than {MIN_PER_CLASS} instances: {classes_to_remove}")

for cls_id in classes_to_remove:
    # Get the list of images that contained this rare defect
    affected_images = class_to_images[cls_id]
    
    # Remove the class from the image-to-class mapping for those images
    for img in affected_images:
        if cls_id in image_to_classes[img]:
            image_to_classes[img].remove(cls_id)
    
    # Remove the class entry entirely from the dictionary
    del class_to_images[cls_id]
''' 

'\n# Remove calsses under threshold\nclasses_to_remove = [cls for cls, imgs in class_to_images.items() if len(imgs) < MIN_PER_CLASS]\n    \nprint(f"Removing {len(classes_to_remove)} classes with fewer than {MIN_PER_CLASS} instances: {classes_to_remove}")\n\nfor cls_id in classes_to_remove:\n    # Get the list of images that contained this rare defect\n    affected_images = class_to_images[cls_id]\n    \n    # Remove the class from the image-to-class mapping for those images\n    for img in affected_images:\n        if cls_id in image_to_classes[img]:\n            image_to_classes[img].remove(cls_id)\n    \n    # Remove the class entry entirely from the dictionary\n    del class_to_images[cls_id]\n'

In [21]:
# Select images with a greedy, class-balancing heuristic.
#
# Each round scores a random sample of the not-yet-selected label files by how
# far their classes still are from the per-class target, then keeps the
# best-scoring one. The loop stops when TARGET_TOTAL images are selected or no
# candidates remain.
#
# Reads from the kernel: TARGET_TOTAL, label_files, class_to_images,
# image_to_classes. Produces: target_per_class, selected_images, current_counts.

CANDIDATE_SAMPLE_SIZE = 500  # images scored per round; smaller is faster but greedier
SELECTION_SEED = 42          # fixed seed so re-running the cell yields the same subset

if not class_to_images:
    # Without this guard the division below fails with a bare ZeroDivisionError.
    raise ValueError("No defect classes found; cannot build a balanced subset.")

# A private generator keeps the selection reproducible without touching the
# global `random` state used elsewhere.
selection_rng = random.Random(SELECTION_SEED)

# 1. Define targets (uniform target for all classes)
target_per_class = TARGET_TOTAL // len(class_to_images)
selected_images = set()
current_counts = defaultdict(int)

# Pool of images that contain at least one class. It is built once and shrunk
# as images are picked, instead of rescanning every label file each round.
# Sorted so the seeded sampling does not depend on the order of label_files;
# .get() avoids inserting empty entries into the defaultdict.
remaining_candidates = sorted(f for f in label_files if image_to_classes.get(f))


def _deficit_score(img):
    """Sum, over the classes in `img`, of how many images each class still lacks.

    Classes already above target contribute negative values, so images made of
    over-represented classes are penalised.
    """
    return sum(target_per_class - current_counts[cls_id] for cls_id in image_to_classes[img])


# 2. Greedy selection loop
while len(selected_images) < TARGET_TOTAL and remaining_candidates:
    # Sample candidates for speed if the pool is huge
    sample_candidates = selection_rng.sample(
        remaining_candidates, min(len(remaining_candidates), CANDIDATE_SAMPLE_SIZE)
    )

    # max() returns the first maximal element, matching a strict `>` comparison.
    best_img = max(sample_candidates, key=_deficit_score)

    # Add the best image found and update the running per-class totals
    selected_images.add(best_img)
    remaining_candidates.remove(best_img)
    for cls_id in image_to_classes[best_img]:
        current_counts[cls_id] += 1

if len(selected_images) < TARGET_TOTAL:
    print(f"Warning: only {len(selected_images)} of {TARGET_TOTAL} requested images were available.")

print(f"Optimized selection complete: {len(selected_images)} images.")

Optimized selection complete: 500 images.


In [22]:
# New distribution: for each class id, count the selected images that contain it.
subset_class_counts = defaultdict(int)

# image_to_classes supplies the set of class ids recorded for each label file,
# so no mask has to be re-read here.
for label_name in selected_images:
    for present_cls in image_to_classes[label_name]:
        subset_class_counts[present_cls] += 1

rule_width = 40
print("\n" + "=" * rule_width)
print(f"{'Class ID':<10} | {'Subset Count (Final)'}")
print("-" * rule_width)

# Class ids are unique keys, so sorting the (id, count) pairs orders rows by id.
for cls_id, count in sorted(subset_class_counts.items()):
    print(f"{cls_id:<10} | {count}")

print("=" * rule_width)
print(f"Total Unique Images in Subset: {len(selected_images)}")


Class ID   | Subset Count (Final)
----------------------------------------
1          | 67
2          | 66
3          | 66
4          | 62
5          | 66
6          | 66
7          | 66
8          | 67
10         | 66
11         | 44
12         | 40
13         | 66
15         | 66
Total Unique Images in Subset: 500


In [26]:
# Save new subset: copy every selected label mask and its matching image into
# the output folders.
#
# Reads from the kernel: IMG_DIR, LBL_DIR, OUT_IMG, OUT_LBL, selected_images.
# `os` and `shutil` come from the import cell at the top of the notebook.

# Image extensions tried in order for each label; '.jpg' stays first so files
# that were found before are still resolved to the same image.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

os.makedirs(OUT_IMG, exist_ok=True)
os.makedirs(OUT_LBL, exist_ok=True)

print(f"Copying {len(selected_images)} images and labels...")

copied_count = 0
missing_labels = []

# Sorted so the copy order (and any warnings) is stable between runs.
for f in sorted(selected_images):
    # 'f' is the label filename (e.g., 'abc.png')
    base_name = os.path.splitext(f)[0]  # This gets 'abc'

    # 1. Find the image that shares the label's base name
    img_filename = next(
        (
            base_name + ext
            for ext in IMAGE_EXTENSIONS
            if os.path.exists(os.path.join(IMG_DIR, base_name + ext))
        ),
        None,
    )

    if img_filename is None:
        # Skip the pair entirely so no label is copied without its image.
        print(f"⚠️ Warning: Image not found for label {f}. Looked for: {base_name} with extensions {IMAGE_EXTENSIONS}")
        missing_labels.append(f)
        continue

    # 2. Paths
    src_img = os.path.join(IMG_DIR, img_filename)
    dst_img = os.path.join(OUT_IMG, img_filename)

    src_lbl = os.path.join(LBL_DIR, f)
    dst_lbl = os.path.join(OUT_LBL, f)

    # 3. Copy image and label together
    shutil.copy(src_img, dst_img)
    shutil.copy(src_lbl, dst_lbl)
    copied_count += 1

# Only claim success when every selected pair was actually copied.
if missing_labels:
    print(f"⚠️ Copied {copied_count} of {len(selected_images)} pairs; {len(missing_labels)} skipped because the image was missing.")
else:
    print(f"✅ Success! Your subset is ready in {OUT_IMG}")

Copying 500 images and labels...
✅ Success! Your subset is ready in /app/data/subset_500/images
