In [None]:
import os
import cv2
import xml.etree.ElementTree as ET

# Smallest bounding box accepted as a usable annotation, in pixels.
MIN_BOX_HEIGHT = 10
MIN_BOX_WIDTH = 10

# Dataset locations: image files and their Pascal VOC XML annotations.
annotations_dir = "/annotations"
images_dir = "/images"

def is_annotation_valid(xml_path, img_width, img_height, *,
                        min_width=None, min_height=None):
    """
    Validate a Pascal VOC annotation file against image size and thresholds.

    The annotation is valid only if the file parses, contains at least one
    ``object`` element, and every object's ``bndbox`` satisfies
    ``0 <= xmin < xmax <= img_width`` and ``0 <= ymin < ymax <= img_height``
    and is at least ``min_width`` wide and ``min_height`` tall.

    Coordinates are compared as floats rather than being truncated to int
    first, so a fractional value such as ``xmin = -0.5`` or
    ``xmax = img_width + 0.9`` is rejected instead of being rounded toward
    zero into the valid range.

    Args:
        xml_path: Path of the annotation XML file.
        img_width: Width of the matching image, in pixels.
        img_height: Height of the matching image, in pixels.
        min_width: Minimum accepted box width in pixels; defaults to
            ``MIN_BOX_WIDTH`` when omitted.
        min_height: Minimum accepted box height in pixels; defaults to
            ``MIN_BOX_HEIGHT`` when omitted.

    Returns:
        True if every box passes every check; False otherwise, including
        when the file is missing or cannot be parsed.
    """
    if min_width is None:
        min_width = MIN_BOX_WIDTH
    if min_height is None:
        min_height = MIN_BOX_HEIGHT

    try:
        root = ET.parse(xml_path).getroot()
    except Exception:
        # Deliberately broad: a missing, unreadable or malformed file (in
        # whatever way the parser reports it) simply means "not valid".
        return False

    objects = root.findall("object")
    if not objects:  # No annotations
        return False

    for obj in objects:
        bndbox = obj.find("bndbox")
        if bndbox is None:
            return False

        try:
            # findtext() yields None for a missing tag (TypeError in float)
            # and "" for an empty one (ValueError in float).
            xmin, ymin, xmax, ymax = (
                float(bndbox.findtext(tag))
                for tag in ("xmin", "ymin", "xmax", "ymax")
            )
        except (TypeError, ValueError):
            return False

        # Positive-form chained comparisons: any NaN makes the chain False,
        # and +/-inf falls outside the bounds, so non-finite values are
        # rejected here without a separate check.
        inside_x = 0 <= xmin < xmax <= img_width
        inside_y = 0 <= ymin < ymax <= img_height
        if not (inside_x and inside_y):
            return False

        # Check if box is too small
        if (xmax - xmin) < min_width or (ymax - ymin) < min_height:
            return False

    return True


def delete_file_pair(image_path, annotation_path):
    """
    Remove an image and its annotation from disk, skipping whichever is absent.

    The image is handled first, then the annotation; a line is printed for
    each file that was actually removed.
    """
    labelled_paths = (
        ("image", image_path),
        ("annotation", annotation_path),
    )
    for label, path in labelled_paths:
        if not os.path.exists(path):
            continue
        os.remove(path)
        print(f"Deleted {label}: {path}")


# Clean the dataset: walk every entry in images_dir and delete any
# image/annotation pair that is incomplete, unreadable, or fails validation.
# WARNING: deletions go through os.remove (via delete_file_pair) and are
# permanent.
for image_file in os.listdir(images_dir):
    # Only entries with a known image extension are considered
    # (the extension match is case-insensitive).
    if not image_file.lower().endswith((".jpg", ".jpeg", ".png")):
        continue

    # The annotation is looked up in annotations_dir under the image's
    # file name with its extension replaced by ".xml".
    image_path = os.path.join(images_dir, image_file)
    xml_file = os.path.splitext(image_file)[0] + ".xml"
    xml_path = os.path.join(annotations_dir, xml_file)

    # No annotation file for this image: delete the image.
    if not os.path.exists(xml_path):
        print(f"Missing annotation for {image_file}, deleting...")
        delete_file_pair(image_path, xml_path)
        continue

    # Load the image to get its size; a None result is treated as an
    # unreadable image and both files are deleted.
    img = cv2.imread(image_path)
    if img is None:
        print(f"Unreadable image: {image_file}, deleting...")
        delete_file_pair(image_path, xml_path)
        continue
    # The first two entries of the array shape are taken as (height, width).
    img_height, img_width = img.shape[:2]

    # Validate the annotation against the image size; on failure both the
    # image and its annotation are deleted.
    if not is_annotation_valid(xml_path, img_width, img_height):
        print(f"Invalid annotation for {image_file}, deleting...")
        delete_file_pair(image_path, xml_path)
