In [3]:
import os
import xml.etree.ElementTree as ET
from pathlib import Path
import numpy as np
from PIL import Image

# Dataset base paths; the "{}" placeholder is filled with the class name.
DATASET_PAIRS = {
    "UAM": "/home/mdb/DL_Lab3/UAM_DATASET/stratified/{}/",
    "RIVAS": "/home/mdb/DL_Lab3/RIVAS_DATASET/DATASET/{}/",
}

CLASSES = ["Containers", "Crosswalks", "Rubish"]
SPLITS = ["image_train", "image_test", "image_query"]
# Label files looked up directly inside each class directory.
LABEL_FILES = ["train_label.xml", "test_label.xml", "query_label.xml"]
# File-name endings (compared case-insensitively) that count as images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")


def _read_image_sizes(split_path):
    """Return the (height, width) of every readable image in *split_path*.

    Only direct children whose lower-cased name ends with one of
    IMAGE_EXTENSIONS are opened. An image that cannot be opened is reported
    with a warning on stdout and skipped, so one bad file does not abort
    the whole exploration.

    Args:
        split_path: ``pathlib.Path`` of an existing directory.

    Returns:
        A list of ``(height, width)`` tuples, one per image opened.
    """
    sizes = []
    for entry in split_path.iterdir():
        if not entry.name.lower().endswith(IMAGE_EXTENSIONS):
            continue
        try:
            with Image.open(entry) as img:
                # PIL exposes size as (width, height); store it as (h, w).
                sizes.append(img.size[::-1])
        except Exception as e:  # best-effort scan: warn and keep going
            print(f"[WARNING] Error leyendo imagen {entry.name}: {e}")
    return sizes


def _read_object_ids(xml_path):
    """Return the ``objectID`` of every ``<Item>`` element in *xml_path*.

    ``<Item>`` elements are collected at any depth below the root, so the
    result does not depend on the root tag or on whether the items are
    wrapped in an ``<Items>`` container. Items without an ``objectID``
    attribute are ignored.

    Args:
        xml_path: path of the XML label file.

    Returns:
        A list of ``objectID`` strings in document order, duplicates kept.

    Raises:
        Whatever ``xml.etree.ElementTree.parse`` raises for an unreadable
        or malformed file.
    """
    root = ET.parse(xml_path).getroot()
    return [
        item.attrib["objectID"]
        for item in root.iter("Item")
        if "objectID" in item.attrib
    ]


print("\n===== INICIANDO EDA COMPLETO =====\n")

for dataset_name, base_path in DATASET_PAIRS.items():
    print(f"===== Dataset: {dataset_name} =====\n")

    for class_name in CLASSES:
        class_path = Path(base_path.format(class_name))
        print(f"--- Clase: {class_name} ---")

        # Image statistics for each split directory.
        for split in SPLITS:
            split_path = class_path / split
            # is_dir() also rejects a plain file with the split's name,
            # which would otherwise make the directory listing raise.
            if not split_path.is_dir():
                print(f"  {split}: No existe")
                continue

            image_sizes = np.array(_read_image_sizes(split_path))
            if len(image_sizes) > 0:
                # axis=0 aggregates height and width independently.
                mean_size = np.mean(image_sizes, axis=0)
                min_size = np.min(image_sizes, axis=0)
                max_size = np.max(image_sizes, axis=0)
                print(f"{split}: {len(image_sizes)} imágenes")
                print(f"  Tamaño medio: {mean_size}")
                print(f"  Tamaño mínimo: {min_size}")
                print(f"  Tamaño máximo: {max_size}")
            else:
                print(f"{split}: 0 imágenes")

        # Read the XML label files.
        for xml_name in LABEL_FILES:
            xml_path = class_path / xml_name
            if not xml_path.exists():
                print(f"{xml_name}: No existe")
                continue

            try:
                ids = _read_object_ids(xml_path)
            except Exception as e:  # best-effort scan: warn and keep going
                print(f"[WARNING] Error leyendo {xml_name}: {e}")
            else:
                unique_ids = len(set(ids))
                total_items = len(ids)
                print(f"{xml_name}: {unique_ids} IDs únicos, {total_items} imágenes")
        print()

print("\n=== EDA COMPLETADO === 🚀\n")



===== INICIANDO EDA COMPLETO =====

===== Dataset: UAM =====

--- Clase: Containers ---
image_train: 1697 imágenes
  Tamaño medio: [129.82144962 103.90218032]
  Tamaño mínimo: [12 13]
  Tamaño máximo: [816 549]
image_test: 272 imágenes
  Tamaño medio: [125.01838235 103.97426471]
  Tamaño mínimo: [18 17]
  Tamaño máximo: [ 616 1131]
image_query: 98 imágenes
  Tamaño medio: [63.84693878 56.47959184]
  Tamaño mínimo: [17 12]
  Tamaño máximo: [371 343]
train_label.xml: 83 IDs únicos, 1585 imágenes
test_label.xml: 0 IDs únicos, 0 imágenes
query_label.xml: 0 IDs únicos, 0 imágenes

--- Clase: Crosswalks ---
image_train: 1352 imágenes
  Tamaño medio: [118.03328402 598.69748521]
  Tamaño mínimo: [16 23]
  Tamaño máximo: [ 527 1920]
image_test: 350 imágenes
  Tamaño medio: [123.26285714 691.82      ]
  Tamaño mínimo: [13 42]
  Tamaño máximo: [ 520 1920]
image_query: 90 imágenes
  Tamaño medio: [156.28888889 738.62222222]
  Tamaño mínimo: [17 87]
  Tamaño máximo: [ 523 1920]
train_label.xml: 10