In [2]:
import os
import numpy as np
import pandas as pd
from PIL import Image

# Locations of the dataset on disk, relative to the notebook.
IMAGES_DIR = "../data/images"
MASKS_DIR = "../data/masks"

# List the image and mask file names in sorted order. Only names that end in
# the expected "_image"/"_mask" suffix with a .jpg or .png extension are kept
# (the match is case-sensitive).
image_files = sorted(
    name
    for name in os.listdir(IMAGES_DIR)
    if name.endswith(("_image.jpg", "_image.png"))
)
mask_files = sorted(
    name
    for name in os.listdir(MASKS_DIR)
    if name.endswith(("_mask.jpg", "_mask.png"))
)

print("Total images found:", len(image_files))
print("Total masks found:", len(mask_files))

def get_id_from_filename(filename):
    """Return the sample ID encoded in a dataset file name.

    File names follow the pattern ``<id>_image.<ext>`` / ``<id>_mask.<ext>``,
    e.g. ``0000_image.jpg`` -> ``0000``.

    The name is split at the *last* underscore so that IDs which themselves
    contain underscores stay intact: ``patient_01_image.jpg`` yields
    ``patient_01`` instead of being truncated to ``patient`` (which would make
    different samples collide on the same key).

    Args:
        filename: Base name of an image or mask file.

    Returns:
        Everything before the last underscore, or the whole name when it
        contains no underscore.
    """
    return filename.rsplit("_", 1)[0]

# Column-oriented accumulator for the report: one list per output column.
# Every recorded issue appends exactly one value to each list.
issues = {
    column: []
    for column in (
        "filename",
        "issue_type",
        "image_width",
        "image_height",
        "mask_width",
        "mask_height",
        "mask_pixel_ratio",
    )
}

# Index the file names by ID for direct lookup. When two names share an ID,
# the one appearing later in the list replaces the earlier one.
image_dict = {get_id_from_filename(name): name for name in image_files}
mask_dict = {get_id_from_filename(name): name for name in mask_files}

# Every ID that has an image, a mask, or both.
all_ids = image_dict.keys() | mask_dict.keys()

# Coverage thresholds (tune to your dataset): a mask whose non-zero pixels
# cover less than 1% or more than 90% of the frame is reported as suspicious.
# They do not depend on the sample, so they are set once, before the loop.
min_ratio = 0.01
max_ratio = 0.90


def _record_issue(filename, issue_type, image_size=(None, None),
                  mask_size=(None, None), ratio=None):
    """Append one row to the column-oriented ``issues`` accumulator.

    Args:
        filename: File name (or ``ID=<id>`` placeholder) the issue refers to.
        issue_type: Human-readable description of the problem.
        image_size: ``(width, height)`` of the image; ``(None, None)`` if unknown.
        mask_size: ``(width, height)`` of the mask; ``(None, None)`` if unknown.
        ratio: Fraction of non-zero mask pixels, or None when not computed.
    """
    image_width, image_height = image_size
    mask_width, mask_height = mask_size
    row = {
        "filename": filename,
        "issue_type": issue_type,
        "image_width": image_width,
        "image_height": image_height,
        "mask_width": mask_width,
        "mask_height": mask_height,
        "mask_pixel_ratio": ratio,
    }
    for column, value in row.items():
        issues[column].append(value)


for idx in sorted(all_ids):
    # A sample needs both files; report whichever one is absent.
    if idx not in image_dict:
        _record_issue(f"ID={idx}", "Image missing")
        continue

    if idx not in mask_dict:
        _record_issue(f"ID={idx}", "Mask missing")
        continue

    # Load the image (size only) and the mask (size + pixels).
    img_path = os.path.join(IMAGES_DIR, image_dict[idx])
    mask_path = os.path.join(MASKS_DIR, mask_dict[idx])

    try:
        with Image.open(img_path) as img:
            img_width, img_height = img.size

        with Image.open(mask_path) as msk:
            mask_width, mask_height = msk.size
            # Collapse the mask to a single luminance band before counting.
            # Looking only at the first band of a multi-band mask misses
            # foreground stored in another band, and raw palette indices are
            # not pixel intensities. Alpha is ignored by this conversion.
            # NOTE: very dark (near-black) colours may map to 0 and count as
            # background.
            msk_array = np.array(msk.convert("L"))
    except Exception as e:
        # Best-effort scan: an unreadable file is reported as an issue
        # instead of aborting the whole run.
        _record_issue(image_dict[idx], f"Error reading files: {e}")
        continue

    # Image and mask must have identical dimensions.
    if (img_width, img_height) != (mask_width, mask_height):
        # The coverage ratio is not computed when the sizes disagree.
        _record_issue(
            image_dict[idx],
            "Dimension mismatch",
            image_size=(img_width, img_height),
            mask_size=(mask_width, mask_height),
        )
        continue

    # Fraction of "on" (non-zero) pixels, assuming an essentially binary mask.
    nonzero = np.count_nonzero(msk_array)
    total_pixels = msk_array.shape[0] * msk_array.shape[1]
    ratio = nonzero / total_pixels

    if ratio < min_ratio or ratio > max_ratio:
        issue_description = "Mask too small" if ratio < min_ratio else "Mask too large"
        _record_issue(
            image_dict[idx],
            issue_description,
            image_size=(img_width, img_height),
            mask_size=(mask_width, mask_height),
            ratio=ratio,
        )

# Convert to a DataFrame for analysis; as the last expression of the cell it
# is rendered as the cell output.
issues_df = pd.DataFrame(issues)
issues_df


Total images found: 150
Total masks found: 150


Unnamed: 0,filename,issue_type,image_width,image_height,mask_width,mask_height,mask_pixel_ratio
0,0077_image.jpg,Mask too large,2160,4032,2160,4032,0.956753


In [5]:
import shutil

quarantine_dir = "../data/quarantine"
# makedirs also creates the parent quarantine folder when it is missing.
os.makedirs(os.path.join(quarantine_dir, "images"), exist_ok=True)
os.makedirs(os.path.join(quarantine_dir, "masks"), exist_ok=True)

# Move every sample flagged with one of these issue types out of the dataset.
for row in issues_df.itertuples():
    if row.issue_type not in ("Dimension mismatch", "Mask too small", "Mask too large"):
        continue

    # Recover the sample ID from the recorded name. Rows of the issue types
    # above carry the image file name ("<id>_image.<ext>"), so the ID is
    # everything before the last underscore; "ID=<id>" placeholders (used by
    # the missing-file rows) are handled too in case the filter is extended.
    if row.filename.startswith("ID="):
        file_id = row.filename[len("ID="):]
    else:
        file_id = row.filename.rsplit("_", 1)[0]

    # The dataset scan accepts both .jpg and .png for images and masks, so
    # probe both extensions instead of assuming one per kind. Every existing
    # file for the ID is moved; files that are not found are skipped.
    for src_dir, kind, subfolder in (
        (IMAGES_DIR, "image", "images"),
        (MASKS_DIR, "mask", "masks"),
    ):
        for ext in (".jpg", ".png"):
            file_name = f"{file_id}_{kind}{ext}"
            src_path = os.path.join(src_dir, file_name)
            if os.path.exists(src_path):
                dst_path = os.path.join(quarantine_dir, subfolder, file_name)
                shutil.move(src_path, dst_path)