In [4]:
import cv2
import numpy as np
from pathlib import Path
import shutil

# =========================
# PATHS
# =========================
# Original folder holding the raw images, and the root for the classified copies.
INPUT_DIR = Path("../Imagenes_defectos")
OUTPUT_DIR = Path("data_imagenes/clasificados")

# One destination folder per class.
DEFECT_DIR = OUTPUT_DIR.joinpath("defecto")
OK_DIR = OUTPUT_DIR.joinpath("no_defecto")

# Create both class folders up front; existing folders are left as they are.
for _class_dir in (DEFECT_DIR, OK_DIR):
    _class_dir.mkdir(parents=True, exist_ok=True)

# =========================
# DECISION PARAMETERS
# =========================
# Gray level a mask pixel must exceed to be counted as "white".
WHITE_THRESHOLD = 10
# Minimum number of white pixels for a defect: a single one is enough.
MIN_WHITE_PIXELS = 1

# =========================
# FUNCIÓN: decidir defecto
# =========================
def is_defective(mask_path):
    """Decide whether a label mask marks a defect.

    The mask is loaded as a single-channel grayscale image. It counts as a
    defect when at least ``MIN_WHITE_PIXELS`` pixels are strictly brighter
    than ``WHITE_THRESHOLD``.

    Args:
        mask_path: Location of the mask image (``str`` or ``pathlib.Path``).

    Returns:
        bool: ``True`` when the mask holds enough bright pixels, ``False``
        otherwise. An unreadable mask also yields ``False``, but a
        ``RuntimeWarning`` is emitted so the problem is not silent.
    """
    import warnings

    mask = cv2.imread(str(mask_path), cv2.IMREAD_GRAYSCALE)
    if mask is None:
        # cv2.imread returns None instead of raising when the file is missing
        # or cannot be decoded. Keep the best-effort "not defective" answer,
        # but surface it so a corrupt mask does not quietly mislabel an image.
        warnings.warn(
            f"No se pudo leer la máscara: {mask_path}",
            RuntimeWarning,
            stacklevel=2,
        )
        return False

    white_pixels = np.count_nonzero(mask > WHITE_THRESHOLD)
    # Cast so callers get a plain Python bool rather than numpy.bool_.
    return bool(white_pixels >= MIN_WHITE_PIXELS)

# =========================
# PROCESO PRINCIPAL
# =========================
# Running counters for the final summary.
total = 0
defect_count = 0
ok_count = 0
missing_mask_count = 0

# Path.iterdir() and Path.glob() yield entries in arbitrary order; sorting
# keeps the processing order (and therefore the log) stable across runs.
for kos_dir in sorted(INPUT_DIR.iterdir()):
    if not kos_dir.is_dir():
        continue

    kos_name = kos_dir.name
    print(f"\nProcesando {kos_name}")

    for img_path in sorted(kos_dir.glob("Part*.jpg")):

        # The mask is looked up next to the image as "<stem>_label.bmp".
        mask_path = img_path.with_name(img_path.stem + "_label.bmp")

        if not mask_path.exists():
            print(f" Falta máscara para {img_path.name}")
            missing_mask_count += 1
            continue

        if is_defective(mask_path):
            target_dir, stale_dir = DEFECT_DIR, OK_DIR
            defect_count += 1
            label = "DEFECTO"
        else:
            target_dir, stale_dir = OK_DIR, DEFECT_DIR
            ok_count += 1
            label = "NO_DEFECTO"

        # Prefix with the folder name so files from different folders cannot collide.
        new_name = f"{kos_name}_{img_path.name}"

        # If an earlier run (e.g. with other thresholds) filed this image under
        # the opposite class, drop that copy so it never sits in both folders.
        stale_copy = stale_dir / new_name
        if stale_copy.exists():
            stale_copy.unlink()

        shutil.copy(img_path, target_dir / new_name)
        total += 1

        print(f"  {label}: {new_name}")

# =========================
# SUMMARY
# =========================
print("\n===== RESUMEN FINAL =====")
print(f"Total imágenes procesadas: {total}")
print(f"Defecto: {defect_count}")
print(f"No defecto: {ok_count}")
print(f"Sin máscara (omitidas): {missing_mask_count}")



Procesando kos01
  NO_DEFECTO: kos01_Part0.jpg
  NO_DEFECTO: kos01_Part1.jpg
  NO_DEFECTO: kos01_Part2.jpg
  NO_DEFECTO: kos01_Part3.jpg
  NO_DEFECTO: kos01_Part4.jpg
  DEFECTO: kos01_Part5.jpg
  NO_DEFECTO: kos01_Part6.jpg
  NO_DEFECTO: kos01_Part7.jpg

Procesando kos02
  NO_DEFECTO: kos02_Part0.jpg
  NO_DEFECTO: kos02_Part1.jpg
  NO_DEFECTO: kos02_Part2.jpg
  NO_DEFECTO: kos02_Part3.jpg
  NO_DEFECTO: kos02_Part4.jpg
  NO_DEFECTO: kos02_Part5.jpg
  DEFECTO: kos02_Part6.jpg
  NO_DEFECTO: kos02_Part7.jpg

Procesando kos03
  NO_DEFECTO: kos03_Part0.jpg
  NO_DEFECTO: kos03_Part1.jpg
  DEFECTO: kos03_Part2.jpg
  NO_DEFECTO: kos03_Part3.jpg
  NO_DEFECTO: kos03_Part4.jpg
  NO_DEFECTO: kos03_Part5.jpg
  NO_DEFECTO: kos03_Part6.jpg
  NO_DEFECTO: kos03_Part7.jpg

Procesando kos04
  NO_DEFECTO: kos04_Part0.jpg
  NO_DEFECTO: kos04_Part1.jpg
  NO_DEFECTO: kos04_Part2.jpg
  DEFECTO: kos04_Part3.jpg
  NO_DEFECTO: kos04_Part4.jpg
  NO_DEFECTO: kos04_Part5.jpg
  NO_DEFECTO: kos04_Part6.jpg
  NO_DEFEC

In [5]:
from pathlib import Path
import shutil
import random

# =========================
# CONFIG
# =========================
# Input root: the defecto/ and no_defecto/ class folders live here.
SOURCE_DIR = Path("data_imagenes/clasificados")
# Output root: one sub-folder per split, each holding one sub-folder per class.
OUT_DIR = Path("dataset_cls")

# Fraction of each class assigned to every split.
SPLIT = {
    "train": 0.70,
    "val": 0.15,
    "test": 0.15,
}
SEED = 42

classes = ["defecto", "no_defecto"]

# Seed the global RNG of the `random` module.
random.seed(SEED)

# Sanity check: the split fractions must add up to 1 (within float tolerance).
split_total = sum(SPLIT.values())
if abs(split_total - 1.0) > 1e-9:
    raise ValueError("Los porcentajes de SPLIT deben sumar 1.0")

# Create every <split>/<class> output folder; existing ones are kept.
for split in SPLIT:
    split_root = OUT_DIR / split
    for cls in classes:
        split_root.joinpath(cls).mkdir(parents=True, exist_ok=True)

# =========================
# FUNCIÓN DE SPLIT
# =========================
def split_list(items, split_dict):
    """Partition ``items`` into consecutive train/val/test chunks.

    The train and val sizes are the floor of ``len(items)`` times the
    corresponding fraction. Test receives whatever is left, so the three
    chunks always cover every item exactly once. ``split_dict["test"]`` is
    not read.

    Args:
        items: Sequence to split (already shuffled by the caller if a random
            split is wanted); it is only sliced, never modified.
        split_dict: Mapping with at least the keys ``"train"`` and ``"val"``,
            each a fraction of the total.

    Returns:
        dict: ``{"train": [...], "val": [...], "test": [...]}`` with slices of
        ``items`` in their original order.
    """
    # Binary floating point can land a hair below the exact product
    # (90 * 0.7 evaluates to 62.99999999999999), which plain int() would
    # truncate one item short. The tiny nudge absorbs that rounding noise
    # while genuinely fractional products are still floored.
    eps = 1e-9

    n = len(items)
    n_train = int(n * split_dict["train"] + eps)
    n_val = int(n * split_dict["val"] + eps)
    # The remainder goes to test so the sizes always add up to n.
    n_test = n - n_train - n_val

    train_items = items[:n_train]
    val_items = items[n_train:n_train + n_val]
    test_items = items[n_train + n_val:]

    assert len(test_items) == n_test
    return {"train": train_items, "val": val_items, "test": test_items}

# =========================
# PROCESO POR CLASE (estratificado)
# =========================
# Split each class independently so every split keeps the class proportions.
for cls in classes:
    src_cls_dir = SOURCE_DIR / cls
    # Path.glob() yields files in arbitrary order. Sorting first means the
    # seeded shuffle below sees the same input order on every machine.
    imgs = sorted(src_cls_dir.glob("*.jpg"))

    if len(imgs) == 0:
        raise ValueError(f"No hay imágenes .jpg en {src_cls_dir}")

    random.shuffle(imgs)
    parts = split_list(imgs, SPLIT)

    for split_name, split_imgs in parts.items():
        dst_dir = OUT_DIR / split_name / cls

        # Clear copies left by a previous run. Otherwise a re-run with another
        # seed or ratio could leave one image in two splits (train/test leakage).
        for stale_path in dst_dir.glob("*.jpg"):
            stale_path.unlink()

        for img_path in split_imgs:
            shutil.copy(img_path, dst_dir / img_path.name)

    print(f"\nClase '{cls}':")
    for split_name in ["train", "val", "test"]:
        print(f"  {split_name}: {len(parts[split_name])} imágenes")

print("\n✅ Dataset creado en:", OUT_DIR.resolve())



Clase 'defecto':
  train: 36 imágenes
  val: 7 imágenes
  test: 9 imágenes

Clase 'no_defecto':
  train: 242 imágenes
  val: 52 imágenes
  test: 53 imágenes

✅ Dataset creado en: C:\Users\aneli\OneDrive\Documentos\GitHub\grupo1reto2\dataset_cls
