# Deepchecks step by step

### 1) Prepare repository structure

Deepchecks expects the classification data in a specific directory layout (one folder per split, containing one sub-folder per class), so the first step is to build that layout from the raw dataset.

In [35]:
import pandas as pd

In [11]:
# Installation
# !python -m pip install "deepchecks[vision]" pyyaml pillow --upgrade

from pathlib import Path
import os, shutil, yaml
from PIL import Image

# Paths: raw dataset locations (input) and classification layout (output)
BASE_TRAIN = Path("../data/raw/dataset_train_rgb")
BASE_TEST = Path("../data/raw/dataset_test_rgb")
IMAGES_TRAIN_DIR = BASE_TRAIN.joinpath("rgb", "train")
IMAGES_TEST_DIR = BASE_TEST.joinpath("rgb", "test")
TRAIN_YAML = BASE_TRAIN.joinpath("train.yaml")
TEST_YAML = BASE_TEST.joinpath("test.yaml")
CLS_ROOT = Path("cls_data")  # output directory

# Several fine-grained labels describe the same light colour, so they are
# grouped under one super-class. Each key is a super-class; its tuple lists
# the original labels that collapse into it.
_LABEL_GROUPS = {
    "Green": (
        "Green", "GreenLeft", "GreenRight",
        "GreenStraight", "GreenStraightLeft", "GreenStraightRight",
    ),
    "Red": (
        "Red", "RedLeft", "RedRight",
        "RedStraight", "RedStraightLeft",
    ),
    "Yellow": ("Yellow",),
    "off": ("off",),
}

# original label -> super-class
SUPERMAP = {
    fine_label: super_class
    for super_class, fine_labels in _LABEL_GROUPS.items()
    for fine_label in fine_labels
}
# The set of super-classes used as classification targets.
VALID_CLASSES = set(_LABEL_GROUPS)

def load_yaml_list(path: Path):
    """
    Load the annotation entries stored in a YAML file.

    Args:
        path: Location of the YAML file.

    Returns:
        Whatever ``yaml.safe_load`` parses from the file, or an empty list
        when the document is empty, so callers can always iterate over the
        result.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load yields None for an empty document; normalise it to an empty list.
    return [] if data is None else data

def resolve_img_path(raw_path: str, images_dir: Path, base_dir: Path):
    """
    Turn an image path taken from the annotation YAML into a resolved path.

    * Absolute path    -> only its file name is kept and looked up in ``images_dir``.
    * Relative path    -> ``base_dir / raw_path`` if that file exists;
                          otherwise fall back to the file name inside ``images_dir``.

    The returned path is resolved but is not guaranteed to exist.
    """
    annotated = Path(raw_path)
    if not annotated.is_absolute():
        relative_hit = (base_dir / annotated).resolve()
        if relative_hit.is_file():
            return relative_hit
    # Absolute path, or a relative path that was not found under base_dir.
    return (images_dir / annotated.name).resolve()

def sanitize_box(b, W, H):
    """
    Clean up one bounding-box annotation against an image of size ``W`` x ``H``.

    The corners are read from ``b["x_min"]``, ``b["y_min"]``, ``b["x_max"]`` and
    ``b["y_max"]``, reordered if min/max are swapped, and clipped to
    ``[0, W]`` / ``[0, H]``.

    Returns:
        ``(x1, y1, x2, y2)``, or ``None`` when the clipped box has no area.
    """
    def clamp(value, upper):
        # Keep `value` inside [0, upper].
        return max(0.0, min(value, upper))

    left, top = float(b["x_min"]), float(b["y_min"])
    right, bottom = float(b["x_max"]), float(b["y_max"])

    # Tolerate annotations whose min/max corners are swapped.
    if right < left:
        left, right = right, left
    if bottom < top:
        top, bottom = bottom, top

    left, right = clamp(left, W), clamp(right, W)
    top, bottom = clamp(top, H), clamp(bottom, H)

    if right <= left or bottom <= top:
        return None
    return left, top, right, bottom

def ensure_dir(d):
    """Create directory ``d`` together with any missing parents; an existing directory is left as is."""
    d.mkdir(exist_ok=True, parents=True)

def link_or_copy(src: Path, dst: Path):
    """
    Make ``src`` available at ``dst`` as cheaply as possible.

    Tries a hard link first, then a symbolic link, and finally falls back to
    a full copy. Does nothing when ``dst`` already exists.
    """
    if dst.exists():
        return

    # Cheapest strategies first: a hard link duplicates no bytes, a symlink is next best.
    for strategy in ("link", "symlink"):
        try:
            getattr(os, strategy)(src, dst)
        except Exception:
            continue
        return

    # Neither kind of link worked: copy the file (and its metadata).
    shutil.copy2(src, dst)

def build_classification_split(items, images_dir: Path, base_dir: Path, out_dir: Path, max_per_class=None):
    """
    Build a one-folder-per-class image tree from detection annotations.

    An image is kept only when it has EXACTLY one valid bounding box, i.e. one
    box whose label maps through ``SUPERMAP`` to a class in ``VALID_CLASSES``
    and that survives ``sanitize_box``. Kept images are copied to
    ``out_dir/<class>/<file name>``.

    Args:
        items: Iterable of annotation dicts with a ``"path"`` entry and an
            optional ``"boxes"`` list.
        images_dir: Directory searched by file name when the annotated path
            cannot be used directly (see ``resolve_img_path``).
        base_dir: Directory that relative annotation paths are resolved against.
        out_dir: Root of the generated class folders (created if missing).
        max_per_class: Optional cap on the number of images copied per class.

    Returns:
        ``(counts, kept, skipped)``: images copied per class, total copied,
        and total not copied. Every item is counted exactly once, so
        ``kept + skipped == len(items)``.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    counts = {c: 0 for c in VALID_CLASSES}
    kept, skipped = 0, 0

    for it in items:
        # Locate the image on disk.
        img_path = resolve_img_path(it["path"], images_dir, base_dir)
        if not img_path.is_file():
            skipped += 1
            continue

        # Read the image size. The conversion is kept so that files which fail
        # to decode are skipped (best effort); the context manager closes the
        # underlying file handle deterministically.
        try:
            with Image.open(img_path) as img:
                W, H = img.convert("RGB").size
        except Exception:
            skipped += 1
            continue

        # Collect the super-class of every valid (known label, non-degenerate) box.
        labels = []
        for b in (it.get("boxes") or []):
            superlab = SUPERMAP.get(str(b.get("label")))
            if superlab not in VALID_CLASSES:
                continue
            if sanitize_box(b, W, H) is None:
                continue
            labels.append(superlab)

        # Criterion: exactly one box -> a single unambiguous classification label.
        if len(labels) != 1:
            skipped += 1
            continue

        cls = labels[0]
        if max_per_class is not None and counts[cls] >= max_per_class:
            # Dropped because of the per-class cap: count it so the totals add up.
            skipped += 1
            continue

        cls_dir = out_dir / cls
        cls_dir.mkdir(parents=True, exist_ok=True)
        shutil.copy2(img_path, cls_dir / img_path.name)
        counts[cls] += 1
        kept += 1

    return counts, kept, skipped

# Load the annotations and build the class-folder layout for both splits.
y_train = load_yaml_list(TRAIN_YAML)
y_test = load_yaml_list(TEST_YAML)

train_counts, train_kept, train_skipped = build_classification_split(
    items=y_train,
    images_dir=IMAGES_TRAIN_DIR,
    base_dir=BASE_TRAIN,
    out_dir=CLS_ROOT / "train",
    max_per_class=None,
)
test_counts, test_kept, test_skipped = build_classification_split(
    items=y_test,
    images_dir=IMAGES_TEST_DIR,
    base_dir=BASE_TEST,
    out_dir=CLS_ROOT / "test",
    max_per_class=None,
)

### 2) Explore file structure output

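Step 1 kept only the images with exactly one valid bounding box. The counts below are computed from the raw annotations (before label grouping and box clean-up) and show how many images in each split have zero, one, or multiple boxes, which gives a rough idea of how many single-box images remain.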

In [12]:
from collections import Counter
def box_count_stats(items):
    """
    Count annotation entries by how many raw boxes they carry.

    Args:
        items: Iterable of annotation dicts with an optional ``"boxes"`` list.

    Returns:
        A ``Counter`` with the keys ``'zero_boxes'``, ``'one_box'`` and
        ``'multi_boxes'`` (only the keys that actually occur are present).
    """
    tally = Counter()
    for item in items:
        n_boxes = len(item.get("boxes") or [])
        if n_boxes == 0:
            tally['zero_boxes'] += 1
        elif n_boxes == 1:
            tally['one_box'] += 1
        else:
            tally['multi_boxes'] += 1
    return tally


# Same report for both splits, without duplicating the counting loop.
for split_name, split_items in (("TRAIN", y_train), ("TEST", y_test)):
    stats = box_count_stats(split_items)
    print(f"{split_name} stats:", stats)


TRAIN stats: Counter({'multi_boxes': 2789, 'zero_boxes': 1940, 'one_box': 364})
TEST stats: Counter({'multi_boxes': 5253, 'one_box': 1894, 'zero_boxes': 1187})


### 3) Deepchecks validation

In [20]:
# Installation
#!python -m pip uninstall -y torchvision torch
#!python -m pip cache purge
#!python -m pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1

from deepchecks.vision import classification_dataset_from_directory
from deepchecks.vision.suites import train_test_validation

# Folder produced in step 1; it holds the "train" and "test" sub-folders.
ROOT = "cls_data"
report_file = 'output_deepchecks_cls.html'

# Wrap both splits as deepchecks VisionData objects (images are read as .png files).
train_ds, test_ds = classification_dataset_from_directory(
    root=ROOT,
    object_type='VisionData',
    image_extension='png',
)

# Run the train/test validation suite and store the report as a standalone HTML file.
suite = train_test_validation()
result = suite.run(train_ds, test_ds)
result.save_as_html(report_file, as_widget=False, requirejs=False)
print(f"OK -> {report_file}")


'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.



  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]
  import pynvml  # type: ignore[import]


OK -> output_deepchecks_cls.html


In [34]:
from IPython.display import IFrame

import webbrowser, os
# Build a proper file URI (percent-encoded, also valid on Windows) instead of
# concatenating "file://" with a raw filesystem path.
webbrowser.open(Path("output_deepchecks_cls.html").resolve().as_uri())


True