In [25]:
from ultralytics import YOLO

import os
import random
from pathlib import Path
import sys
import time
import gc
import math
import cv2
from typing import List, Optional, Tuple, Dict
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from PIL import Image
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torchvision import transforms
import timm
from tqdm import tqdm

In [26]:
class CFG:
    train_dir  = '../data/hecto/train'
    test_dir = '../data/hecto/test'
    submission_csv    = '../data/hecto/sample_submission.csv'
    test_meta      = '../data/hecto/test.csv'
    name = "efficientnet_b0.ra_in1k"
    epochs = 2
    batch_size = 32
    num_workers = 4             
    seed = 42                       
    optimizer = "AdamW"
    lr = 2.0e-3                     
    weight_decay = 1.0e-4
    scheduler = "CosineAnnealingLR"
    eta_min = 1.0e-6
    device = "cuda"
    n_fold = 4
    selected_folds = [0,1,2,3]
    T_max = 50
    target_shape = [224, 224]
    mixup_alpha = 0.5
    in_channels = 3
    pretrained = True

cfg = CFG()

models_dir = Path("../models")
num_classes = pd.read_csv(
   cfg.submission_csv
).columns[1:].shape[0]

In [27]:
num_classes

396

In [28]:
# ─── 1) Pad → Denoise helper ────────────────────────────────────────────────
def pad_denoise_mask(pil_img: Image.Image, patch_size: int, denoise_h: int = 10):
    """
    Given a PIL image of arbitrary size (e.g., 553×402), pad its dimensions 
    up to the next multiple of patch_size, apply denoising, and return:
      - img_masked: a NumPy array of shape (h_pad, w_pad, 3)
      - (h_pad, w_pad): the padded height and width
      - (orig_w, orig_h): the original width and height
      - (x1, y1, x2, y2): coordinates of the valid (original) region
        (always (0, 0, orig_w, orig_h))

    Steps:
      1. Convert PIL → NumPy.
      2. Compute pad so that both h_pad and w_pad are multiples of patch_size.
      3. Pad with zeros on right/bottom.
      4. Run OpenCV colored denoising (fastNlMeansDenoisingColored).
      5. Apply a “mask” that retains exactly the original image region 
         and zeroes everything in the padded margins.

    Returns:
        img_masked: np.ndarray of dtype uint8, shape (h_pad, w_pad, 3)
        (h_pad, w_pad): padded dims
        (orig_w, orig_h): original dims
        (x1, y1, x2, y2): coordinates of the valid region
    """
    img_np = np.array(pil_img)
    orig_h, orig_w = img_np.shape[:2]
    ps = patch_size

    # (a) Compute padded dimensions
    w_pad = math.ceil(orig_w / ps) * ps
    h_pad = math.ceil(orig_h / ps) * ps
    pad_right  = w_pad - orig_w
    pad_bottom = h_pad - orig_h

    # (b) Pad with zeros on right/bottom
    img_padded = cv2.copyMakeBorder(
        img_np,
        top=0,
        bottom=pad_bottom,
        left=0,
        right=pad_right,
        borderType=cv2.BORDER_CONSTANT,
        value=[0, 0, 0]
    )

    # (c) Denoise (OpenCV expects BGR)
    bgr           = cv2.cvtColor(img_padded, cv2.COLOR_RGB2BGR)
    denoised_bgr  = cv2.fastNlMeansDenoisingColored(
                        bgr, None, denoise_h, denoise_h, 7, 21
                    )
    img_denoised  = cv2.cvtColor(denoised_bgr, cv2.COLOR_BGR2RGB)

    # (d) Build a mask that keeps only the original region
    x1, y1, x2, y2 = 0, 0, orig_w, orig_h
    mask2d = np.zeros((h_pad, w_pad), dtype=np.uint8)
    mask2d[y1:y2, x1:x2] = 1
    mask3c = np.stack([mask2d] * 3, axis=2)  # shape (h_pad, w_pad, 3)

    img_masked = img_denoised * mask3c

    return img_masked, (h_pad, w_pad), (orig_w, orig_h), (x1, y1, x2, y2)


# ─── 2) Stage 1 Dataset: one center‐crop per image ────────────────────────────
class Stage1Dataset(Dataset):
    """
    Stage 1: For each image, load it, resize so that the shorter side ≥ target_size,
    then take a single center‐crop of size `target_size` (e.g. 224×224). 
    __getitem__ returns (tensor, label_int).

    Args:
      image_paths: List[str] of full JPG/PNG paths.
      labels:      List[int] of corresponding integer labels.
      target_size: Tuple (tw, th), e.g. (224, 224).
      transform:   torchvision transforms to apply after cropping
                   (e.g., ToTensor + Normalize).
    """
    def __init__(
        self,
        image_paths: List[str],
        labels: List[int],
        target_size: Tuple[int, int],
        transform=None
    ):
        assert len(image_paths) == len(labels)
        self.paths       = image_paths
        self.labels      = labels
        self.target_size = target_size
        self.transform   = transform

    def __len__(self) -> int:
        return len(self.paths)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        path  = self.paths[idx]
        label = self.labels[idx]
        pil   = Image.open(path).convert("RGB")

        # 1) Resize so that both dimensions ≥ max(target_size)
        tw, th = self.target_size
        s = max(tw, th)
        pil = pil.resize((s, s))

        # 2) Center‐crop to (tw, th)
        w, h = pil.size
        x0 = (w - tw) // 2
        y0 = (h - th) // 2
        crop = pil.crop((x0, y0, x0 + tw, y0 + th))

        # 3) Apply transforms (ToTensor + Normalize, etc.)
        if self.transform:
            crop = self.transform(crop)

        return crop, label


# ─── 3) Stage 2 Dataset: cache padded+denoised in RAM, sample random patches ───
class Stage2Dataset(Dataset):
    """
    Stage 2: For each image, do pad+denoise exactly once (per DataLoader worker), cache 
    the result as a NumPy array, then repeatedly sample `num_patches_per_image` random 
    patches of shape (patch_size × patch_size). __getitem__ returns 
    (patch_tensor, one_hot_label).

    Note:
      - Each worker has its own `self.cache` dict, so no cross-worker locking is required.
      - For an image of arbitrary original size (e.g. 553×402), pad_denoise_mask
        will pad up to multiples of patch_size (e.g. 672×672 if patch_size = 224).
    """
    def __init__(
        self,
        items: List[Tuple[str, int]],    # List of (jpg_path, label_int)
        patch_size: int = 224,
        num_patches_per_image: int = 4,
        denoise_h: int = 10,
        transform=None
    ):
        super().__init__()
        self.items       = items
        self.patch_size  = patch_size
        self.num_patches = num_patches_per_image
        self.denoise_h   = denoise_h
        self.transform   = transform

        # Cache: jpg_path -> (denoised_np (h_pad, w_pad, 3), h_pad, w_pad)
        self.cache: Dict[str, Tuple[np.ndarray, int, int]] = {}

    def __len__(self) -> int:
        # total # of patches across all images = len(items) * num_patches_per_image
        return len(self.items) * self.num_patches

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        img_idx = idx // self.num_patches
        jpg_path, label = self.items[img_idx]

        # If not cached yet, load + pad+denoise + cache
        if jpg_path not in self.cache:
            pil = Image.open(jpg_path).convert("RGB")
            denoised_np, (h_pad, w_pad), (_, _), (_, _, _, _) = pad_denoise_mask(
                pil,
                self.patch_size,
                denoise_h=self.denoise_h
            )
            self.cache[jpg_path] = (denoised_np, h_pad, w_pad)
        else:
            denoised_np, h_pad, w_pad = self.cache[jpg_path]

        # Sample one random patch of size patch_size × patch_size
        x0 = random.randint(0, w_pad - self.patch_size)
        y0 = random.randint(0, h_pad - self.patch_size)
        patch_np = denoised_np[y0 : y0 + self.patch_size, x0 : x0 + self.patch_size, :]

        patch_pil = Image.fromarray(patch_np)
        if self.transform:
            patch_tensor = self.transform(patch_pil)
        else:
            patch_tensor = transforms.ToTensor()(patch_pil)

        # One-hot label for BCEWithLogitsLoss
        label_onehot = torch.zeros(num_classes, dtype=torch.float32)
        label_onehot[label] = 1.0

        return patch_tensor, label_onehot

In [29]:
def compute_class_weights(labels: List[int], num_classes: int) -> List[float]:
    """
    Given a list of integer labels (0 to num_classes-1), compute per-sample weights 
    inversely proportional to each class’s frequency. Returns a list of weights aligned 
    with the input `labels` list.
    """
    # Count how many examples belong to each class
    counts = np.bincount(labels, minlength=num_classes)
    # For any class with zero examples, set count to 1 to avoid division by zero
    counts = np.where(counts == 0, 1, counts)
    # Inverse frequency for each class
    class_weights = 1.0 / counts  # array of length num_classes
    # Assign each sample the weight corresponding to its label
    sample_weights = [class_weights[label] for label in labels]
    return sample_weights


class FullImageLoader:
    """
    Scans a training directory with subfolders per class and builds:
      - self.paths:  List[str] of full image paths
      - self.labels: List[int] of integer labels (index into `self.classes`)
      - self.classes: sorted List[str] of class names (folder names)
      - self.class_to_idx: Dict[str, int] mapping class name → index
      - self.idx_to_class: Dict[int, str] mapping index → class name

    Expects:
        train_dir/
            classA/
                img1.jpg
                img2.png
                ...
            classB/
                img3.jpg
                ...
            ...
    """
    def __init__(self, train_dir: str):
        self.train_dir = Path(train_dir)
        # List all immediate subdirectories (each represents one class), sorted
        self.classes = sorted([d.name for d in self.train_dir.iterdir() if d.is_dir()])
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}
        self.idx_to_class = {idx: cls_name for cls_name, idx in self.class_to_idx.items()}

        self.paths: List[str] = []
        self.labels: List[int] = []

        for cls_name in self.classes:
            cls_idx = self.class_to_idx[cls_name]
            cls_folder = self.train_dir / cls_name
            # Iterate over files in this class folder in sorted order
            for img_file in sorted(cls_folder.iterdir()):
                if img_file.suffix.lower() not in {".jpg", ".jpeg", ".png"}:
                    continue
                self.paths.append(str(img_file))
                self.labels.append(cls_idx)

        # Sanity check
        assert len(self.paths) == len(self.labels), "Image-paths and labels lengths mismatch"

class PatchInferenceDataset(Dataset):
    """
    For inference on test images: extract a grid of possibly overlapping patches from each image.
    __getitem__ returns (patch_batch, filename), where:
      - patch_batch: torch.Tensor of shape [num_patches, 3, patch_size, patch_size]
      - filename:    the base filename (e.g., "abc.jpg")

    Args:
        test_image_paths: List[str] of full paths to test images.
        patch_size:       side length of each square patch (e.g. 255).
        stride:           pixel step between adjacent patches (e.g. 240 for 15-pixel overlap).
        transform:        torchvision transforms to apply to each patch 
                          (e.g. ToTensor + Normalize).
    """
    def __init__(
        self,
        test_image_paths: List[str],
        patch_size: int = 255,
        stride: int = 240,
        transform: Optional[transforms.Compose] = None,
    ):
        self.test_paths = test_image_paths
        self.patch_size = patch_size
        self.stride     = stride
        self.transform  = transform

    def __len__(self) -> int:
        return len(self.test_paths)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, str]:
        img_path = self.test_paths[idx]
        pil_img  = Image.open(img_path).convert("RGB")
        w, h     = pil_img.size  # e.g. 1024×1024 or arbitrary

        patches: List[torch.Tensor] = []
        ps = self.patch_size
        st = self.stride

        # 1) Slide window top-to-bottom, left-to-right
        for y in range(0, h - ps + 1, st):
            for x in range(0, w - ps + 1, st):
                patch = pil_img.crop((x, y, x + ps, y + ps))
                if self.transform:
                    patch = self.transform(patch)
                patches.append(patch)

        # 2) Handle bottom row if not exactly divisible by stride
        if (h - ps) % st != 0:
            y0 = h - ps
            for x in range(0, w - ps + 1, st):
                patch = pil_img.crop((x, y0, x + ps, y0 + ps))
                if self.transform:
                    patch = self.transform(patch)
                patches.append(patch)

        # 3) Handle right column if not exactly divisible by stride
        if (w - ps) % st != 0:
            x0 = w - ps
            for y in range(0, h - ps + 1, st):
                patch = pil_img.crop((x0, y, x0 + ps, y + ps))
                if self.transform:
                    patch = self.transform(patch)
                patches.append(patch)

        # 4) Handle bottom-right corner if both dims had remainders
        if (h - ps) % st != 0 and (w - ps) % st != 0:
            patch = pil_img.crop((w - ps, h - ps, w, h))
            if self.transform:
                patch = self.transform(patch)
            patches.append(patch)

        # Stack all patches into a single tensor [num_patches, 3, ps, ps]
        patch_batch = torch.stack(patches, dim=0)
        filename    = os.path.basename(img_path)
        return patch_batch, filename


def calculate_log_loss(targets: np.ndarray, outputs: np.ndarray) -> float:
    """
    Compute multi-class log loss (cross-entropy) given:
      - targets: np.ndarray of shape [N], each an integer in [0..C-1]
      - outputs: np.ndarray of shape [N, C], each row a predicted probability vector (summing to 1)

    Steps:
      1. Clip outputs into [eps, 1-eps]
      2. Re-normalize so each row sums to 1
      3. Use sklearn’s log_loss with labels=[0..C-1]

    Returns:
        float: the overall log-loss.
    """
    num_classes = outputs.shape[1]
    eps = 1e-15

    clipped = np.clip(outputs, eps, 1 - eps)
    clipped = clipped / clipped.sum(axis=1, keepdims=True)

    return float(log_loss(targets, clipped, labels=list(range(num_classes))))


In [30]:
# ----------------------------------------------------------------------------
# Fix random seeds and set device
# ----------------------------------------------------------------------------
seed = cfg.seed
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device(cfg.device if torch.cuda.is_available() else "cpu")


# ----------------------------------------------------------------------------
# Model definition with soft‐label BCE and optional MixUp
# ----------------------------------------------------------------------------
class Hecto_EFFICIENTNET(nn.Module):
    def __init__(
        self,
        model_name: str,
        in_chans: int,
        num_classes: int,
        pretrained: bool = True,
        mixup_alpha: float = 0.0,
    ):
        super().__init__()
        self.num_classes   = num_classes
        self.mixup_alpha   = mixup_alpha
        self.mixup_enabled = mixup_alpha > 0.0

        self.backbone = timm.create_model(
            model_name,
            pretrained=pretrained,
            in_chans=in_chans,
            num_classes=num_classes,
        )

    def forward(self, x: torch.Tensor, targets: torch.Tensor = None):
        # If training and mixup is enabled, perform MixUp
        if self.training and self.mixup_enabled and targets is not None:
            lam = np.random.beta(self.mixup_alpha, self.mixup_alpha)
            idx = torch.randperm(x.size(0), device=x.device)
            x_mixed = lam * x + (1 - lam) * x[idx]
            y_a, y_b = targets, targets[idx]
            logits = self.backbone(x_mixed)
            loss = (
                lam * F.binary_cross_entropy_with_logits(logits, y_a)
                + (1 - lam) * F.binary_cross_entropy_with_logits(logits, y_b)
            )
            return logits, loss

        # Otherwise, just forward normally
        logits = self.backbone(x)
        return logits


# ----------------------------------------------------------------------------
# Optimizer, scheduler, criterion factories
# ----------------------------------------------------------------------------
def get_optimizer(model: nn.Module, cfg: CFG) -> optim.Optimizer:
    opt = cfg.optimizer
    lr  = cfg.lr
    wd  = cfg.weight_decay

    if opt == "Adam":
        return optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    if opt == "AdamW":
        return optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    if opt == "SGD":
        return optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=wd)

    raise ValueError(f"Unsupported optimizer: {opt}")


def get_scheduler(optimizer: optim.Optimizer, cfg: CFG):
    sch = cfg.scheduler

    if sch == "CosineAnnealingLR":
        return lr_scheduler.CosineAnnealingLR(
            optimizer,
            T_max=cfg.T_max,
            eta_min=cfg.eta_min,
        )
    if sch == "ReduceLROnPlateau":
        return lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode="min",
            factor=0.5,
            patience=2,
            min_lr=cfg.eta_min,
            verbose=True,
        )
    if sch == "StepLR":
        return lr_scheduler.StepLR(
            optimizer,
            step_size=max(1, cfg.epochs // 3),
            gamma=0.5,
        )

    return None


def get_criterion(cfg: CFG):
    return nn.BCEWithLogitsLoss()


# ----------------------------------------------------------------------------
# Full‐image validation dataset (patchify & average)
# ----------------------------------------------------------------------------
class FullValDataset(Dataset):
    """
    For validation: given a list of (image_path, label), run pad→denoise→mask,
    then extract all non‐overlapping patches of size `patch_size` and stack them.
    __getitem__ returns (patch_tensor, label_int), where patch_tensor has shape
    [num_patches, 3, patch_size, patch_size].
    """
    def __init__(self, val_items: List[Tuple[str, int]], patch_size: int, transform=None):
        """
        Args:
            val_items:  List of (file_path, label_int).
            patch_size: side length of square patch (e.g., 224).
            transform:  torchvision transforms to apply to each patch.
        """
        self.items      = val_items
        self.patch_size = patch_size
        self.transform  = transform

    def __len__(self) -> int:
        return len(self.items)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        path, label = self.items[idx]
        pil = Image.open(path).convert("RGB")

        # Pad and denoise, then apply full‐image mask
        img_masked_np, (h_pad, w_pad), (_, _), (_, _, _, _) = pad_denoise_mask(
            pil, self.patch_size, denoise_h=10
        )

        # Extract all non‐overlapping patches
        patches = []
        for top in range(0, h_pad, self.patch_size):
            for left in range(0, w_pad, self.patch_size):
                patch_np = img_masked_np[top : top + self.patch_size, left : left + self.patch_size]
                p = Image.fromarray(patch_np)
                if self.transform:
                    p = self.transform(p)
                patches.append(p)

        patch_tensor = torch.stack(patches, dim=0)  # [num_patches, 3, patch_size, patch_size]
        return patch_tensor, label


In [31]:
def run_training():
    # ─── Prepare full loader and precompute all patches ───────────────────────
    full_loader = FullImageLoader(cfg.train_dir)
    labels_arr  = np.array(full_loader.labels)

    patch_size = cfg.target_shape[0]
    precomputed_patches: Dict[str, List[np.ndarray]] = {}
    print("Precomputing padded+denoised patches for every training image ...")
    for img_path in full_loader.paths:
        pil = Image.open(img_path).convert("RGB")
        denoised_np, (h_pad, w_pad), (_, _), (_, _, _, _) = pad_denoise_mask(
            pil, patch_size, denoise_h=10
        )
        patches = []
        for top in range(0, h_pad, patch_size):
            for left in range(0, w_pad, patch_size):
                patch_np = denoised_np[top : top + patch_size, left : left + patch_size, :]
                patches.append(patch_np)
        precomputed_patches[img_path] = patches
    total_patches = sum(len(v) for v in precomputed_patches.values())
    print(f"→ Done. {len(full_loader.paths)} images → {total_patches} patches")

    # ─── Cross‐validation ────────────────────────────────────────────────────
    skf = StratifiedKFold(n_splits=cfg.n_fold, shuffle=True, random_state=cfg.seed)
    best_scores = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(np.zeros(len(labels_arr)), labels_arr)):
        if fold not in cfg.selected_folds:
            continue

        print(f"\n===== Fold {fold} =====")
        ckpt_path = models_dir / f"{cfg.name}_fold{fold}_best.pth"

        # ─── Instantiate model, optimizer, scheduler, criterion ─────────────
        model     = Hecto_EFFICIENTNET(
                        cfg.name,
                        cfg.in_channels,
                        num_classes,
                        pretrained=cfg.pretrained,
                        mixup_alpha=cfg.mixup_alpha
                    ).to(device)
        optimizer = get_optimizer(model, cfg)
        scheduler = get_scheduler(optimizer, cfg)
        criterion = get_criterion(cfg)

        start_epoch = 0
        if ckpt_path.exists() and not cfg.pretrained:
            state = torch.load(ckpt_path, map_location=device)
            model.load_state_dict(state["model_state_dict"])
            optimizer.load_state_dict(state["optimizer_state_dict"])
            if state.get("scheduler_state_dict") is not None and scheduler:
                scheduler.load_state_dict(state["scheduler_state_dict"])
            start_epoch = state.get("epoch", 0)
            print(f"Resumed fold {fold} at epoch {start_epoch}")

        # ─── Build train_items & val_items ───────────────────────────────────
        train_items = [(full_loader.paths[i], full_loader.labels[i]) for i in tr_idx]
        val_items   = [(full_loader.paths[i], full_loader.labels[i]) for i in va_idx]

        # ─── STAGE 1: One center‐crop per image ──────────────────────────────
        stage1_ds = Stage1Dataset(
            image_paths = [p for (p, _) in train_items],
            labels      = [lbl for (_, lbl) in train_items],
            target_size = tuple(cfg.target_shape),
            transform   = transforms.Compose([
                              transforms.ToTensor(),
                              transforms.Normalize(mean=[0.485,0.456,0.406],
                                                   std =[0.229,0.224,0.225]),
                          ])
        )
        stage1_loader = DataLoader(
            stage1_ds,
            batch_size  = cfg.batch_size,
            shuffle     = True,
            num_workers = cfg.num_workers,
            pin_memory  = True,
        )

        N1 = 1
        best_logloss_stage1 = float("inf")

        for e1 in range(N1):
            print(f"\n>>> Fold {fold} | Stage 1 Epoch {e1+1}/{N1} <<<")
            model.train()
            total_loss1 = 0.0
            all_targets1, all_probs1 = [], []

            for imgs, labels_int in tqdm(stage1_loader, desc="Stage1 Training"):
                imgs     = imgs.to(device)
                labels_i = labels_int.to(device)
                one_hot  = F.one_hot(labels_i, num_classes).float()

                optimizer.zero_grad()
                out = model(imgs, one_hot)
                if isinstance(out, tuple):
                    logits1, loss1 = out
                else:
                    logits1 = out
                    loss1   = criterion(logits1, one_hot)

                loss1.backward()
                optimizer.step()
                if isinstance(scheduler, lr_scheduler.OneCycleLR):
                    scheduler.step()

                total_loss1 += loss1.item() * imgs.size(0)
                probs1 = torch.sigmoid(logits1).detach().cpu().numpy()
                all_probs1.append(probs1)
                all_targets1.append(labels_i.cpu().numpy())

            avg_loss1    = total_loss1 / len(stage1_loader.dataset)
            all_probs1   = np.vstack(all_probs1)
            all_targets1 = np.concatenate(all_targets1)
            eps    = 1e-15
            clipped1 = np.clip(all_probs1, eps, 1 - eps)
            clipped1 = clipped1 / clipped1.sum(axis=1, keepdims=True)
            val_logloss1 = log_loss(all_targets1, clipped1, labels=list(range(num_classes)))
            print(f"    Stage1: Train Loss {avg_loss1:.4f} | LogLoss {val_logloss1:.4f}")

            if val_logloss1 < best_logloss_stage1:
                best_logloss_stage1 = val_logloss1
                torch.save(
                    model.state_dict(),
                    models_dir / f"{cfg.name}_fold{fold}_stage1.pth"
                )

            if scheduler and not isinstance(scheduler, lr_scheduler.OneCycleLR):
                if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(val_logloss1)
                else:
                    scheduler.step()

        torch.cuda.empty_cache()
        gc.collect()

        # ─── STAGE 2: use precomputed patches ────────────────────────────────
        stage2_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std =[0.229, 0.224, 0.225]),
        ])
        stage2_ds = Stage2Dataset(
            items       = train_items,
            precomputed = precomputed_patches,
            transform   = stage2_transform,
        )

        patch_labels = [lbl for (_, _, lbl) in stage2_ds.index_map]
        patch_weights = compute_class_weights(patch_labels, num_classes)

        sampler2 = WeightedRandomSampler(
            weights     = patch_weights,
            num_samples = len(patch_weights),
            replacement = False
        )
        stage2_loader = DataLoader(
            stage2_ds,
            batch_size  = cfg.batch_size,
            sampler     = sampler2,
            num_workers = cfg.num_workers,
            pin_memory  = True,
        )

        N2 = cfg.epochs - N1
        best_logloss_stage2 = float("inf")

        for epoch2 in range(N2):
            print(f"\n>>> Fold {fold} | Stage 2 Epoch {epoch2+1}/{N2} <<<")
            model.train()
            total_loss2 = 0.0
            all_targets2, all_probs2 = [], []

            for patches, labels_onehot in tqdm(stage2_loader, desc="Stage2 Training"):
                patches  = patches.to(device)
                labels_oh = labels_onehot.to(device)

                optimizer.zero_grad()
                out2 = model(patches, labels_oh)
                if isinstance(out2, tuple):
                    logits2, loss2 = out2
                else:
                    logits2 = out2
                    loss2   = criterion(logits2, labels_oh)

                loss2.backward()
                optimizer.step()
                if isinstance(scheduler, lr_scheduler.OneCycleLR):
                    scheduler.step()

                total_loss2 += loss2.item() * patches.size(0)
                probs2 = torch.sigmoid(logits2).detach().cpu().numpy()
                all_probs2.append(probs2)
                labels_int2 = torch.argmax(labels_oh, dim=1).cpu().numpy()
                all_targets2.append(labels_int2)

            avg_loss2   = total_loss2 / len(stage2_loader.dataset)
            all_probs2  = np.vstack(all_probs2)
            all_targets2 = np.concatenate(all_targets2)

            num_images_fold = len(train_items)
            all_probs2 = all_probs2.reshape(num_images_fold, -1, num_classes)
            avg_probs_img = all_probs2.mean(axis=1)

            eps     = 1e-15
            clipped2 = np.clip(avg_probs_img, eps, 1 - eps)
            clipped2 = clipped2 / clipped2.sum(axis=1, keepdims=True)
            labels_img = np.array([lbl for (_, lbl) in train_items])
            val_logloss2 = log_loss(labels_img, clipped2, labels=list(range(num_classes)))
            print(f"    Stage2: Train Loss {avg_loss2:.4f} | Image‐level LogLoss {val_logloss2:.4f}")

            if val_logloss2 < best_logloss_stage2:
                best_logloss_stage2 = val_logloss2
                torch.save({
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict() if scheduler else None,
                    "epoch": N1 + epoch2 + 1,
                    "val_logloss": val_logloss2
                }, ckpt_path)
                print(f"      → Saved new best Stage2 checkpoint (LogLoss: {best_logloss_stage2:.4f})")

            if scheduler and not isinstance(scheduler, lr_scheduler.OneCycleLR):
                if isinstance(scheduler, lr_scheduler.ReduceLROnPlateau):
                    scheduler.step(val_logloss2)
                else:
                    scheduler.step()

        best_scores.append(best_logloss_stage2)

        # ─── Fold‐level validation ─────────────────────────────────────────────
        val_ds = FullValDataset(
            val_items  = val_items,
            patch_size = cfg.target_shape[0],
            transform  = transforms.Compose([
                            transforms.ToTensor(),
                            transforms.Normalize(mean=[0.485,0.456,0.406],
                                                 std =[0.229,0.224,0.225]),
                        ])
        )
        val_loader = DataLoader(
            val_ds,
            batch_size   = 1,
            shuffle      = False,
            num_workers  = cfg.num_workers,
            pin_memory   = True
        )

        model.eval()
        fold_preds, fold_labels = [], []
        with torch.no_grad():
            for patch_batch, label_int in tqdm(val_loader, desc="Fold Validation"):
                patches = patch_batch.squeeze(0)
                all_logits = []
                for i in range(0, patches.size(0), cfg.batch_size):
                    chunk = patches[i : i + cfg.batch_size].to(device)
                    logits_chunk = model(chunk)
                    all_logits.append(logits_chunk.cpu())
                all_logits = torch.cat(all_logits, dim=0)
                avg_probs = torch.sigmoid(all_logits).mean(dim=0)
                fold_preds.append(avg_probs.numpy())
                fold_labels.append(label_int.item())

        fold_preds   = np.vstack(fold_preds)
        fold_labels  = np.array(fold_labels)
        eps          = 1e-15
        clipped_val  = np.clip(fold_preds, eps, 1 - eps)
        clipped_val  = clipped_val / clipped_val.sum(axis=1, keepdims=True)
        fold_logloss = log_loss(fold_labels, clipped_val, labels=list(range(num_classes)))
        print(f"Fold {fold} final LogLoss: {fold_logloss:.4f}")

    # ─── Cross‐validation summary ────────────────────────────────────────────
    print("\n===== CV Results =====")
    for f, score in zip(cfg.selected_folds, best_scores):
        print(f"Fold {f}: {score:.4f}")
    print(f"Mean LogLoss: {np.mean(best_scores):.4f}")

In [32]:
import time
from pathlib import Path

import numpy as np
from PIL import Image
import torch
from ultralytics import YOLO

# ─── STEP 0: Load YOLO model
yolo_weights_path = "../models/yolo11n.pt"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

try:
    model = YOLO(yolo_weights_path)
    model.to(device).eval()
    print("✅ YOLO11n successfully loaded via `YOLO(...)` interface\n")
except Exception as e:
    raise RuntimeError(f"❌ Could not load YOLO11n weights:\n  {e}")

# ─── STEP 1.1: Warm‐up CUDA (avoids a large one‐time allocation crash)
with torch.no_grad():
    dummy = torch.zeros((1, 3, 640, 640), device=device, dtype=torch.float32)
    _ = model(dummy)[0]
del dummy
print("🚀 CUDA kernels warmed up with a dummy forward‐pass\n")

# ─── STEP 2: Set up FullImageLoader, thresholds, and parameters
full_loader   = FullImageLoader(cfg.train_dir)
conf_thresh   = 0.25
iou_thresh    = 0.45
img_size_inf  = 640   # YOLO inference size (letterbox to 640×640)
patch_size    = 224   # for b0 variant

print("Starting precompute for b0 (224×224), two stages\n")

variant      = "b0"
start_all    = time.time()

# ─── Prepare output folders under /data/hecto/precomputed_b0
out_base     = Path("/data") / "hecto" / f"precomputed_{variant}"
stage1_base  = out_base / "stage1_masked_resized"
stage2_base  = out_base / "stage2_masked_patchify"
stage1_base.mkdir(parents=True, exist_ok=True)
stage2_base.mkdir(parents=True, exist_ok=True)

total_images = len(full_loader.paths)
print(f"=== Precomputing variant {variant} (patch_size = {patch_size}) ===")

for idx, img_path in enumerate(full_loader.paths, start=1):
    if idx % 100 == 1:
        elapsed = time.time() - start_all
        print(f"  [{variant}] Image {idx}/{total_images} — elapsed {elapsed:.1f}s")

    rel      = Path(img_path).relative_to(cfg.train_dir)
    cls_name = rel.parent.name
    stem     = rel.stem

    (stage1_base / cls_name).mkdir(exist_ok=True, parents=True)
    (stage2_base / cls_name).mkdir(exist_ok=True, parents=True)

    pil = Image.open(img_path).convert("RGB")
    img0 = np.array(pil)

    results = model(pil, imgsz=img_size_inf)
    dets    = results[0].boxes

    boxes_xyxy = dets.xyxy.cpu().numpy()
    confs      = dets.conf.cpu().numpy()
    cls_idxs   = dets.cls.cpu().numpy()

    car_mask = (cls_idxs == 2) & (confs >= conf_thresh)
    if car_mask.sum() == 0:
        masked = pil
    else:
        car_boxes = boxes_xyxy[car_mask]
        areas     = (car_boxes[:, 2] - car_boxes[:, 0]) * (car_boxes[:, 3] - car_boxes[:, 1])
        best_i    = int(np.argmax(areas))
        x1, y1, x2, y2 = car_boxes[best_i].astype(int)

        H, W, _ = img0.shape
        x1 = max(0, min(x1, W - 1))
        y1 = max(0, min(y1, H - 1))
        x2 = max(0, min(x2, W))
        y2 = max(0, min(y2, H))

        masked   = Image.new("RGB", pil.size, (0, 0, 0))
        car_crop = pil.crop((x1, y1, x2, y2))
        masked.paste(car_crop, (x1, y1))

    resized_masked = masked.resize((patch_size, patch_size))
    arr_s1         = np.array(resized_masked, dtype=np.uint8)
    np.save(str(stage1_base / cls_name / f"{stem}.npy"), arr_s1)

    denoised_np, (h_pad, w_pad), (_, _), (_, _, _, _) = pad_denoise_mask(
        masked, patch_size, denoise_h=10
    )
    patches_m = []
    for top in range(0, h_pad, patch_size):
        for left in range(0, w_pad, patch_size):
            tile = denoised_np[top : top + patch_size, left : left + patch_size, :]
            patches_m.append(tile)
    arr_s2 = np.stack(patches_m, axis=0).astype(np.uint8)
    np.save(str(stage2_base / cls_name / f"{stem}.npy"), arr_s2)

elapsed_total = time.time() - start_all
print(f"\n→ Finished variant {variant} in {elapsed_total:.1f}s, saved under {out_base}")


✅ YOLO11n successfully loaded via `YOLO(...)` interface


0: 640x640 (no detections), 2.5ms
Speed: 0.0ms preprocess, 2.5ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)
🚀 CUDA kernels warmed up with a dummy forward‐pass

Starting precompute for b0 (224×224), two stages

=== Precomputing variant b0 (patch_size = 224) ===
  [b0] Image 1/33137 — elapsed 0.0s

0: 352x640 1 car, 20.4ms
Speed: 6.3ms preprocess, 20.4ms inference, 88.7ms postprocess per image at shape (1, 3, 352, 640)

0: 512x640 1 car, 18.9ms
Speed: 0.9ms preprocess, 18.9ms inference, 0.6ms postprocess per image at shape (1, 3, 512, 640)

0: 320x640 1 car, 16.0ms
Speed: 0.6ms preprocess, 16.0ms inference, 0.5ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 car, 2.7ms
Speed: 0.6ms preprocess, 2.7ms inference, 0.5ms postprocess per image at shape (1, 3, 320, 640)

0: 512x640 1 car, 2.9ms
Speed: 0.8ms preprocess, 2.9ms inference, 0.5ms postprocess per image at shape (1, 3, 512, 640)

0: 544x640

KeyboardInterrupt: 

'/home/minkeymouse/kaggle_projects/hecto/src'