# A.5 Late Fusion (Frozen RGB + Frozen Depth + Trainable Fusion)

**Experiment:** A.5  
**Architecture:** Late Fusion (Two-Stream with Feature Fusion)  
**Input:** RGB (3-channel) + Depth (3-channel) - processed separately  
**Objective:** Combine RGB and Depth features for improved detection  
**Classes:** 1 (fresh_fruit_bunch)

## Architecture Overview

```
RGB Image (3ch)          Depth Image (3ch)
     |                        |
     v                        v
[Frozen RGB Backbone]    [Frozen Depth Backbone]
  (from A.1 weights)       (from A.2 weights)
     |                        |
     +--> P3 / P4 / P5 Features <--+
          (one pair per scale)
                   |
                   v
     [Concatenate RGB + Depth per scale]
                   |
                   v
     [1x1 Conv per scale: reduce channels]
          [BatchNorm + SiLU]
                   |
                   v
         [YOLO Detection Head]
              (trainable)
                   |
              [Output]
```

## Key Features
- **Frozen Backbones:** RGB (A.1) and Depth (A.2) backbones are 100% frozen
- **Trainable Components:** Only the fusion layers (one 1x1 Conv + BatchNorm + SiLU block per scale) and the detection head
- **Dual Input:** Separate RGB and Depth images loaded together
- **Memory Note:** Batch size 8 due to dual backbone forward pass

## Uniform Augmentation (All Experiments)
- translate: 0.1
- scale: 0.5
- fliplr: 0.5
- hsv_h: 0.0 (disabled)
- hsv_s: 0.0 (disabled)
- hsv_v: 0.0 (disabled)
- erasing: 0.0
- mosaic: 0.0
- mixup: 0.0

In [None]:
# =============================================================================
# Cell 1: Environment Setup & Install
# =============================================================================
import os
import sys
from pathlib import Path

# Kaggle kernels mount their datasets under /kaggle/input; use that as the probe
IS_KAGGLE = os.path.exists('/kaggle/input')

# Working root: Kaggle's writable directory, otherwise the local experiments folder
BASE_PATH = Path('/kaggle/working') if IS_KAGGLE else Path(r'D:/Work/Assisten Dosen/Anylabel/Experiments')

# Install dependencies
!pip install -q ultralytics

# Report which environment was detected and where outputs will be written
print("=" * 60, "A.5 LATE FUSION - ENVIRONMENT SETUP", "=" * 60, sep="\n")
print(
    f"Running on: {'Kaggle' if IS_KAGGLE else 'Local'}",
    f"Base Path: {BASE_PATH}",
    sep="\n",
)

In [None]:
# =============================================================================
# Cell 2: Imports
# =============================================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

import cv2
import numpy as np
import pandas as pd
import shutil
import json
import gc
import time
import random
from datetime import datetime
from tqdm.auto import tqdm
from typing import Dict, List, Tuple, Optional
from copy import deepcopy

from ultralytics import YOLO
from ultralytics.nn.tasks import DetectionModel
from ultralytics.utils.loss import v8DetectionLoss
from ultralytics.utils.ops import xywh2xyxy

# Keep Weights & Biases logging switched off for these runs
os.environ.update({"WANDB_DISABLED": "true"})

# Report the PyTorch build and, when a GPU is present, the first CUDA device
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(
        f"GPU: {torch.cuda.get_device_name(0)}",
        f"CUDA Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB",
        sep="\n",
    )
    # Release cached allocator blocks before the experiment starts
    torch.cuda.empty_cache()

In [None]:
# =============================================================================
# Cell 3: Configuration - UNIFORM AUGMENTATION
# =============================================================================

# Augmentation strengths shared by every experiment (MUST match A.1-A.4b).
# Colour (hsv_*), mosaic, mixup, erasing, rotation and copy-paste are all off.
AUGMENT_PARAMS = dict(
    translate=0.1,
    scale=0.5,
    fliplr=0.5,
    hsv_h=0.0,
    hsv_s=0.0,
    hsv_v=0.0,
    erasing=0.0,
    mosaic=0.0,
    mixup=0.0,
    degrees=0.0,
    copy_paste=0.0,
)

# Run-level settings
SEEDS = [42, 123, 456, 789, 101]
EXP_PREFIX = "exp_a5_fusion"
EPOCHS, PATIENCE = 100, 30
IMGSZ = 640
BATCH_SIZE = 8  # kept small: two backbones run per forward pass
DEVICE = 'cpu' if not torch.cuda.is_available() else 0
NUM_WORKERS = 2 if IS_KAGGLE else 4

# Configuration summary
print("=" * 60, "A.5 LATE FUSION - TRAINING CONFIGURATION", "=" * 60, sep="\n")
print("Experiment:   A.5 Late Fusion")
print(f"Seeds:        {SEEDS} ({len(SEEDS)} runs)")
print(f"Epochs:       {EPOCHS} (patience: {PATIENCE})")
print(f"Image Size:   {IMGSZ}")
print(f"Batch Size:   {BATCH_SIZE} (reduced for dual backbone)")
print(f"Device:       {DEVICE}")
print("\nArchitecture:")
print("  RGB Backbone:   FROZEN (from A.1)")
print("  Depth Backbone: FROZEN (from A.2)")
print("  Trainable:      Fusion Layer + Detection Head")
print("\nUniform Augmentation:")
for k, v in AUGMENT_PARAMS.items():
    print(f"  {k}: {v}")
print("=" * 60)

In [None]:
# =============================================================================
# Cell 4: Paths Configuration
# =============================================================================

if not IS_KAGGLE:
    # Local layout: datasets and earlier runs live under the experiments root
    RGB_DATASET = BASE_PATH / 'datasets' / 'ffb_localization'
    DEPTH_DATASET = BASE_PATH / 'datasets' / 'ffb_localization_depth'
    RGB_WEIGHTS_DIR = BASE_PATH / 'runs' / 'detect'
    DEPTH_WEIGHTS_DIR = BASE_PATH / 'runs' / 'detect'
else:
    # Kaggle layout - adjust dataset names as needed
    RGB_DATASET = Path('/kaggle/input/ffb-localization-dataset/ffb_localization')
    DEPTH_DATASET = Path('/kaggle/input/ffb-localization-depth')
    # A.1 / A.2 weights are expected to be uploaded as Kaggle datasets
    RGB_WEIGHTS_DIR = Path('/kaggle/input/ffb-a1-weights')
    DEPTH_WEIGHTS_DIR = Path('/kaggle/input/ffb-a2-weights')

# Output locations (created on demand)
RUNS_PATH = BASE_PATH / 'runs' / 'detect'
KAGGLE_OUTPUT = BASE_PATH / 'kaggleoutput'
RUNS_PATH.mkdir(parents=True, exist_ok=True)
KAGGLE_OUTPUT.mkdir(parents=True, exist_ok=True)

print(
    "Paths Configuration:",
    f"  RGB Dataset:     {RGB_DATASET}",
    f"  Depth Dataset:   {DEPTH_DATASET}",
    f"  RGB Weights Dir: {RGB_WEIGHTS_DIR}",
    f"  Depth Weights:   {DEPTH_WEIGHTS_DIR}",
    f"  Runs Path:       {RUNS_PATH}",
    f"  Output Path:     {KAGGLE_OUTPUT}",
    sep="\n",
)

# Report (without failing) whether each dataset root is present
print("\nDataset Verification:")
for name, path in {'RGB': RGB_DATASET, 'Depth': DEPTH_DATASET}.items():
    exists = path.exists()
    status = 'OK' if exists else 'NOT FOUND'
    print(f"  {name}: {status} - {path}")

In [None]:
# =============================================================================
# Cell 5: Dual-Input Dataset Class
# =============================================================================

class LateFusionDataset(Dataset):
    """
    Paired RGB + Depth detection dataset for late fusion.

    Each sample is an RGB image, the depth image with the same file name, and a
    YOLO-format label file (``class x_center y_center width height``, all
    normalized to [0, 1]). When ``augment`` is set, the same flip / scale /
    translate is applied to both images and to the boxes so the two modalities
    stay aligned.

    Args:
        rgb_img_dir: Directory of RGB ``.png`` images (reference file listing).
        depth_img_dir: Directory of depth images with matching file names.
        label_dir: Directory of ``.txt`` label files with matching stems.
        img_size: Side length both images are resized to (square).
        augment: Whether to apply the synchronized geometric augmentation.
        augment_params: Augmentation strengths; only the keys ``fliplr``,
            ``scale`` and ``translate`` are read.
    """

    def __init__(
        self,
        rgb_img_dir: Path,
        depth_img_dir: Path,
        label_dir: Path,
        img_size: int = 640,
        augment: bool = False,
        augment_params: dict = None
    ):
        self.rgb_img_dir = Path(rgb_img_dir)
        self.depth_img_dir = Path(depth_img_dir)
        self.label_dir = Path(label_dir)
        self.img_size = img_size
        self.augment = augment
        self.augment_params = augment_params or {}

        # The RGB directory is the reference listing; keep only names that
        # also have a depth image and a label file.
        self.image_files = [
            fname
            for fname in sorted(p.name for p in self.rgb_img_dir.glob('*.png'))
            if (self.depth_img_dir / fname).exists() and self._label_path(fname).exists()
        ]
        print(f"[LateFusionDataset] Loaded {len(self)} valid samples")

    def __len__(self) -> int:
        return len(self.image_files)

    def _label_path(self, fname: str) -> Path:
        """Return the label file for an image name (suffix swapped to ``.txt``)."""
        # with_suffix only touches the extension, unlike str.replace which
        # would also rewrite '.png' occurring in the middle of the name.
        return self.label_dir / Path(fname).with_suffix('.txt').name

    @staticmethod
    def _affine_labels(labels, s, tx, ty, w, h):
        """
        Apply the scale-about-centre + translate transform to YOLO boxes.

        Args:
            labels: Array of shape (N, 5): class, x_center, y_center, width,
                height (normalized).
            s: Scale factor.
            tx, ty: Translation in pixels.
            w, h: Image width and height in pixels.

        Returns:
            A new (M, 5) array with boxes clipped to the image; boxes whose
            visible part is (almost) empty are dropped.
        """
        if len(labels) == 0:
            return labels

        # Same offsets as the image affine matrix
        off_x = tx + (1 - s) * w / 2
        off_y = ty + (1 - s) * h / 2

        cx = labels[:, 1] * w * s + off_x
        cy = labels[:, 2] * h * s + off_y
        half_w = labels[:, 3] * w * s / 2
        half_h = labels[:, 4] * h * s / 2

        # Clip in corner form: clipping centre and size independently would
        # keep full-size boxes for objects that are partly (or entirely)
        # outside the frame.
        x1 = np.clip(cx - half_w, 0, w)
        x2 = np.clip(cx + half_w, 0, w)
        y1 = np.clip(cy - half_h, 0, h)
        y2 = np.clip(cy + half_h, 0, h)

        out = labels.copy()
        out[:, 1] = (x1 + x2) / 2 / w
        out[:, 2] = (y1 + y2) / 2 / h
        out[:, 3] = (x2 - x1) / w
        out[:, 4] = (y2 - y1) / h

        # Drop boxes with no meaningful visible area left
        keep = (out[:, 3] > 0.001) & (out[:, 4] > 0.001)
        return out[keep]

    def _apply_augmentation(self, rgb, depth, labels):
        """
        Apply synchronized geometric augmentation to RGB and Depth.

        Only geometric transforms (fliplr, scale, translate) are applied, with
        identical random draws for both images and the boxes.

        Returns:
            Tuple ``(rgb, depth, labels)`` after augmentation. The input
            ``labels`` array is not modified.
        """
        h, w = rgb.shape[:2]
        labels = labels.copy()  # never mutate the caller's array

        # Horizontal flip
        if random.random() < self.augment_params.get('fliplr', 0.0):
            rgb = cv2.flip(rgb, 1)
            depth = cv2.flip(depth, 1)
            if len(labels) > 0:
                labels[:, 1] = 1.0 - labels[:, 1]  # mirror x_center

        # Scale and translate (single affine transform)
        scale = self.augment_params.get('scale', 0.0)
        translate = self.augment_params.get('translate', 0.0)

        if scale > 0 or translate > 0:
            s = random.uniform(1 - scale, 1 + scale)
            tx = random.uniform(-translate, translate) * w
            ty = random.uniform(-translate, translate) * h

            # Scale about the image centre, then shift by (tx, ty)
            M = np.array([
                [s, 0, tx + (1 - s) * w / 2],
                [0, s, ty + (1 - s) * h / 2]
            ], dtype=np.float32)

            # Gray border for RGB, zero border for depth
            rgb = cv2.warpAffine(rgb, M, (w, h), borderValue=(114, 114, 114))
            depth = cv2.warpAffine(depth, M, (w, h), borderValue=(0, 0, 0))

            labels = self._affine_labels(labels, s, tx, ty, w, h)

        return rgb, depth, labels

    def __getitem__(self, idx: int) -> Dict:
        """
        Load one sample.

        Returns:
            Dict with ``rgb`` and ``depth`` float tensors of shape
            (3, img_size, img_size) scaled to [0, 1], ``labels`` as an (N, 5)
            float tensor, a zero ``batch_idx`` placeholder of length N, and
            ``img_path`` (the RGB file path).

        Raises:
            OSError: If the RGB image cannot be read.
        """
        fname = self.image_files[idx]

        # Load RGB (cv2 returns BGR channel order)
        rgb_path = self.rgb_img_dir / fname
        rgb = cv2.imread(str(rgb_path))
        if rgb is None:
            raise OSError(f"Failed to read RGB image: {rgb_path}")

        # Load Depth (expected to be stored as a 3-channel image)
        depth_path = self.depth_img_dir / fname
        depth = cv2.imread(str(depth_path))

        if depth is None:
            # Fallback: load as grayscale and replicate to 3 channels
            depth = cv2.imread(str(depth_path), cv2.IMREAD_GRAYSCALE)
            if depth is not None:
                depth = cv2.cvtColor(depth, cv2.COLOR_GRAY2BGR)
            else:
                # Last resort keeps training alive, but make it visible
                import warnings
                warnings.warn(f"Depth image unreadable, using zeros: {depth_path}")
                depth = np.zeros_like(rgb)

        # Load labels (YOLO format: class x_center y_center width height)
        label_path = self._label_path(fname)
        if label_path.exists():
            labels = np.loadtxt(str(label_path), ndmin=2).astype(np.float32)
            if labels.size == 0:
                labels = np.zeros((0, 5), dtype=np.float32)
        else:
            labels = np.zeros((0, 5), dtype=np.float32)

        # Resize each modality on its own: the depth image may differ in size
        # from the RGB image even when the RGB image already matches.
        target = (self.img_size, self.img_size)
        if rgb.shape[:2] != target:
            rgb = cv2.resize(rgb, target)
        if depth.shape[:2] != target:
            depth = cv2.resize(depth, target)

        # Apply augmentation (synchronized)
        if self.augment:
            rgb, depth, labels = self._apply_augmentation(rgb, depth, labels)

        # Convert BGR to RGB for model input
        rgb = cv2.cvtColor(rgb, cv2.COLOR_BGR2RGB)
        depth = cv2.cvtColor(depth, cv2.COLOR_BGR2RGB)

        # HWC uint8 -> CHW float in [0, 1]
        rgb_tensor = torch.from_numpy(rgb).permute(2, 0, 1).float() / 255.0
        depth_tensor = torch.from_numpy(depth).permute(2, 0, 1).float() / 255.0

        labels_tensor = torch.from_numpy(labels).float()

        return {
            'rgb': rgb_tensor,
            'depth': depth_tensor,
            'labels': labels_tensor,
            'batch_idx': torch.zeros(len(labels)),
            'img_path': str(rgb_path),
        }


def collate_fn(batch):
    """
    Merge samples into a batch, flattening variable-length labels.

    Images are stacked along a new batch dimension. Label rows from all
    samples are concatenated into a single (N, 6) tensor whose first column
    is the index of the sample the row came from; samples without labels
    contribute no rows.
    """
    # One block of rows per sample that has labels, each prefixed by its position
    indexed_rows = [
        torch.cat(
            [
                torch.full((len(sample['labels']), 1), pos, dtype=torch.float32),
                sample['labels'],
            ],
            dim=1,
        )
        for pos, sample in enumerate(batch)
        if len(sample['labels']) > 0
    ]

    if indexed_rows:
        merged_labels = torch.cat(indexed_rows, dim=0)
    else:
        merged_labels = torch.zeros((0, 6), dtype=torch.float32)

    return {
        'rgb': torch.stack([sample['rgb'] for sample in batch]),
        'depth': torch.stack([sample['depth'] for sample in batch]),
        'labels': merged_labels,
        'img_paths': [sample['img_path'] for sample in batch],
    }

# Notebook progress marker printed once this cell's definitions have run
print("LateFusionDataset class defined")

In [None]:
# =============================================================================
# Cell 6: Late Fusion Model Architecture (Multi-Scale)
# =============================================================================

class LateFusionModel(nn.Module):
    """
    Late Fusion Model for FFB Detection with Multi-Scale Features.

    Architecture:
        1. Frozen RGB backbone (from A.1) - tapped at P3, P4, P5
        2. Frozen Depth backbone (from A.2) - tapped at P3, P4, P5
        3. Per scale: concatenate RGB + Depth features, then 1x1 Conv +
           BatchNorm + SiLU down to the width the detection head expects
        4. YOLO Detect head (trainable copy of the RGB model's head)

    Channel widths are measured from the loaded checkpoints (a dummy forward
    pass through each backbone, and the input width of each Detect branch)
    instead of being hard-coded, so the fusion layers always match both the
    tapped features and the head.

    Args:
        rgb_model_path: Weights file for the RGB model (A.1).
        depth_model_path: Weights file for the Depth model (A.2).
        num_classes: Number of classes the caller expects; stored on the
            instance and compared against the copied head for a warning.
        device: Stored on the instance; the caller is responsible for moving
            the module to the device.
    """

    # Layer indices (in the Ultralytics layer list) tapped as P3, P4, P5.
    # NOTE(review): assumes every layer up to the last tap consumes only the
    # previous layer's output (true for the YOLO11 backbone) - confirm when
    # switching architectures.
    EXTRACT_LAYERS = (4, 6, 8)

    def __init__(
        self,
        rgb_model_path: str,
        depth_model_path: str,
        num_classes: int = 1,
        device: str = 'cuda'
    ):
        super().__init__()

        print("\n" + "="*60)
        print("Initializing Late Fusion Model (Multi-Scale)")
        print("="*60)

        self.device = device
        self.num_classes = num_classes

        # Load RGB model (A.1) and freeze
        print(f"\nLoading RGB backbone from: {rgb_model_path}")
        rgb_yolo = YOLO(rgb_model_path)
        # Keep the wrapper reachable as an attribute WITHOUT registering it as
        # a submodule. NOTE(review): in recent Ultralytics releases the YOLO
        # wrapper is itself an nn.Module whose train() launches a training
        # run, so registering it would break nn.Module.train()/eval()
        # recursion and duplicate its weights in state_dict() - confirm
        # against the installed version.
        object.__setattr__(self, 'rgb_yolo', rgb_yolo)
        self.rgb_backbone = self._frozen_backbone(rgb_yolo)
        print(f"  RGB backbone frozen")

        # Load Depth model (A.2) and freeze
        print(f"\nLoading Depth backbone from: {depth_model_path}")
        depth_yolo = YOLO(depth_model_path)
        object.__setattr__(self, 'depth_yolo', depth_yolo)
        self.depth_backbone = self._frozen_backbone(depth_yolo)
        print(f"  Depth backbone frozen")

        # Detection head copied from the RGB model (will be retrained)
        self.detect = deepcopy(rgb_yolo.model.model[-1])
        for param in self.detect.parameters():
            param.requires_grad = True

        # Measure channel widths instead of assuming them
        rgb_channels = self._probe_channels(self.rgb_backbone)
        depth_channels = self._probe_channels(self.depth_backbone)
        head_channels = [self._first_conv_in_channels(branch) for branch in self.detect.cv2]
        if not (len(rgb_channels) == len(depth_channels) == len(head_channels) == 3):
            raise ValueError(
                "Expected 3 feature scales, got "
                f"rgb={len(rgb_channels)}, depth={len(depth_channels)}, head={len(head_channels)}"
            )

        # One fusion block per scale: (RGB + Depth channels) -> head input width
        fusion_specs = [
            (c_rgb + c_depth, c_head)
            for c_rgb, c_depth, c_head in zip(rgb_channels, depth_channels, head_channels)
        ]
        self.fusion_p3 = self._make_fusion(*fusion_specs[0])
        self.fusion_p4 = self._make_fusion(*fusion_specs[1])
        self.fusion_p5 = self._make_fusion(*fusion_specs[2])

        # Initialize fusion layers
        for m in [self.fusion_p3, self.fusion_p4, self.fusion_p5]:
            for layer in m.modules():
                if isinstance(layer, nn.Conv2d):
                    nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')

        print(f"\nFusion layers initialized:")
        for scale_name, (c_in, c_out) in zip(('P3', 'P4', 'P5'), fusion_specs):
            print(f"  {scale_name}: {c_in} -> {c_out} channels")

        print(f"\nDetection head initialized (trainable)")
        head_nc = getattr(self.detect, 'nc', None)
        if head_nc is not None and head_nc != num_classes:
            print(f"  WARNING: head predicts {head_nc} classes, expected {num_classes}")

        # Count parameters
        self._count_parameters()

    def _frozen_backbone(self, yolo):
        """Slice the layers up to the last tap out of a YOLO model and freeze them."""
        # yolo.model.model is the sequential list of every layer; index [0]
        # alone would be just the first convolution, not a backbone.
        backbone = yolo.model.model[: max(self.EXTRACT_LAYERS) + 1]
        for param in backbone.parameters():
            param.requires_grad = False
        backbone.eval()
        return backbone

    @staticmethod
    def _first_conv_in_channels(module):
        """Return ``in_channels`` of the first ``nn.Conv2d`` found inside ``module``."""
        for layer in module.modules():
            if isinstance(layer, nn.Conv2d):
                return layer.in_channels
        raise ValueError("No nn.Conv2d layer found while probing channel widths")

    @staticmethod
    def _make_fusion(in_channels, out_channels):
        """Build one fusion block: 1x1 Conv (no bias) + BatchNorm + SiLU."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.SiLU(inplace=True)
        )

    def _probe_channels(self, backbone):
        """Run a small dummy input through ``backbone`` and return the tap channel counts."""
        reference = next(backbone.parameters())
        dummy = torch.zeros(
            1, self._first_conv_in_channels(backbone), 64, 64,
            device=reference.device, dtype=reference.dtype,
        )
        with torch.no_grad():
            return [feat.shape[1] for feat in self._extract_features(backbone, dummy)]

    def _count_parameters(self):
        """Print total, trainable and frozen parameter counts."""
        total = sum(p.numel() for p in self.parameters())
        trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        frozen = total - trainable
        denom = max(total, 1)  # avoid dividing by zero on an empty module

        print(f"\n[Parameter Count]")
        print(f"  Total:      {total:,}")
        print(f"  Trainable:  {trainable:,} ({100*trainable/denom:.1f}%)")
        print(f"  Frozen:     {frozen:,} ({100*frozen/denom:.1f}%)")
        print("="*60)

    def _extract_features(self, backbone, x):
        """
        Extract multi-scale features (P3, P4, P5) from a backbone.

        Args:
            backbone: Iterable of layers applied one after another.
            x: Input tensor fed to the first layer.

        Returns:
            List ``[P3, P4, P5]`` holding the outputs of the layers listed in
            ``EXTRACT_LAYERS``.

        Raises:
            RuntimeError: If the backbone ends before every tap was reached.
        """
        taps = self.EXTRACT_LAYERS
        last_tap = max(taps)
        features = []

        for i, module in enumerate(backbone):
            x = module(x)
            if i in taps:
                features.append(x)
            if i >= last_tap:
                break  # nothing after the last tap is needed

        if len(features) != len(taps):
            raise RuntimeError(
                f"Backbone yielded {len(features)} feature maps, expected {len(taps)}"
            )
        return features  # [P3, P4, P5]

    def forward(self, rgb: torch.Tensor, depth: torch.Tensor):
        """
        Forward pass through late fusion model.

        Args:
            rgb: RGB input [B, 3, H, W]
            depth: Depth input [B, 3, H, W]

        Returns:
            Detection output from YOLO Detect head
        """
        # Frozen backbones: no graph is needed for them
        with torch.no_grad():
            rgb_features = self._extract_features(self.rgb_backbone, rgb)   # [P3, P4, P5]
            depth_features = self._extract_features(self.depth_backbone, depth)  # [P3, P4, P5]

        # Fuse each scale: concatenate along channels, then reduce
        fused_p3 = self.fusion_p3(torch.cat([rgb_features[0], depth_features[0]], dim=1))
        fused_p4 = self.fusion_p4(torch.cat([rgb_features[1], depth_features[1]], dim=1))
        fused_p5 = self.fusion_p5(torch.cat([rgb_features[2], depth_features[2]], dim=1))

        fused_features = [fused_p3, fused_p4, fused_p5]

        # Pass through detection head
        output = self.detect(fused_features)

        return output

    def train(self, mode=True):
        """Set training mode, keeping both backbones in eval mode."""
        super().train(mode)
        # Frozen backbones must keep using their stored BatchNorm statistics
        self.rgb_backbone.eval()
        self.depth_backbone.eval()
        return self


# Notebook progress marker printed once this cell's definitions have run
print("LateFusionModel class defined with multi-scale features")

In [None]:
# =============================================================================
# Cell 7: Late Fusion Trainer with Proper YOLO Loss
# =============================================================================

class LateFusionTrainer:
    """
    Trainer for the Late Fusion model using the Ultralytics YOLO loss.

    Optimizes only the parameters that require gradients (fusion layers and
    detection head) with SGD + cosine annealing, validates after every epoch,
    saves ``last.pt`` / ``best.pt`` and stops early when the validation loss
    has not improved for ``patience`` epochs.

    Args:
        model: Late fusion model exposing ``detect``, ``rgb_backbone``,
            ``depth_backbone`` and ``num_classes``.
        train_loader: Loader yielding dicts with ``rgb``, ``depth``, ``labels``.
        val_loader: Loader of the same format used for validation.
        device: Device the model and batches are moved to.
        epochs: Maximum number of epochs.
        patience: Epochs without improvement before stopping early.
        lr: Initial learning rate (annealed down to ``lr * 0.01``).
        momentum: SGD momentum.
        weight_decay: SGD weight decay.
        save_dir: Output directory (defaults to ``runs/fusion``).
        num_classes: Number of classes (stored on the instance).
    """

    # Loss gains read by v8DetectionLoss from ``model.args``.
    # NOTE(review): these are the Ultralytics default values - confirm they
    # match the settings used for A.1/A.2.
    LOSS_GAINS = {'box': 7.5, 'cls': 0.5, 'dfl': 1.5}

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        device: str = 'cuda',
        epochs: int = 100,
        patience: int = 30,
        lr: float = 0.01,
        momentum: float = 0.937,
        weight_decay: float = 0.0005,
        save_dir: Path = None,
        num_classes: int = 1,
    ):
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.device = device
        self.epochs = epochs
        self.patience = patience
        self.save_dir = Path(save_dir) if save_dir else Path('runs/fusion')
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.num_classes = num_classes

        # Mixed precision only makes sense when actually running on CUDA
        self.use_amp = torch.cuda.is_available() and str(device) != 'cpu'

        # Only optimize trainable parameters
        self.trainable_params = [p for p in self.model.parameters() if p.requires_grad]
        self.optimizer = torch.optim.SGD(
            self.trainable_params,
            lr=lr,
            momentum=momentum,
            weight_decay=weight_decay,
            nesterov=True,
        )

        # Learning rate scheduler - cosine annealing
        self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=epochs,
            eta_min=lr * 0.01,
        )

        # Mixed precision training (a disabled scaler is a transparent no-op)
        self.scaler = GradScaler(enabled=self.use_amp)

        # YOLO detection loss bound to the fusion model's Detect head
        self.criterion = self._create_loss_function()

        # Training state
        self.best_fitness = 0.0
        self.epochs_no_improve = 0
        self.history = []

        # EMA (Exponential Moving Average) for model weights - not used yet
        self.ema = None

    def _create_loss_function(self):
        """
        Create the YOLO v8DetectionLoss for the fusion model.

        The loss constructor is handed a small view object that mimics the
        parts of an Ultralytics DetectionModel it reads: ``parameters()`` (to
        find the device), ``args`` (loss gains) and ``model[-1]`` (the Detect
        head, for stride / nc / reg_max).

        NOTE(review): the set of attributes read by v8DetectionLoss is taken
        from the Ultralytics source and may differ between releases - confirm
        against the installed version.

        Returns:
            v8DetectionLoss instance
        """
        from types import SimpleNamespace
        from ultralytics.utils.loss import v8DetectionLoss

        fusion_model = self.model
        gains = SimpleNamespace(**self.LOSS_GAINS)

        class _LossModelView:
            """Minimal stand-in for a DetectionModel, as seen by the loss."""

            def __init__(self):
                self.model = [fusion_model.detect]  # loss reads model.model[-1]
                self.args = gains
                self.nc = fusion_model.num_classes
                self.stride = fusion_model.detect.stride

            def parameters(self):
                return fusion_model.parameters()

        return v8DetectionLoss(_LossModelView())

    def _prepare_batch(self, batch: Dict) -> Dict:
        """
        Prepare batch for YOLO loss computation.

        Args:
            batch: Batch from dataloader; ``labels`` rows are
                ``[batch_idx, class, x, y, w, h]``.

        Returns:
            Dict with ``img`` (tuple of RGB and depth tensors on the device),
            ``batch_idx``, ``cls`` and ``bboxes`` (normalized xywh).
        """
        rgb = batch['rgb'].to(self.device)
        depth = batch['depth'].to(self.device)
        labels = batch['labels'].to(self.device)  # [batch_idx, class, x, y, w, h]

        # Split labels into components
        if len(labels) > 0:
            batch_idx = labels[:, 0].long()
            cls = labels[:, 1].long()
            bboxes = labels[:, 2:6]  # xywh normalized
        else:
            batch_idx = torch.zeros(0, dtype=torch.long, device=self.device)
            cls = torch.zeros(0, dtype=torch.long, device=self.device)
            bboxes = torch.zeros(0, 4, device=self.device)

        return {
            'img': (rgb, depth),
            'batch_idx': batch_idx,
            'cls': cls,
            'bboxes': bboxes,
        }

    @staticmethod
    def _loss_components(loss_items):
        """Return ``(box, cls, dfl)`` as floats, padding missing entries with 0."""
        values = [float(v) for v in loss_items][:3]
        values += [0.0] * (3 - len(values))
        return tuple(values)

    def train_epoch(self, epoch: int):
        """
        Train for one epoch.

        Returns:
            Dict with the mean ``loss``, ``box_loss``, ``cls_loss`` and
            ``dfl_loss`` over the epoch's batches.
        """
        self.model.train()

        # Ensure backbones stay in eval mode (frozen)
        self.model.rgb_backbone.eval()
        self.model.depth_backbone.eval()

        totals = {'loss': 0.0, 'box_loss': 0.0, 'cls_loss': 0.0, 'dfl_loss': 0.0}

        pbar = tqdm(self.train_loader, desc=f"Epoch {epoch+1}/{self.epochs}")

        for batch in pbar:
            prepared_batch = self._prepare_batch(batch)
            rgb, depth = prepared_batch['img']

            self.optimizer.zero_grad()

            # Forward pass with autocast for mixed precision
            with autocast(enabled=self.use_amp):
                preds = self.model(rgb, depth)
                loss, loss_items = self.criterion(preds, prepared_batch)
                # The loss may be returned as one value per component;
                # backward() and item() need a scalar either way.
                loss = loss.sum()

            # Backward pass
            self.scaler.scale(loss).backward()

            # Gradient clipping (unscale first so the norm is in real units)
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.trainable_params, max_norm=10.0)

            self.scaler.step(self.optimizer)
            self.scaler.update()

            # Track losses
            loss_value = loss.item()
            box_loss, cls_loss, dfl_loss = self._loss_components(loss_items)
            totals['loss'] += loss_value
            totals['box_loss'] += box_loss
            totals['cls_loss'] += cls_loss
            totals['dfl_loss'] += dfl_loss

            # Update progress bar
            pbar.set_postfix({
                'loss': f"{loss_value:.4f}",
                'box': f"{box_loss:.4f}",
                'cls': f"{cls_loss:.4f}",
            })

        num_batches = max(len(self.train_loader), 1)  # guard against an empty loader
        return {name: total / num_batches for name, total in totals.items()}

    @torch.no_grad()
    def validate(self):
        """
        Compute the validation losses.

        Returns:
            Dict with the mean ``loss``, ``box_loss``, ``cls_loss`` and
            ``dfl_loss`` over the validation batches.
        """
        self.model.eval()

        totals = {'loss': 0.0, 'box_loss': 0.0, 'cls_loss': 0.0, 'dfl_loss': 0.0}

        for batch in self.val_loader:
            prepared_batch = self._prepare_batch(batch)
            rgb, depth = prepared_batch['img']

            # Forward pass
            preds = self.model(rgb, depth)

            # Compute loss (reduced to a scalar, see train_epoch)
            loss, loss_items = self.criterion(preds, prepared_batch)
            box_loss, cls_loss, dfl_loss = self._loss_components(loss_items)

            totals['loss'] += loss.sum().item()
            totals['box_loss'] += box_loss
            totals['cls_loss'] += cls_loss
            totals['dfl_loss'] += dfl_loss

        num_batches = max(len(self.val_loader), 1)  # guard against an empty loader
        return {name: total / num_batches for name, total in totals.items()}

    def save_checkpoint(self, epoch: int, is_best: bool = False):
        """Save ``last.pt`` and, when ``is_best`` is set, ``best.pt``."""
        weights_dir = self.save_dir / 'weights'
        weights_dir.mkdir(exist_ok=True)

        checkpoint = {
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'best_fitness': self.best_fitness,
        }

        # Save last
        torch.save(checkpoint, weights_dir / 'last.pt')

        if is_best:
            torch.save(checkpoint, weights_dir / 'best.pt')
            print(f"  Saved best model (fitness: {self.best_fitness:.4f})")

    def train(self):
        """
        Full training loop.

        Returns:
            List with one dict of metrics per completed epoch.
        """
        print(f"\n{'='*60}")
        print(f"Starting Late Fusion Training")
        print(f"{'='*60}")
        print(f"Epochs: {self.epochs}")
        print(f"Patience: {self.patience}")
        print(f"Save directory: {self.save_dir}")
        print(f"Device: {self.device}")
        print(f"{'='*60}\n")

        for epoch in range(self.epochs):
            # Train
            train_metrics = self.train_epoch(epoch)

            # Validate
            val_metrics = self.validate()

            # Update LR
            self.scheduler.step()

            # Fitness is the inverse validation loss (higher is better).
            # No detection metric such as mAP is computed by this trainer.
            fitness = 1.0 / (val_metrics['loss'] + 1e-6)

            # Check if best
            is_best = fitness > self.best_fitness
            if is_best:
                self.best_fitness = fitness
                self.epochs_no_improve = 0
            else:
                self.epochs_no_improve += 1

            # Save
            self.save_checkpoint(epoch, is_best)

            # Log
            self.history.append({
                'epoch': epoch + 1,
                'train_loss': train_metrics['loss'],
                'train_box_loss': train_metrics['box_loss'],
                'train_cls_loss': train_metrics['cls_loss'],
                'train_dfl_loss': train_metrics['dfl_loss'],
                'val_loss': val_metrics['loss'],
                'val_box_loss': val_metrics['box_loss'],
                'val_cls_loss': val_metrics['cls_loss'],
                'val_dfl_loss': val_metrics['dfl_loss'],
                'lr': self.optimizer.param_groups[0]['lr'],
                'fitness': fitness,
            })

            print(f"\nEpoch {epoch+1}/{self.epochs}:")
            print(f"  Train Loss: {train_metrics['loss']:.4f} "
                  f"(box: {train_metrics['box_loss']:.4f}, "
                  f"cls: {train_metrics['cls_loss']:.4f}, "
                  f"dfl: {train_metrics['dfl_loss']:.4f})")
            print(f"  Val Loss:   {val_metrics['loss']:.4f} "
                  f"(box: {val_metrics['box_loss']:.4f}, "
                  f"cls: {val_metrics['cls_loss']:.4f}, "
                  f"dfl: {val_metrics['dfl_loss']:.4f})")
            print(f"  Fitness:    {fitness:.4f} {'*' if is_best else ''}")
            print(f"  LR:         {self.optimizer.param_groups[0]['lr']:.6f}")

            # Early stopping
            if self.epochs_no_improve >= self.patience:
                print(f"\n{'='*60}")
                print(f"Early stopping at epoch {epoch+1}")
                print(f"Best fitness: {self.best_fitness:.4f}")
                print(f"{'='*60}")
                break

        print(f"\n{'='*60}")
        print(f"Training complete!")
        print(f"Best fitness: {self.best_fitness:.4f}")
        print(f"{'='*60}")

        return self.history


# Notebook progress marker printed once this cell's definitions have run
print("LateFusionTrainer class defined with proper YOLO loss")

In [None]:
# =============================================================================
# Cell 8: Find Best Weights from A.1 and A.2
# =============================================================================

def find_best_weights(weights_dir: Path, exp_prefix: str) -> str:
    """
    Locate a ``best.pt`` checkpoint for an experiment below ``weights_dir``.

    Glob patterns are tried from the most specific
    (``<prefix>_seed*/weights/best.pt``) to the loosest
    (``*<prefix>*/weights/best.pt``); the first pattern that matches wins.

    Returns the lexicographically smallest matching path as a string, or
    ``None`` when no pattern matches anything.
    """
    candidate_globs = (
        f"{exp_prefix}_seed*/weights/best.pt",
        f"{exp_prefix}*/weights/best.pt",
        f"*{exp_prefix}*/weights/best.pt",
    )

    for glob_expr in candidate_globs:
        # Sorting makes the pick deterministic regardless of filesystem order
        hits = sorted(weights_dir.glob(glob_expr))
        if hits:
            return str(hits[0])

    return None

# RGB backbone checkpoint (A.1); retry with the "_v2" run prefix if needed
rgb_weights = (
    find_best_weights(RGB_WEIGHTS_DIR, 'exp_a1_rgb')
    or find_best_weights(RGB_WEIGHTS_DIR, 'exp_a1_rgb_v2')
)

# Depth backbone checkpoint (A.2); same retry scheme
depth_weights = (
    find_best_weights(DEPTH_WEIGHTS_DIR, 'exp_a2_depth')
    or find_best_weights(DEPTH_WEIGHTS_DIR, 'exp_a2_depth_v2')
)

print("Pre-trained Weights:")
print(f"  RGB (A.1):   {rgb_weights}")
print(f"  Depth (A.2): {depth_weights}")

# Both checkpoints are required; otherwise both streams fall back together
if rgb_weights is not None and depth_weights is not None:
    print("\nWeights found successfully!")
else:
    print("\nWARNING: Pre-trained weights not found!")
    print("Please ensure A.1 and A.2 experiments have been run first.")
    print("\nExpected paths:")
    print(f"  RGB: {RGB_WEIGHTS_DIR}/exp_a1_rgb_seed42/weights/best.pt")
    print(f"  Depth: {DEPTH_WEIGHTS_DIR}/exp_a2_depth_seed42/weights/best.pt")

    # Fallback: the generic yolo11n.pt checkpoint replaces BOTH backbones,
    # even if one of the two experiment checkpoints was found
    print("\nFallback: Using yolo11n.pt for both backbones")
    rgb_weights = depth_weights = 'yolo11n.pt'

In [None]:
# =============================================================================
# Cell 9: Training Loop (5 Seeds) with Proper YOLO Loss
# =============================================================================

# Per-seed outcome records and wall-clock training durations
results_all = {}
training_times = {}

rule = "=" * 60


def _make_split_dataset(split, **dataset_kwargs):
    """Build a LateFusionDataset for one split; labels come from the RGB dataset tree."""
    return LateFusionDataset(
        rgb_img_dir=RGB_DATASET / 'images' / split,
        depth_img_dir=DEPTH_DATASET / 'images' / split,
        label_dir=RGB_DATASET / 'labels' / split,
        img_size=IMGSZ,
        **dataset_kwargs,
    )


# DataLoader settings shared by the train and validation loaders
shared_loader_args = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    collate_fn=collate_fn,
    pin_memory=True,
)

print("\n" + rule)
print("STARTING TRAINING LOOP - A.5 LATE FUSION")
print(rule)
print("\nKey Features:")
print("  - Multi-scale feature fusion (P3, P4, P5)")
print("  - Proper YOLO v8DetectionLoss (box + cls + dfl)")
print("  - Frozen RGB and Depth backbones")
print("  - Trainable fusion layers and detection head")
print(rule)

for idx, seed in enumerate(SEEDS, 1):
    start_time = time.time()

    print(f"\n{rule}")
    print(f"TRAINING A.5 LATE FUSION - Seed {seed} ({idx}/{len(SEEDS)})")
    print(f"{rule}")

    # Seed every RNG in use (Python, NumPy, torch CPU and CUDA)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    try:
        # Datasets: augmentation is applied to the training split only
        train_dataset = _make_split_dataset(
            'train', augment=True, augment_params=AUGMENT_PARAMS
        )
        val_dataset = _make_split_dataset('val', augment=False)

        # Dataloaders: only the training loader shuffles
        train_loader = DataLoader(train_dataset, shuffle=True, **shared_loader_args)
        val_loader = DataLoader(val_dataset, shuffle=False, **shared_loader_args)

        print(f"\nDataset loaded:")
        print(f"  Train samples: {len(train_dataset)}")
        print(f"  Val samples: {len(val_dataset)}")

        # A fresh model per seed, built from the resolved backbone checkpoints
        model = LateFusionModel(
            rgb_model_path=rgb_weights,
            depth_model_path=depth_weights,
            num_classes=1,
            device=DEVICE,
        )

        # Each seed gets its own run directory
        save_dir = RUNS_PATH / f"{EXP_PREFIX}_seed{seed}"

        trainer = LateFusionTrainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            device=DEVICE,
            epochs=EPOCHS,
            patience=PATIENCE,
            lr=0.01,
            save_dir=save_dir,
            num_classes=1,
        )

        history = trainer.train()

        elapsed = time.time() - start_time
        training_times[seed] = elapsed

        results_all[seed] = {
            'model_path': str(save_dir / 'weights' / 'best.pt'),
            'history': history,
            'completed': True,
            'best_fitness': trainer.best_fitness,
        }

        print(f"\n{rule}")
        print(f"Seed {seed} completed successfully!")
        print(f"Best fitness: {trainer.best_fitness:.4f}")
        print(f"Training time: {elapsed/60:.1f} minutes")
        print(f"{rule}")

    except Exception as e:
        # A failing seed is reported and recorded; the remaining seeds still run
        print(f"\n{rule}")
        print(f"Seed {seed} failed: {e}")
        print(f"{rule}")
        import traceback
        traceback.print_exc()
        results_all[seed] = {'error': str(e), 'completed': False}

    finally:
        # Drop the per-seed objects (they may not exist if construction
        # failed) and release cached GPU memory before the next seed
        if 'model' in locals():
            del model
        if 'trainer' in locals():
            del trainer
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

completed_runs = sum(
    1 for record in results_all.values() if record.get('completed', False)
)

print("\n" + rule)
print("TRAINING LOOP COMPLETED")
print(f"Successful: {completed_runs}/{len(SEEDS)}")
print(rule)

In [None]:
# =============================================================================
# Cell 10: mAP Evaluation Functions (Proper Implementation)
# =============================================================================

def box_iou(box1, box2):
    """
    Pairwise IoU between two sets of axis-aligned boxes.

    box1: [N, 4] tensor, xyxy format
    box2: [M, 4] tensor, xyxy format
    Returns: [N, M] tensor where entry (i, j) is IoU(box1[i], box2[j])
    """
    area1 = (box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1])
    area2 = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])

    # Broadcast to [N, M, 2]: corners of every pairwise overlap rectangle
    top_left = torch.max(box1[:, None, :2], box2[None, :, :2])
    bottom_right = torch.min(box1[:, None, 2:], box2[None, :, 2:])

    # Disjoint boxes produce negative extents; clamp them to zero overlap
    overlap = (bottom_right - top_left).clamp(min=0)
    inter = overlap[:, :, 0] * overlap[:, :, 1]

    union = area1[:, None] + area2[None, :] - inter

    # Small epsilon guards against division by zero for degenerate boxes
    return inter / (union + 1e-6)


def xywh_to_xyxy(boxes, img_size=640):
    """
    Convert normalized (x_center, y_center, w, h) boxes to pixel xyxy corners.

    boxes: sequence, array or tensor of shape [N, 4]
    img_size: scale factor applied to all four coordinates
    Returns: [N, 4] tensor (an empty [0, 4] tensor for empty input)
    """
    if len(boxes) == 0:
        return torch.zeros((0, 4))

    if not isinstance(boxes, torch.Tensor):
        boxes = torch.tensor(boxes)

    # Scale centres and sizes to pixels first, then offset by half the size
    centers = boxes[:, :2] * img_size
    half_extent = (boxes[:, 2:4] * img_size) / 2

    return torch.cat([centers - half_extent, centers + half_extent], dim=1)


def compute_ap(recalls, precisions):
    """
    Compute Average Precision using 101-point interpolation (COCO style).

    Args:
        recalls: 1-D sequence of recall values, one per detection, taken in
            descending score order.
        precisions: 1-D sequence of precision values aligned with ``recalls``.

    Returns:
        float: mean of the interpolated precision sampled at the recall
        levels 0.00, 0.01, ..., 1.00.
    """
    # Sentinels: (recall 0, precision 0) and (recall 1, precision 0).
    # The leading precision is 0, not 1: the envelope below lifts it to the
    # best precision actually achieved, so a curve with no true positives
    # scores exactly 0 instead of getting a free 1.0 at recall level 0.
    recalls = np.concatenate([[0.0], recalls, [1.0]])
    precisions = np.concatenate([[0.0], precisions, [0.0]])

    # Precision envelope: each entry becomes the max of itself and everything
    # to its right, making the curve monotonically non-increasing
    precisions = np.maximum.accumulate(precisions[::-1])[::-1]

    # 101-point interpolation
    recall_levels = np.linspace(0, 1, 101)
    total = 0.0
    for level in recall_levels:
        reachable = precisions[recalls >= level]
        if reachable.size:
            total += reachable.max()

    return float(total / recall_levels.size)


def evaluate_detections(all_predictions, all_targets, iou_threshold=0.5, img_size=640):
    """
    Evaluate detections and compute Precision, Recall, and AP.

    Within each image, predictions are visited in descending score order and
    greedily matched to the not-yet-matched ground-truth box with the highest
    IoU; a match requires IoU strictly greater than ``iou_threshold``.

    Args:
        all_predictions: List of (boxes, scores) per image, boxes in xywh normalized
        all_targets: List of target boxes per image, in xywh normalized
        iou_threshold: IoU threshold for matching
        img_size: Scale used to convert normalized boxes to pixels before IoU

    Returns:
        dict with 'Precision' and 'Recall' (both measured over ALL supplied
        detections, i.e. at the lowest score present) and 'AP'. All three are
        0.0 when there are no detections or no ground-truth boxes.
    """
    det_scores = []
    det_is_tp = []  # 1 if TP, 0 if FP, aligned with det_scores
    total_gt = 0

    for (pred_boxes, pred_scores), targets in zip(all_predictions, all_targets):
        num_gt = len(targets)
        total_gt += num_gt

        if len(pred_boxes) == 0:
            continue

        if num_gt == 0:
            # Nothing to match against: every prediction is a false positive
            det_scores.extend(pred_scores)
            det_is_tp.extend([0] * len(pred_scores))
            continue

        ious = box_iou(
            xywh_to_xyxy(pred_boxes, img_size),
            xywh_to_xyxy(targets, img_size),
        ).cpu().numpy()

        gt_taken = np.zeros(num_gt, dtype=bool)

        for det in np.argsort(pred_scores)[::-1]:
            det_scores.append(pred_scores[det])

            # Already-matched targets are masked out so they cannot win again
            candidates = np.where(gt_taken, -np.inf, ious[det])
            best_gt = int(candidates.argmax())

            if candidates[best_gt] > iou_threshold:
                gt_taken[best_gt] = True
                det_is_tp.append(1)  # TP
            else:
                det_is_tp.append(0)  # FP

    if len(det_scores) == 0 or total_gt == 0:
        return {'Precision': 0.0, 'Recall': 0.0, 'AP': 0.0}

    # Rank all detections of the whole set by descending score
    order = np.argsort(det_scores)[::-1]
    matches = np.asarray(det_is_tp)[order]

    tp_cumsum = np.cumsum(matches)
    fp_cumsum = np.cumsum(1 - matches)

    # No epsilon needed: tp + fp >= 1 at every rank and total_gt > 0 here.
    # An epsilon in the recall denominator would keep recall strictly below
    # 1.0 and so cap the AP of a perfect detector below 1.
    precisions = tp_cumsum / (tp_cumsum + fp_cumsum)
    recalls = tp_cumsum / total_gt

    ap = compute_ap(recalls, precisions)

    return {
        'Precision': float(precisions[-1]),
        'Recall': float(recalls[-1]),
        'AP': float(ap)
    }


def nms_numpy(boxes, scores, iou_threshold=0.45):
    """
    Greedy non-maximum suppression on NumPy arrays.

    boxes: [N, 4] in xyxy format (array-like)
    scores: [N] (array-like)
    iou_threshold: a box is dropped when its IoU with an already kept,
        higher-scoring box exceeds this value
    Returns: array of kept indices, highest score first
    """
    if len(boxes) == 0:
        return np.array([], dtype=int)

    boxes = np.array(boxes)
    scores = np.array(scores)

    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    # Candidate indices, best score first
    remaining = scores.argsort()[::-1]
    kept = []

    while remaining.size:
        top, rest = remaining[0], remaining[1:]
        kept.append(top)

        if rest.size == 0:
            break

        # Overlap of the current best box with every remaining candidate
        overlap_w = np.maximum(0, np.minimum(x2[top], x2[rest]) - np.maximum(x1[top], x1[rest]))
        overlap_h = np.maximum(0, np.minimum(y2[top], y2[rest]) - np.maximum(y1[top], y1[rest]))
        inter = overlap_w * overlap_h

        iou = inter / (areas[top] + areas[rest] - inter + 1e-6)

        # Survivors: candidates whose IoU does not exceed the threshold
        remaining = rest[iou <= iou_threshold]

    return np.array(kept)


def decode_yolo_output(output, conf_threshold=0.25, iou_threshold=0.45, img_size=640):
    """
    Decode single-class YOLO output to boxes and scores.

    Each batch item is reduced to rows of (x, y, w, h, conf), filtered by
    confidence, clipped to [0, 1] and passed through NMS.

    Args:
        output: Raw model output tensor, or a list/tuple whose first element
            is that tensor. 3-D layouts [batch, 5, anchors] and
            [batch, anchors, 5] are recognised; any other 3-D layout is
            reshaped to [batch, -1, 5] as a best effort.
        conf_threshold: Confidence threshold (strictly greater is kept)
        iou_threshold: NMS IoU threshold
        img_size: Image size used to scale boxes to pixels for NMS

    Returns:
        (boxes, scores): two lists with exactly one entry per batch item.
        boxes[i] is an [N, 4] array in xywh normalized format, scores[i] an
        [N] array of confidences; both are empty when nothing survives.
    """
    # NOTE(review): the clip to [0, 1] below assumes the model emits boxes
    # already normalized by the image size. Ultralytics' stock Detect head
    # reportedly emits pixel-space xywh in inference mode - confirm that
    # LateFusionModel normalizes its output, otherwise every box collapses.

    # Handle different output formats
    if isinstance(output, (list, tuple)):
        output = output[0]

    if output.dim() == 3:
        if output.shape[1] == 5:  # [batch, 5, anchors] -> 1 class + 4 coords
            output = output.permute(0, 2, 1)  # [batch, anchors, 5]
        elif output.shape[2] != 5:
            # Unknown layout: best-effort regrouping into rows of 5 values.
            # reshape (not view) so non-contiguous tensors are accepted too.
            output = output.reshape(output.shape[0], -1, 5)

    all_boxes = []
    all_scores = []

    def _append_empty():
        # Keeps the result lists aligned with the batch index
        all_boxes.append(np.zeros((0, 4)))
        all_scores.append(np.array([]))

    for pred in output:
        if pred.dim() == 1:
            pred = pred.unsqueeze(0)

        # Rows must hold at least x, y, w, h, conf. An unusable item still
        # gets an (empty) entry so later items keep their batch position.
        if pred.shape[-1] < 5:
            _append_empty()
            continue

        conf = pred[:, 4]
        boxes = pred[:, :4]

        # Filter by confidence
        mask = conf > conf_threshold
        conf = conf[mask]
        boxes = boxes[mask]

        if len(boxes) == 0:
            _append_empty()
            continue

        # detach() so decoding also works outside torch.no_grad()
        boxes_np = boxes.detach().cpu().numpy()
        scores_np = conf.detach().cpu().numpy()

        # Ensure boxes are in valid range [0, 1]
        boxes_np = np.clip(boxes_np, 0, 1)

        # Convert to pixel xyxy for NMS
        x_center = boxes_np[:, 0] * img_size
        y_center = boxes_np[:, 1] * img_size
        w = boxes_np[:, 2] * img_size
        h = boxes_np[:, 3] * img_size

        boxes_xyxy = np.stack(
            [
                x_center - w / 2,
                y_center - h / 2,
                x_center + w / 2,
                y_center + h / 2,
            ],
            axis=1,
        )

        keep = nms_numpy(boxes_xyxy, scores_np, iou_threshold)

        if len(keep) > 0:
            # Returned boxes stay in normalized xywh; xyxy was only for NMS
            all_boxes.append(boxes_np[keep])
            all_scores.append(scores_np[keep])
        else:
            _append_empty()

    return all_boxes, all_scores

print("mAP evaluation functions defined")

In [None]:
# =============================================================================
# Cell 11: Evaluation on Test Set (Proper mAP)
# =============================================================================

# Test-set metrics per seed: {seed: {'mAP50', 'mAP50-95', 'Precision', 'Recall'}}
results_dict = {}

print("\n" + "="*60)
print("EVALUATION ON TEST SET (mAP)")
print("="*60)

# Create test dataset (no augmentation; labels come from the RGB dataset tree)
test_dataset = LateFusionDataset(
    rgb_img_dir=RGB_DATASET / 'images' / 'test',
    depth_img_dir=DEPTH_DATASET / 'images' / 'test',
    label_dir=RGB_DATASET / 'labels' / 'test',
    img_size=IMGSZ,
    augment=False
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    collate_fn=collate_fn
)

print(f"Test samples: {len(test_dataset)}")

for seed in SEEDS:
    model_path = RUNS_PATH / f"{EXP_PREFIX}_seed{seed}" / 'weights' / 'best.pt'

    if not model_path.exists():
        print(f"\nSeed {seed}: Model not found at {model_path}")
        continue

    print(f"\nEvaluating Seed {seed}...")

    # Bound before the try so the finally-clause can always release it
    model = None

    try:
        # Rebuild the architecture exactly as in training (num_classes=1),
        # then overwrite its parameters with the trained checkpoint
        model = LateFusionModel(
            rgb_model_path=rgb_weights,
            depth_model_path=depth_weights,
            num_classes=1,
            device=DEVICE
        )
        model = model.to(DEVICE)

        checkpoint = torch.load(model_path, map_location=DEVICE)
        model.load_state_dict(checkpoint['model_state_dict'])
        model.eval()

        # Per-image predictions and targets, in dataset order
        all_predictions = []
        all_targets = []

        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f"Seed {seed}"):
                rgb = batch['rgb'].to(DEVICE)
                depth = batch['depth'].to(DEVICE)
                labels = batch['labels']  # [batch_idx, class, x, y, w, h]

                outputs = model(rgb, depth)

                # NOTE(review): detections below conf 0.25 are discarded
                # before AP is computed, which truncates the PR curve.
                # Ultralytics validation reportedly defaults to conf=0.001,
                # so confirm these mAP values are comparable with A.1-A.4b.
                pred_boxes, pred_scores = decode_yolo_output(
                    outputs,
                    conf_threshold=0.25,
                    iou_threshold=0.45,
                    img_size=IMGSZ
                )

                batch_size = rgb.shape[0]

                for b in range(batch_size):
                    # Predictions for this image (empty if the decoder
                    # returned fewer entries than the batch holds)
                    if b < len(pred_boxes):
                        boxes = pred_boxes[b]
                        scores = pred_scores[b]
                    else:
                        boxes = np.zeros((0, 4))
                        scores = np.array([])

                    all_predictions.append((boxes, scores))

                    # Targets for this image: rows whose batch index is b,
                    # keeping columns [x, y, w, h]
                    img_labels = labels[labels[:, 0] == b] if len(labels) > 0 else labels
                    if len(img_labels) > 0:
                        target_boxes = img_labels[:, 2:6].numpy()
                    else:
                        target_boxes = np.zeros((0, 4))

                    all_targets.append(target_boxes)

        # One evaluation per IoU threshold 0.50, 0.55, ..., 0.95; the first
        # entry (IoU 0.5) also supplies mAP50, Precision and Recall
        per_iou_metrics = [
            evaluate_detections(
                all_predictions, all_targets,
                iou_threshold=iou_thresh, img_size=IMGSZ
            )
            for iou_thresh in np.arange(0.5, 0.96, 0.05)
        ]
        metrics_50 = per_iou_metrics[0]
        map50_95 = np.mean([metrics['AP'] for metrics in per_iou_metrics])

        results_dict[seed] = {
            'mAP50': metrics_50['AP'],
            'mAP50-95': map50_95,
            'Precision': metrics_50['Precision'],
            'Recall': metrics_50['Recall'],
        }

        print(f"  mAP50:     {metrics_50['AP']:.4f}")
        print(f"  mAP50-95:  {map50_95:.4f}")
        print(f"  Precision: {metrics_50['Precision']:.4f}")
        print(f"  Recall:    {metrics_50['Recall']:.4f}")

    except Exception as e:
        # A failing seed is reported; the remaining seeds are still evaluated
        print(f"  Evaluation failed: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Release the model on success AND on failure so a broken seed does
        # not keep its weights on the GPU while the next seed is loaded
        model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

print("\n" + "="*60)
print("EVALUATION COMPLETED")
print("="*60)

In [None]:
# =============================================================================
# Cell 12: Results Summary (Same Format as A.1-A.4b)
# =============================================================================

if not results_dict:
    print("No results to display. Training may have failed.")
else:
    # One row per seed, one column per metric
    df = pd.DataFrame(results_dict).T
    df.index.name = 'Seed'

    # Column-wise statistics across seeds
    avg, std = df.mean(), df.std()
    min_vals, max_vals = df.min(), df.max()

    wide_rule = "=" * 60
    thin_rule = "-" * 60

    print("\n" + wide_rule)
    print("A.5 LATE FUSION (V2) - FINAL RESULTS")
    print(wide_rule + "\n")

    print("Per-Seed Results:")
    print(df.to_string(float_format="{:.4f}".format))

    print("\n" + thin_rule)
    print("STATISTICAL SUMMARY")
    print(thin_rule)
    print(f"{'Metric':<15} {'Mean':>10} {'Std':>10} {'Min':>10} {'Max':>10}")
    print(thin_rule)
    for col in df.columns:
        row_values = (avg[col], std[col], min_vals[col], max_vals[col])
        print(f"{col:<15} " + " ".join(f"{value:>10.4f}" for value in row_values))

    # The best seed is the one with the highest mAP50
    best_seed = df['mAP50'].idxmax()
    print(f"\nBest Seed: {best_seed} (mAP50: {df.loc[best_seed, 'mAP50']:.4f})")

    print(wide_rule)

In [None]:
# =============================================================================
# Cell 13: Save Results (Same Format as A.1-A.4b)
# =============================================================================

output_file = KAGGLE_OUTPUT / 'a5_late_fusion_v2_results.txt'


def _json_safe_float(value):
    """Return ``value`` as a plain float, or None when it is NaN/inf.

    With a single successful seed the sample standard deviation is NaN;
    json.dump would write it as a bare ``NaN`` token, which is not valid
    JSON and is rejected by strict parsers.
    """
    value = float(value)
    return value if np.isfinite(value) else None


# Human-readable report. df / avg / std / best_seed come from the summary
# cell and exist only when results_dict is non-empty, hence the guard below.
with open(output_file, 'w', encoding='utf-8') as f:
    f.write("="*60 + "\n")
    f.write("A.5 Late Fusion (V2) Results\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
    f.write(f"Environment: {'Kaggle' if IS_KAGGLE else 'Local'}\n")
    f.write("="*60 + "\n\n")

    f.write("Configuration:\n")
    f.write(f"  Architecture: Late Fusion (RGB + Depth)\n")
    f.write(f"  RGB Backbone: Frozen (from A.1)\n")
    f.write(f"  Depth Backbone: Frozen (from A.2)\n")
    f.write(f"  Trainable: Fusion Layer + Detection Head\n")
    f.write(f"  Model: YOLOv11n (backbones)\n")
    f.write(f"  Epochs: {EPOCHS} (patience: {PATIENCE})\n")
    f.write(f"  Image Size: {IMGSZ}\n")
    f.write(f"  Batch Size: {BATCH_SIZE}\n")
    f.write(f"  Seeds: {SEEDS}\n")

    f.write("\nUniform Augmentation:\n")
    for key, value in AUGMENT_PARAMS.items():
        f.write(f"  {key}: {value}\n")

    if results_dict:
        f.write("\n" + "="*60 + "\n")
        f.write("Per-Seed Results:\n")
        f.write("="*60 + "\n")
        f.write(df.to_string(float_format=lambda x: f"{x:.4f}"))

        f.write("\n\n" + "-"*60 + "\n")
        f.write("Summary (Mean +/- Std):\n")
        f.write("-"*60 + "\n")
        for col in df.columns:
            f.write(f"  {col}: {avg[col]:.4f} +/- {std[col]:.4f}\n")

        f.write(f"\nBest Seed: {best_seed}\n")

print(f"\nResults saved: {output_file}")

# Save as JSON (same format as other experiments). The summary is built only
# when results exist; non-finite statistics are stored as null.
summary = None
if results_dict:
    summary = {
        'mean': {k: _json_safe_float(v) for k, v in avg.items()},
        'std': {k: _json_safe_float(v) for k, v in std.items()},
        'best_seed': int(best_seed),
    }

json_output = {
    'experiment': 'A.5',
    'variant': 'V2',
    'name': 'Late Fusion',
    'seeds': SEEDS,
    'config': {
        'model': 'yolo11n',
        'architecture': 'late_fusion',
        'rgb_backbone': 'frozen_from_a1',
        'depth_backbone': 'frozen_from_a2',
        'epochs': EPOCHS,
        'patience': PATIENCE,
        'imgsz': IMGSZ,
        'batch': BATCH_SIZE,
        'augmentation': AUGMENT_PARAMS,
    },
    'results': {
        str(seed): {name: _json_safe_float(v) for name, v in metrics.items()}
        for seed, metrics in results_dict.items()
    },
    'summary': summary,
}

json_file = KAGGLE_OUTPUT / 'a5_late_fusion_v2_results.json'
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(json_output, f, indent=2)

print(f"JSON saved: {json_file}")

In [None]:
# =============================================================================
# Cell 14: Create Archives for Download
# =============================================================================

section_rule = "=" * 60

print("\n" + section_rule)
print("CREATING ARCHIVES")
print(section_rule + "\n")

# (archive stem, directory to zip); the runs archive is optional because the
# runs directory exists only if training wrote something
archive_jobs = []
if RUNS_PATH.exists():
    archive_jobs.append(('a5_late_fusion_v2_runs', RUNS_PATH))
archive_jobs.append(('a5_late_fusion_v2_output', KAGGLE_OUTPUT))

for stem, source_dir in archive_jobs:
    archive_base = BASE_PATH / stem
    # make_archive appends ".zip" to the base name itself
    shutil.make_archive(str(archive_base), 'zip', source_dir)
    size_mb = (archive_base.with_suffix('.zip')).stat().st_size / 1024 / 1024
    print(f"{stem}.zip: {size_mb:.1f} MB")

print("\n" + section_rule)
print("ALL DONE!")
print(section_rule)
print("\nDownload from Output tab:")
print("  - a5_late_fusion_v2_runs.zip (training runs)")
print("  - a5_late_fusion_v2_output.zip (results)")
print("\nOutput files:")
print(f"  - {output_file}")
print(f"  - {json_file}")