# A.4b - RGB + Synthetic Depth (4-Channel) Training - FIXED

**Experiment:** A.4b  
**Input:** RGB + Synthetic Depth (4-channel RGBD)  
**Objective:** Test fusion of RGB with synthetic depth (Depth-Anything-V2)  
**Classes:** 1 (fresh_fruit_bunch)

## FIX Applied
This version includes proper 4-channel support:
1. Monkey-patch `imread` to read 4 channels
2. Custom `RGBD4ChTrainer` and `RGBD4ChValidator`
3. `channels: 4` in YAML config

## Prerequisites
1. Upload RGB dataset as: `ffb-localization-dataset`
2. Upload Synthetic Depth dataset as: `ffb-synthetic-depth`

## Training Config
- Model: YOLOv11n
- Epochs: 100
- Patience: 30
- Seeds: 42, 123, 456, 789, 101

In [1]:
# =============================================================================
# Cell 1: Setup & Install
# =============================================================================
!pip install -q ultralytics

import os
import torch
import torch.nn as nn
import numpy as np
import cv2
import shutil
import pandas as pd
from pathlib import Path
from datetime import datetime
from tqdm.auto import tqdm

os.environ["WANDB_DISABLED"] = "true"

IS_KAGGLE = os.path.exists('/kaggle/input')
print(f"Running on: {'Kaggle' if IS_KAGGLE else 'Local'}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hRunning on: Kaggle
PyTorch: 2.8.0+cu126
CUDA: True
GPU: Tesla T4


In [2]:
# =============================================================================
# Cell 2: Path Setup
# =============================================================================
# Input datasets
RGB_DATASET = Path('/kaggle/input/ffb-localization-dataset/ffb_localization')
DEPTH_DATASET = Path('/kaggle/input/ffb-synthetic-depth')

# Working / output locations
BASE_PATH = Path('/kaggle/working')
RUNS_PATH = BASE_PATH / 'runs' / 'detect'
RGBD_DATASET = BASE_PATH / 'rgbd_4ch'
KAGGLE_OUTPUT = BASE_PATH / 'kaggleoutput'
KAGGLE_OUTPUT.mkdir(parents=True, exist_ok=True)

for label, dataset_root in (("RGB Dataset", RGB_DATASET), ("Depth Dataset", DEPTH_DATASET)):
    print(f"{label}: {dataset_root} (exists: {dataset_root.exists()})")
print(f"Output RGBD: {RGBD_DATASET}")

# Per-split sanity check: RGB images, depth images and labels should line up.
for split in ('train', 'val', 'test'):
    rgb_imgs = sum(1 for _ in (RGB_DATASET / 'images' / split).glob('*.png'))
    depth_imgs = sum(1 for _ in (DEPTH_DATASET / 'images' / split).glob('*.png'))
    rgb_lbls = sum(1 for _ in (RGB_DATASET / 'labels' / split).glob('*.txt'))
    print(f"  {split}: RGB={rgb_imgs}, Depth={depth_imgs}, Labels={rgb_lbls}")

RGB Dataset: /kaggle/input/ffb-localization-dataset/ffb_localization (exists: True)
Depth Dataset: /kaggle/input/ffb-synthetic-depth (exists: True)
Output RGBD: /kaggle/working/rgbd_4ch
  train: RGB=280, Depth=280, Labels=280
  val: RGB=80, Depth=80, Labels=80
  test: RGB=40, Depth=40, Labels=40


In [3]:
# =============================================================================
# Cell 3: Create 4-Channel RGBD Dataset
# =============================================================================
# For every split, stack each RGB image with its same-named synthetic depth map
# into a single 4-channel PNG under RGBD_DATASET, and copy the labels alongside.
print("="*60)
print("CREATING 4-CHANNEL RGBD DATASET")
print("="*60)

for split in ['train', 'val', 'test']:
    print(f"\nProcessing {split.upper()}...")

    rgb_img_dir = RGB_DATASET / 'images' / split
    depth_img_dir = DEPTH_DATASET / 'images' / split
    rgb_lbl_dir = RGB_DATASET / 'labels' / split

    # Create output directories
    rgbd_img_dir = RGBD_DATASET / 'images' / split
    rgbd_lbl_dir = RGBD_DATASET / 'labels' / split
    rgbd_img_dir.mkdir(parents=True, exist_ok=True)
    rgbd_lbl_dir.mkdir(parents=True, exist_ok=True)

    # Get RGB files (sorted so processing order is reproducible)
    rgb_files = sorted(rgb_img_dir.glob('*.png'))

    for rgb_path in tqdm(rgb_files, desc=f"Creating 4-ch images ({split})"):
        # Load the colour image (3 channels, in OpenCV's default channel order)
        rgb = cv2.imread(str(rgb_path))
        if rgb is None:
            print(f"  Warning: Could not read {rgb_path}")
            continue

        # Load synthetic depth (1 channel); it is matched to the RGB file by name
        depth_path = depth_img_dir / rgb_path.name
        depth = cv2.imread(str(depth_path), cv2.IMREAD_GRAYSCALE)
        if depth is None:
            print(f"  Warning: Could not read {depth_path}")
            continue

        # Resize depth if needed (cv2.resize takes the target as (width, height))
        if depth.shape[:2] != rgb.shape[:2]:
            depth = cv2.resize(depth, (rgb.shape[1], rgb.shape[0]))

        # Combine colour (3) + depth (1) = 4 channels, depth as the last channel
        depth_expanded = depth[:, :, np.newaxis]
        rgbd_4ch = np.concatenate([rgb, depth_expanded], axis=2)

        # Save 4-channel image; cv2.imwrite signals failure through its return
        # value rather than an exception, so check it explicitly.
        output_img_path = rgbd_img_dir / rgb_path.name
        if not cv2.imwrite(str(output_img_path), rgbd_4ch):
            print(f"  Warning: Could not write {output_img_path}")

    # Copy labels
    for lbl_path in rgb_lbl_dir.glob('*.txt'):
        shutil.copy(str(lbl_path), str(rgbd_lbl_dir / lbl_path.name))

    img_count = len(list(rgbd_img_dir.glob('*.png')))
    lbl_count = len(list(rgbd_lbl_dir.glob('*.txt')))
    print(f"  Done: {img_count} images, {lbl_count} labels")

# Verify sample: read one generated image back without any channel conversion.
sample_paths = sorted((RGBD_DATASET / 'images' / 'train').glob('*.png'))
if not sample_paths:
    print("\nNo RGBD training images were created - check the warnings above")
else:
    sample_img = sample_paths[0]
    img = cv2.imread(str(sample_img), cv2.IMREAD_UNCHANGED)
    if img is None:
        print(f"\nCould not read back sample image: {sample_img}")
    else:
        print(f"\nSample image shape: {img.shape} (should be H x W x 4)")

CREATING 4-CHANNEL RGBD DATASET

Processing TRAIN...


Creating 4-ch images (train):   0%|          | 0/280 [00:00<?, ?it/s]

  Done: 280 images, 280 labels

Processing VAL...


Creating 4-ch images (val):   0%|          | 0/80 [00:00<?, ?it/s]

  Done: 80 images, 80 labels

Processing TEST...


Creating 4-ch images (test):   0%|          | 0/40 [00:00<?, ?it/s]

  Done: 40 images, 40 labels

Sample image shape: (720, 1280, 4) (should be H x W x 4)


In [4]:
# =============================================================================
# Cell 4: Write YAML Config with channels: 4
# =============================================================================
# The dataset YAML lives next to the generated images and declares 4 input channels.
config_path = RGBD_DATASET / 'dataset_rgbd_synthetic.yaml'

config_content = f"""# A.4b RGB+Synthetic Depth 4-Channel Dataset
path: {RGBD_DATASET}
train: images/train
val: images/val
test: images/test

nc: 1
channels: 4

names:
  0: fresh_fruit_bunch
"""

config_path.write_text(config_content)

# Echo the location and the exact content that was written.
print(f"YAML saved: {config_path}")
print("(includes channels: 4)")
print(config_content)

YAML saved: /kaggle/working/rgbd_4ch/dataset_rgbd_synthetic.yaml
(includes channels: 4)
# A.4b RGB+Synthetic Depth 4-Channel Dataset
path: /kaggle/working/rgbd_4ch
train: images/train
val: images/val
test: images/test

nc: 1
channels: 4

names:
  0: fresh_fruit_bunch



In [5]:
# =============================================================================
# Cell 5: Monkey-Patch Ultralytics imread for 4-Channel Support
# =============================================================================
# Replace `ultralytics.utils.patches.imread` with a wrapper that always reads
# images unchanged, so the 4th (depth) channel of the generated PNGs survives.
import ultralytics.utils.patches as patches

# Keep a handle on the original reader; the wrapper below delegates to it.
_original_imread = patches.imread

def imread_4ch(filename, flags=cv2.IMREAD_UNCHANGED):
    """Read an image with IMREAD_UNCHANGED so a 4-channel file keeps all 4 channels.

    Args:
        filename: Path of the image, forwarded to the original Ultralytics reader.
        flags: Accepted for signature compatibility only; the value is ignored
            and cv2.IMREAD_UNCHANGED is always used.

    Returns:
        Whatever the original `patches.imread` returns for this file.
    """
    return _original_imread(filename, cv2.IMREAD_UNCHANGED)

# Install the wrapper on the patches module.
# NOTE(review): only code that looks up `patches.imread` after this point sees
# the wrapper; modules that already bound `imread` via a from-import would keep
# the original - confirm for the installed Ultralytics version.
patches.imread = imread_4ch
print("Patched imread for 4-channel support")

# Imported after the patch is in place (this ordering is deliberate).
from ultralytics import YOLO
print("YOLO imported")

Creating new Ultralytics Settings v0.0.6 file ✅
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Patched imread for 4-channel support
YOLO imported


In [6]:
# =============================================================================
# Cell 6: Custom Trainer & Validator with 4-Channel Support
# =============================================================================
from ultralytics.models.yolo.detect import DetectionTrainer, DetectionValidator

def convert_conv_to_4ch(conv_layer):
    """Return a 4-input-channel version of a 3-input-channel ``nn.Conv2d``.

    The first three input channels keep the original weights; the fourth
    (depth) channel is initialised with the mean of those three. The new layer
    mirrors the source layer's stride, padding, dilation, padding mode, bias,
    device and dtype, so it can replace the original in place.

    Args:
        conv_layer: Source convolution. A layer that already has 4 input
            channels is returned unchanged.

    Returns:
        ``conv_layer`` itself if it is already 4-channel, otherwise a new
        ``nn.Conv2d`` with ``in_channels=4``.

    Raises:
        ValueError: If the layer is not a plain (``groups == 1``) 3-channel
            convolution; copying its weights would silently broadcast into
            the wrong shape.
    """
    if conv_layer.in_channels == 4:
        return conv_layer

    if conv_layer.in_channels != 3 or conv_layer.groups != 1:
        raise ValueError(
            "convert_conv_to_4ch expects a 3-channel conv with groups=1, got "
            f"in_channels={conv_layer.in_channels}, groups={conv_layer.groups}"
        )

    src_weight = conv_layer.weight
    new_conv = nn.Conv2d(
        in_channels=4,
        out_channels=conv_layer.out_channels,
        kernel_size=conv_layer.kernel_size,
        stride=conv_layer.stride,
        padding=conv_layer.padding,
        dilation=conv_layer.dilation,
        padding_mode=conv_layer.padding_mode,
        bias=conv_layer.bias is not None,
        # Build the layer where the source lives so it can be swapped into a
        # model that is already on the GPU and/or in half precision.
        device=src_weight.device,
        dtype=src_weight.dtype,
    )

    with torch.no_grad():
        # Copy RGB weights
        new_conv.weight[:, :3, :, :] = src_weight.clone()
        # Initialize depth channel as mean of RGB
        new_conv.weight[:, 3:4, :, :] = src_weight.mean(dim=1, keepdim=True)
        if conv_layer.bias is not None:
            new_conv.bias.copy_(conv_layer.bias)

    return new_conv


def ensure_model_4ch(model):
    """Swap a model's 3-channel first conv for a 4-channel one, in place.

    Two layouts are recognised: a wrapper whose layers sit at
    ``model.model.model`` (reported as "AutoBackend model") and a model whose
    layers sit directly at ``model.model`` (reported as "DetectionModel").

    Args:
        model: The model (or wrapper) to patch.

    Returns:
        True if the first conv was converted; False if it was not 3-channel,
        the layout was not recognised, or an error occurred (the error is
        printed as a warning instead of being raised).
    """
    try:
        # For AutoBackend wrapped models
        if hasattr(model, 'model') and hasattr(model.model, 'model'):
            layers, label = model.model.model, "AutoBackend model"
        # For direct DetectionModel
        elif hasattr(model, 'model') and hasattr(model.model[0], 'conv'):
            layers, label = model.model, "DetectionModel"
        else:
            return False

        first_conv = layers[0].conv
        if first_conv.in_channels != 3:
            return False

        print(f"[4ch] Converting {label}...")
        reference = first_conv.weight
        # The model may already sit on the GPU and/or in half precision, so the
        # replacement layer is moved to the same device and dtype as the
        # weights it replaces.
        layers[0].conv = convert_conv_to_4ch(first_conv).to(
            device=reference.device, dtype=reference.dtype
        )
        return True
    except Exception as e:
        # Best effort by design: report the problem and let the caller continue.
        print(f"[4ch] Warning: {e}")
    return False


class RGBD4ChValidator(DetectionValidator):
    """Detection validator that upgrades its loaded model to 4-channel input.

    NOTE(review): this only takes effect if the base validator defines and
    calls a ``setup_model`` hook - confirm against the installed Ultralytics
    version.
    """

    def setup_model(self):
        """Run the parent's model setup, then make sure the first conv is 4-channel."""
        super().setup_model()
        if self.model is None:
            return
        ensure_model_4ch(self.model)
        print("[Validator] Model 4ch ready")


class RGBD4ChTrainer(DetectionTrainer):
    """Detection trainer whose model takes 4-channel (RGB + depth) input.

    After the stock model setup, the first convolution is replaced by a
    4-channel version unless it already has 4 input channels, and validation
    is routed through ``RGBD4ChValidator``.
    """

    def setup_model(self):
        """Set up the model via the parent, then convert its first conv to 4 channels.

        Returns:
            Whatever the parent ``setup_model`` returns, passed through
            unchanged so the caller still receives it (presumably the optional
            resume checkpoint - confirm against the Ultralytics trainer).
        """
        ckpt = super().setup_model()

        first_conv = self.model.model[0].conv

        if first_conv.in_channels == 4:
            # NOTE(review): if the model was built 4-channel from the dataset
            # YAML, the pretrained RGB weights of this layer may not have been
            # carried over - verify how the first conv was initialised.
            print("[Trainer] Model already 4-channel")
            return ckpt

        print("[Trainer] Converting to 4-channel...")
        self.model.model[0].conv = convert_conv_to_4ch(first_conv)
        print(f"[Trainer] Converted! Shape: {self.model.model[0].conv.weight.shape}")
        return ckpt

    def get_validator(self):
        """Return the 4-channel-aware validator used during training."""
        from copy import copy

        self.loss_names = "box_loss", "cls_loss", "dfl_loss"
        return RGBD4ChValidator(
            self.test_loader,
            save_dir=self.save_dir,
            # Hand the validator its own copy of the args so it cannot mutate
            # the trainer's configuration.
            args=copy(self.args),
            _callbacks=self.callbacks,
        )

print("Custom Trainer & Validator defined")

Custom Trainer & Validator defined


In [7]:
# =============================================================================
# Cell 7: Training Configuration
# =============================================================================
# Run naming and the seeds to repeat the experiment over
EXP_PREFIX = "exp_a4b_rgbd_synth"
SEEDS = [42, 123, 456, 789, 101]

# Training hyper-parameters shared by every seed
EPOCHS = 100
IMGSZ = 640
BATCH_SIZE = 16

# First CUDA device when one is available, CPU otherwise
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 0

print("Experiment: A.4b RGB+Synthetic Depth (4-Channel) - FIXED")
for label, value in (
    ("Seeds", SEEDS),
    ("Epochs", EPOCHS),
    ("Batch", BATCH_SIZE),
    ("Device", DEVICE),
):
    print(f"{label}: {value}")

Experiment: A.4b RGB+Synthetic Depth (4-Channel) - FIXED
Seeds: [42, 123, 456, 789, 101]
Epochs: 100
Batch: 16
Device: 0


In [8]:
# =============================================================================
# Cell 8: Training Loop with Custom Trainer
# =============================================================================
# Train one model per seed with the 4-channel trainer and remember where each
# run's best checkpoint is expected to be written.
results_all = {}

# enumerate() supplies the run ordinal directly; SEEDS.index(seed) would be a
# linear lookup and would report the wrong position for a repeated seed.
for run_idx, seed in enumerate(SEEDS, start=1):
    print(f"\n{'='*60}")
    print(f"TRAINING A.4b RGBD SYNTHETIC - Seed {seed} ({run_idx}/{len(SEEDS)})")
    print(f"{'='*60}\n")

    # Single source of truth for the run name, used for both the trainer and
    # the checkpoint path recorded below.
    run_name = f"{EXP_PREFIX}_seed{seed}"

    # Use custom trainer for 4-channel support
    trainer = RGBD4ChTrainer(overrides={
        'model': 'yolo11n.pt',
        'data': str(config_path),
        'imgsz': IMGSZ,
        'epochs': EPOCHS,
        'batch': BATCH_SIZE,
        'device': DEVICE,
        'seed': seed,
        'name': run_name,
        'project': str(RUNS_PATH),
        'exist_ok': True,
        'pretrained': True,
        'patience': 30,
        'val': True,
        # Disable HSV augmentation (not applicable for depth)
        'hsv_h': 0.0,
        'hsv_s': 0.0,
        'hsv_v': 0.0,
        'degrees': 0.0,
        'translate': 0.1,
        'scale': 0.5,
        'fliplr': 0.5,
        'mosaic': 1.0,
        'mixup': 0.0,
        'copy_paste': 0.0,
        'erasing': 0.0,
    })

    trainer.train()

    results_all[seed] = {
        'model_path': str(RUNS_PATH / run_name / "weights" / "best.pt"),
    }

    print(f"\nSeed {seed} completed!")

    # Drop the trainer and release cached GPU memory before the next seed.
    del trainer
    torch.cuda.empty_cache()

print("\n" + "="*60)
print("ALL TRAINING COMPLETED!")
print("="*60)


TRAINING A.4b RGBD SYNTHETIC - Seed 42 (1/5)

Ultralytics 8.4.8 🚀 Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/kaggle/working/rgbd_4ch/dataset_rgbd_synthetic.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, end2end=None, epochs=100, erasing=0.0, exist_ok=True, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.0, hsv_s=0.0, hsv_v=0.0, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11n.pt, momentum=0.937, mosaic=1.0, multi_scale=0.0, name=exp_a4b_rgbd_synth_seed42, nbs=64, 

In [9]:
# =============================================================================
# Cell 9: Evaluation on Test Set
# =============================================================================
# Evaluate each seed's best checkpoint on the test split and collect the
# headline box metrics per seed.
results_dict = {}

banner = "=" * 60
print("\n" + banner)
print("EVALUATION ON TEST SET")
print(banner)

for seed in SEEDS:
    model_path = RUNS_PATH / f"{EXP_PREFIX}_seed{seed}" / "weights" / "best.pt"

    # Seeds whose training produced no checkpoint are reported and skipped.
    if not model_path.exists():
        print(f"Model not found: {model_path}")
        continue

    print(f"\nSeed {seed}:")

    model = YOLO(str(model_path))

    # Convert to 4ch if needed
    stem = model.model.model[0]
    if stem.conv.in_channels == 3:
        print("  Converting to 4ch...")
        stem.conv = convert_conv_to_4ch(stem.conv)

    # Verify 4-channel
    print(f"  Model input channels: {stem.conv.in_channels}")

    metrics = model.val(
        data=str(config_path),
        split="test",
        device=DEVICE,
        name=f"test_{EXP_PREFIX}_seed{seed}",
        exist_ok=True,
    )

    box = metrics.box
    results_dict[seed] = {
        'mAP50': box.map50,
        'mAP50-95': box.map,
        'Precision': box.mp,
        'Recall': box.mr
    }

    print(f"  mAP50: {box.map50:.3f}, mAP50-95: {box.map:.3f}")

    # Free the model and cached GPU memory before loading the next checkpoint.
    del model
    torch.cuda.empty_cache()


EVALUATION ON TEST SET

Seed 42:
  Model input channels: 4
Ultralytics 8.4.8 🚀 Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
YOLO11n summary (fused): 101 layers, 2,582,491 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access ‚úÖ (ping: 0.0¬±0.0 ms, read: 4620.9¬±1142.0 MB/s, size: 2286.6 KB)
[K[34m[1mval: [0mScanning /kaggle/working/rgbd_4ch/labels/test... 40 images, 0 backgrounds, 0 corrupt: 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 40/40 76.0it/s 0.5s
[34m[1mval: [0mNew cache created: /kaggle/working/rgbd_4ch/labels/test.cache
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 3/3 1.6it/s 1.8s
                   all         40        105      0.826      0.768       0.83      0.359
Speed: 4.4ms preprocess, 12.1ms inference, 0.0ms loss, 2.6ms postprocess per image
Results saved to [1m/kaggle/working/runs/detect/test_exp_a4b_rgbd_synth_seed42[0m
  mAP50: 0.8

In [10]:
# =============================================================================
# Cell 10: Results Summary
# =============================================================================
# One row per seed, one column per metric, followed by mean and standard
# deviation across seeds. Skipped entirely when no seed was evaluated.
if results_dict:
    df = pd.DataFrame(results_dict).T
    df.index.name = 'Seed'
    avg, std = df.mean(), df.std()

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    print("\n" + heavy_rule)
    print("A.4b RGB+SYNTHETIC DEPTH (4-CH) - FINAL RESULTS")
    print(heavy_rule + "\n")
    print(df.to_string(float_format="{:.3f}".format))

    print("\n" + light_rule)
    print("SUMMARY (Mean +/- Std)")
    print(light_rule)
    for col, mean_val in avg.items():
        print(f"  {col}: {mean_val:.3f} +/- {std[col]:.3f}")


A.4b RGB+SYNTHETIC DEPTH (4-CH) - FINAL RESULTS

      mAP50  mAP50-95  Precision  Recall
Seed                                    
42    0.830     0.359      0.826   0.768
123   0.836     0.375      0.777   0.762
456   0.801     0.371      0.806   0.714
789   0.779     0.337      0.758   0.686
101   0.820     0.362      0.758   0.714

------------------------------------------------------------
SUMMARY (Mean +/- Std)
------------------------------------------------------------
  mAP50: 0.813 +/- 0.023
  mAP50-95: 0.361 +/- 0.015
  Precision: 0.785 +/- 0.030
  Recall: 0.729 +/- 0.035


In [11]:
# =============================================================================
# Cell 11: Save Results
# =============================================================================
# Write the per-seed table and the mean/std summary to a text report.
output_file = KAGGLE_OUTPUT / 'a4b_rgbd_synthetic_results.txt'

# `df`, `avg` and `std` are only created by the summary cell when at least one
# seed was evaluated; mirror that guard here instead of failing with NameError.
if results_dict:
    with open(output_file, 'w') as f:
        f.write("="*60 + "\n")
        f.write("A.4b RGB+Synthetic Depth (4-Channel) Results - FIXED\n")
        f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        # Epoch count comes from the config cell so the report cannot drift
        # from the value actually used for training.
        # NOTE(review): the training cell also overrides the hsv_* and erasing
        # augmentation settings - confirm that "other=default" is still accurate.
        f.write(f"Training: epochs={EPOCHS}, patience=30, other=default\n")
        f.write(f"Seeds: {SEEDS}\n")
        f.write("4-Channel Support: YES (Custom Trainer)\n")
        f.write("="*60 + "\n\n")
        f.write("Per-Seed Results:\n")
        f.write(df.to_string(float_format=lambda x: f"{x:.3f}"))
        f.write("\n\n" + "-"*60 + "\n")
        f.write("Summary (Mean +/- Std):\n")
        for col in df.columns:
            f.write(f"  {col}: {avg[col]:.3f} +/- {std[col]:.3f}\n")

    print(f"Results saved: {output_file}")
else:
    print("No evaluation results available - nothing saved")

Results saved: /kaggle/working/kaggleoutput/a4b_rgbd_synthetic_results.txt


In [12]:
# =============================================================================
# Cell 12: Create Archives
# =============================================================================
# Zip the training runs (when present) and the results folder into BASE_PATH.
if RUNS_PATH.exists():
    runs_zip = shutil.make_archive(str(BASE_PATH / 'a4b_runs'), 'zip', RUNS_PATH)
    size_mb = os.path.getsize(runs_zip) / 1024 / 1024
    print(f"a4b_runs.zip: {size_mb:.1f} MB")

shutil.make_archive(str(BASE_PATH / 'a4b_output'), 'zip', KAGGLE_OUTPUT)
print("a4b_output.zip created")

print("\n".join([
    "\nDownload from Output tab:",
    "  - a4b_runs.zip (training runs)",
    "  - a4b_output.zip (results)",
]))

a4b_runs.zip: 105.9 MB
a4b_output.zip created

Download from Output tab:
  - a4b_runs.zip (training runs)
  - a4b_output.zip (results)
