In [1]:
# Install Ultralytics
!pip -q install ultralytics

import os
import shutil
from pathlib import Path

import cv2
import numpy as np
import torch

from ultralytics import YOLO

try:
    from ultralytics.models.yolo.detect import DetectionTrainer, DetectionValidator
except Exception:
    from ultralytics.engine.trainer import DetectionTrainer
    from ultralytics.engine.validator import DetectionValidator

from ultralytics.data.dataset import YOLODataset

# Presumably switches off Weights & Biases logging during training — TODO confirm
# the installed wandb / Ultralytics versions still honour this variable.
os.environ["WANDB_DISABLED"] = "true"


[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.2/1.2 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCreating new Ultralytics Settings v0.0.6 file ‚úÖ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [15]:
# Paths (edit to match your Kaggle dataset name)
from pathlib import Path
import shutil

# Preferred source: a zipped copy of the dataset. If this file is absent, a later
# step looks for a single *.zip next to DATASET_DIR instead.
DATASET_ZIP = Path("/kaggle/input/ffb-localization-rgbd/ffb_localization_rgbd.zip")
# Fallback source: an already-extracted dataset folder.
# NOTE(review): this uses a different dataset slug than DATASET_ZIP — confirm both are intended.
DATASET_DIR = Path("/kaggle/input/ffb-localization-rgbd-dataset/ffb_localization_rgbd")

# Writable location the dataset is unpacked/copied to; DATASET_DIR is re-pointed here afterwards.
WORK_DIR = Path("/kaggle/working/ffb_localization_rgbd")
SYNC_TO_WORKDIR = True  # copy to /kaggle/working so later cells may delete/add files


def find_dataset_root(base: Path) -> Path:
    """Locate the folder that holds the ``rgb/``, ``depth/`` and ``labels/`` directories.

    Search order:
      1. ``base`` itself,
      2. ``base / "ffb_localization_rgbd"``,
      3. the immediate sub-directories of ``base``, in sorted (name) order so the
         result does not depend on filesystem enumeration order.

    Args:
        base: Directory to start searching from.

    Returns:
        The first candidate that contains all three required directories.

    Raises:
        FileNotFoundError: If no candidate qualifies, including when ``base`` does
            not exist or is not a directory.
    """
    required = ("rgb", "depth", "labels")

    def _is_root(candidate: Path) -> bool:
        # All three entries must be real directories, not just existing paths.
        return all((candidate / sub).is_dir() for sub in required)

    for candidate in (base, base / "ffb_localization_rgbd"):
        if _is_root(candidate):
            return candidate

    # is_dir() (rather than exists()) keeps iterdir() from raising
    # NotADirectoryError when ``base`` is a regular file.
    if base.is_dir():
        for child in sorted(base.iterdir()):
            if child.is_dir() and _is_root(child):
                return child

    raise FileNotFoundError(f"RGBD dataset root not found under: {base}")


def find_zip(base: Path) -> Path | None:
    if not base.exists():
        return None
    zips = list(base.glob("*.zip"))
    if len(zips) == 1:
        return zips[0]
    return None


# Stage the dataset into the writable WORK_DIR and resolve the final DATASET_DIR.
# Statement order matters: DATASET_ZIP / DATASET_DIR are rebound as the steps progress.

# Step 1: if the configured zip is missing, fall back to a single *.zip found in
# DATASET_DIR or in its parent folder.
if not DATASET_ZIP.exists():
    auto_zip = find_zip(DATASET_DIR) or find_zip(DATASET_DIR.parent)
    if auto_zip:
        DATASET_ZIP = auto_zip
        print("Auto ZIP:", DATASET_ZIP)

# Step 2: materialise the data under WORK_DIR. The zip takes priority; otherwise an
# extracted folder under /kaggle/input is copied when SYNC_TO_WORKDIR is set.
# An existing WORK_DIR is reused as-is (no re-extraction / re-copy), so a partially
# populated WORK_DIR from an interrupted run is NOT repaired here.
if DATASET_ZIP.exists():
    if not WORK_DIR.exists():
        shutil.unpack_archive(str(DATASET_ZIP), str(WORK_DIR))
    DATASET_DIR = WORK_DIR
elif SYNC_TO_WORKDIR and DATASET_DIR.exists() and DATASET_DIR.as_posix().startswith("/kaggle/input"):
    if not WORK_DIR.exists():
        shutil.copytree(DATASET_DIR, WORK_DIR)
    DATASET_DIR = WORK_DIR

# Step 3: the archive may contain a wrapper folder, so descend to the directory
# that actually holds rgb/, depth/ and labels/ (raises FileNotFoundError if none does).
DATASET_DIR = find_dataset_root(DATASET_DIR)

print("Dataset dir:", DATASET_DIR)
# Final sanity check of the expected layout.
# NOTE: assert statements are skipped when Python runs with -O.
for sub in ("rgb", "depth", "labels"):
    assert (DATASET_DIR / sub).exists(), f"Missing {sub} folder in {DATASET_DIR}"


Dataset dir: /kaggle/working/ffb_localization_rgbd


In [16]:
%%writefile /kaggle/working/ffb_localization_rgbd_train.yaml
# Use /kaggle/working (writable) to allow cache files
# IMPORTANT: use standard 'images/' + 'labels/' structure so Ultralytics can discover labels.
path: /kaggle/working/ffb_localization_rgbd
train: images/train
val: images/val
test: images/test
nc: 1
names: ['fresh_fruit_bunch']
channels: 4


Overwriting /kaggle/working/ffb_localization_rgbd_train.yaml


In [17]:
from pathlib import Path

# Fail early with a readable message when this cell is run on a fresh kernel
# before the cell that defines DATASET_DIR.
if "DATASET_DIR" not in globals():
    raise RuntimeError("Run the Paths cell first.")


def sync_rgbd_dataset(root: Path) -> None:
    """Delete every sample that is not present as a full (rgb, depth, label) triple.

    For each of the splits ``train``, ``val`` and ``test`` the function compares

    * ``root/rgb/<split>/*.png``
    * ``root/depth/<split>/*.png``
    * ``root/labels/<split>/*.txt``

    by base name (file name without the extension) and removes all files whose
    base name is missing from at least one of the three folders. A one-line
    summary is printed per split.

    WARNING: this is destructive and works in place. A split folder that is
    missing or empty yields an empty "keep" set, so the matching files of that
    split in the other two folders are deleted.

    Args:
        root: Dataset root containing ``rgb/``, ``depth/`` and ``labels/``.
    """

    def _stems(directory: Path, ext: str) -> set[str]:
        # Strip exactly the trailing extension. Unlike str.replace(".png", ".txt"),
        # this stays correct for names that contain the extension text elsewhere,
        # e.g. "shot.png.v2.txt".
        return {p.name[: -len(ext)] for p in directory.glob("*" + ext)}

    for split in ("train", "val", "test"):
        rgb_dir = root / "rgb" / split
        depth_dir = root / "depth" / split
        label_dir = root / "labels" / split

        rgb_stems = _stems(rgb_dir, ".png")
        depth_stems = _stems(depth_dir, ".png")
        label_stems = _stems(label_dir, ".txt")

        # Only samples available in all three modalities survive.
        keep = rgb_stems & depth_stems & label_stems
        drop_rgb = rgb_stems - keep
        drop_depth = depth_stems - keep
        drop_labels = label_stems - keep

        for stem in drop_rgb:
            (rgb_dir / f"{stem}.png").unlink(missing_ok=True)
        for stem in drop_depth:
            (depth_dir / f"{stem}.png").unlink(missing_ok=True)
        for stem in drop_labels:
            (label_dir / f"{stem}.txt").unlink(missing_ok=True)

        print(
            f"{split}: keep={len(keep)} drop_rgb={len(drop_rgb)} "
            f"drop_depth={len(drop_depth)} drop_labels={len(drop_labels)}"
        )


sync_rgbd_dataset(DATASET_DIR)


train: keep=280 drop_rgb=0 drop_depth=0 drop_labels=0
val: keep=80 drop_rgb=0 drop_depth=0 drop_labels=0
test: keep=40 drop_rgb=0 drop_depth=0 drop_labels=0


In [None]:
# Build standard Ultralytics structure + sanity checks
# - Ultralytics expects train/val/test under 'images/' and labels under 'labels/' (same split)
# - We keep the existing 'rgb/' + 'depth/' folders, but expose RGB also via 'images/'

from pathlib import Path
import shutil

root = Path("/kaggle/working/ffb_localization_rgbd")
if "DATASET_DIR" in globals():
    root = Path(str(DATASET_DIR))

# 1) Mirror /rgb/{split} into /images/{split}
for split in ("train", "val", "test"):
    src = root / "rgb" / split
    dst = root / "images" / split
    dst.mkdir(parents=True, exist_ok=True)

    src_files = sorted(src.glob("*.png"))
    for p in src_files:
        out = dst / p.name
        # Existing copies are kept as-is (not refreshed when the source file changes).
        if not out.exists():
            shutil.copy2(p, out)

    # Prune copies whose source no longer exists in rgb/ (e.g. removed by the sync
    # step after an earlier run of this cell). Otherwise the trainer would index
    # images that have no matching depth map or label.
    # Only done when rgb/{split} exists, so a dataset that ships 'images/' without
    # 'rgb/' is never emptied by accident.
    if src.is_dir():
        src_names = {p.name for p in src_files}
        for stale in dst.glob("*.png"):
            if stale.name not in src_names:
                stale.unlink()

    print(f"{split}: images={len(list(dst.glob('*.png')))} (from rgb={len(src_files)})")

# 2) Verify labels are non-empty (otherwise everything becomes background)
sample = None
for split in ("train", "val", "test"):
    label_dir = root / "labels" / split
    lbls = sorted(label_dir.glob("*.txt"))
    nonempty = [p for p in lbls if p.read_text(encoding="utf-8").strip()]
    print(f"{split}: labels={len(lbls)} nonempty={len(nonempty)}")
    if not nonempty:
        raise RuntimeError(
            f"No non-empty labels in {label_dir}. "
            "Fix dataset: YOLO .txt must contain lines: <cls> <x> <y> <w> <h>."
        )
    # Remember one label per split; the value left after the loop is from the last split.
    sample = nonempty[0]

# Show the first lines of one label file as a quick format check.
print("Sample label:", sample.name)
print("\n".join(sample.read_text(encoding="utf-8").splitlines()[:3]))


In [18]:
# Remove old caches (important after changing image/label mapping)
from pathlib import Path

root = Path("/kaggle/working/ffb_localization_rgbd")
if "DATASET_DIR" in globals():
    root = Path(str(DATASET_DIR))

# Delete every *.cache file below the dataset root. Failures are reported instead
# of being silently ignored, so a stale cache cannot go unnoticed.
_cache_failures = 0
for p in root.rglob("*.cache"):
    try:
        p.unlink(missing_ok=True)
    except OSError as exc:
        _cache_failures += 1
        print(f"Could not remove cache file {p}: {exc}")

if _cache_failures:
    print(f"Cache NOT fully cleared: {_cache_failures} file(s) could not be removed.")
else:
    print("Cache cleared.")


Cache cleared.


In [21]:
def normalize_depth_to_uint8(depth: np.ndarray) -> np.ndarray:
    """Convert a depth map to ``uint8``.

    Arrays that already have dtype ``uint8`` are returned untouched (same object,
    no rescaling). Every other dtype is cast to ``float32``, min-max scaled to the
    range 0..255 with ``cv2.normalize`` and then cast to ``uint8``. The scaling
    uses the minimum and maximum of the given array, i.e. it is per image.

    Args:
        depth: Depth map as a NumPy array.

    Returns:
        A ``uint8`` array with the same shape as ``depth``.
    """
    if depth.dtype != np.uint8:
        scaled = cv2.normalize(depth.astype(np.float32), None, 0, 255, cv2.NORM_MINMAX)
        return scaled.astype(np.uint8)
    return depth


class RGBDDataset(YOLODataset):
    """YOLO detection dataset that yields 4-channel images (3 colour channels + depth).

    The colour image is read from ``self.im_files``; the matching depth map is read
    from the same path with the ``images`` (or legacy ``rgb``) folder replaced by
    ``depth``. Depth is converted to ``uint8`` and appended as the 4th channel.
    """

    def __init__(self, *args, **kwargs):
        """Build the base dataset, then pre-fill ``self.buffer`` if it is empty."""
        super().__init__(*args, **kwargs)

        # Ultralytics Mosaic may sample from dataset.buffer; in some setups this buffer starts empty
        # and causes: IndexError in random.choices(list(self.dataset.buffer), ...)
        try:
            from collections import deque

            buf = getattr(self, "buffer", None)
            if buf is None or len(buf) == 0:
                n = len(getattr(self, "im_files", []))
                seed = list(range(min(n, 256)))  # small warm buffer is enough
                self.buffer = deque(seed, maxlen=1000)
        except Exception as exc:
            # Best effort: the dataset stays usable without the warm buffer, but the
            # failure is reported instead of being silently swallowed.
            import warnings

            warnings.warn(
                f"RGBDDataset: could not pre-fill the mosaic buffer ({exc!r}); continuing without it.",
                RuntimeWarning,
                stacklevel=2,
            )

    @staticmethod
    def img2label_paths(img_paths):
        """Map image paths to label paths (``images``/``rgb`` folder -> ``labels``, extension -> ``.txt``).

        NOTE(review): confirm that the installed Ultralytics version actually calls
        this static method; if label discovery goes through a module-level helper
        instead, this override is inert and only the ``images/`` layout is effective.
        """
        # Keep robust: support both /images/ (standard) and /rgb/ (legacy)
        label_paths = []
        for p in img_paths:
            p = str(p)
            if (os.sep + "images" + os.sep) in p:
                p = p.replace(os.sep + "images" + os.sep, os.sep + "labels" + os.sep)
            elif (os.sep + "rgb" + os.sep) in p:
                p = p.replace(os.sep + "rgb" + os.sep, os.sep + "labels" + os.sep)
            label_paths.append(os.path.splitext(p)[0] + ".txt")
        return label_paths

    def load_image(self, i):
        """Load sample ``i`` from disk as an ``(H, W, 4)`` ``uint8`` array.

        The image is always read from disk on each call. The depth map is resized
        (nearest neighbour) to the colour image size when the two differ, then the
        stacked image is scaled so that its longer side equals ``self.imgsz``.

        Args:
            i: Index into ``self.im_files``.

        Returns:
            Tuple ``(img, (h0, w0), (h, w))``: the 4-channel image, the original
            colour image size and the size after resizing.

        Raises:
            FileNotFoundError: If the colour image or the depth map cannot be read,
                or if no depth path can be derived from the image path.
        """
        f = self.im_files[i]
        rgb = cv2.imread(f)  # 3-channel colour image (OpenCV channel order)
        if rgb is None:
            raise FileNotFoundError(f"RGB not found: {f}")
        h0, w0 = rgb.shape[:2]

        # Depth path mapping: /images/ -> /depth/ (preferred), fallback /rgb/ -> /depth/
        images_marker = os.sep + "images" + os.sep
        rgb_marker = os.sep + "rgb" + os.sep
        depth_marker = os.sep + "depth" + os.sep
        if images_marker in f:
            depth_path = f.replace(images_marker, depth_marker)
        elif rgb_marker in f:
            depth_path = f.replace(rgb_marker, depth_marker)
        else:
            # Without this guard depth_path would equal f and the colour image
            # itself would be silently loaded as the "depth" channel.
            raise FileNotFoundError(
                f"Cannot derive depth path (no 'images' or 'rgb' folder in path): {f}"
            )

        depth = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED)  # keep the stored bit depth
        if depth is None:
            raise FileNotFoundError(f"Depth not found: {depth_path}")
        if depth.ndim == 3:
            # Multi-channel depth file: only the first channel is used.
            depth = depth[:, :, 0]
        if depth.shape[:2] != (h0, w0):
            # Nearest neighbour avoids inventing depth values between surfaces.
            depth = cv2.resize(depth, (w0, h0), interpolation=cv2.INTER_NEAREST)

        depth = normalize_depth_to_uint8(depth)[:, :, None]
        img = np.concatenate([rgb, depth], axis=2)

        # Scale so that the longer side equals imgsz (aspect ratio preserved)
        r = self.imgsz / max(h0, w0)
        if r != 1:
            interp = cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA
            img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)

        return img, (h0, w0), img.shape[:2]


In [23]:
class RGBDTrainer(DetectionTrainer):
    """Detection trainer whose datasets are ``RGBDDataset`` instances."""

    def build_dataset(self, img_path, mode="train", batch=None):
        """Create the ``RGBDDataset`` for ``img_path``.

        Args:
            img_path: Folder (or list source) with the images of the split.
            mode: ``"train"`` enables augmentation; ``"val"`` enables rectangular batches.
            batch: Batch size forwarded to the dataset.

        Returns:
            The constructed ``RGBDDataset``.
        """
        # The trainer already owns the model here, so its stride can be queried directly.
        model_stride = self.model.stride
        if hasattr(model_stride, "max"):
            # Multi-value stride (e.g. a tensor): use the largest one.
            model_stride = model_stride.max()

        dataset_kwargs = {
            "data": self.data,
            "task": self.args.task,
            "img_path": img_path,
            "imgsz": self.args.imgsz,
            "batch_size": batch,
            "augment": mode == "train",
            "hyp": self.args,
            "rect": mode == "val",
            "cache": self.args.cache,
            "single_cls": False,
            "stride": int(model_stride),
            "pad": 0.0,
            "prefix": f"{mode}: ",
        }
        return RGBDDataset(**dataset_kwargs)


class RGBDValidator(DetectionValidator):
    """Detection validator whose dataset is an ``RGBDDataset`` (no augmentation, rectangular batches)."""

    def build_dataset(self, img_path, mode="val", batch=None):
        """Create the evaluation ``RGBDDataset`` for ``img_path``.

        Args:
            img_path: Folder (or list source) with the images of the split.
            mode: Only used for the log prefix.
            batch: Batch size forwarded to the dataset.

        Returns:
            The constructed ``RGBDDataset``.
        """
        # IMPORTANT: the validator does NOT have self.model at build_dataset time.
        # Read the stride from the validator itself and fall back to 32 when it is not set.
        val_stride = getattr(self, "stride", 32)
        if hasattr(val_stride, "max"):
            # Multi-value stride (e.g. a tensor): use the largest one.
            val_stride = val_stride.max()

        dataset_kwargs = {
            "data": self.data,
            "task": self.args.task,
            "img_path": img_path,
            "imgsz": self.args.imgsz,
            "batch_size": batch,
            "augment": False,
            "hyp": self.args,
            "rect": True,
            "cache": self.args.cache,
            "single_cls": False,
            "stride": int(val_stride),
            "pad": 0.0,
            "prefix": f"{mode}: ",
        }
        return RGBDDataset(**dataset_kwargs)


In [24]:
def adapt_first_conv_to_4ch(det_model):
    """Replace the first convolution of ``det_model`` by a 4-input-channel copy, in place.

    ``det_model.model[0]`` is either a wrapper exposing the convolution as ``.conv``
    or the convolution itself. The new layer keeps the original output channels,
    kernel size, stride, padding, dilation, padding mode, bias setting, dtype and
    device. Its weights are initialised as follows:

    * the first ``in_channels`` input channels receive the original weights,
    * the remaining input channel(s) receive the mean of the original weights
      over the input-channel axis,
    * the bias (if any) is copied.

    Nothing happens when the convolution already has 4 input channels.

    Assumes an ungrouped convolution (``groups == 1``) with at most 4 input channels.

    Args:
        det_model: Object with an indexable ``model`` attribute whose element 0 is
            (or wraps) a ``torch.nn.Conv2d``.

    Returns:
        None. ``det_model`` is modified in place.
    """
    first = det_model.model[0]
    wraps_conv = hasattr(first, "conv")
    conv = first.conv if wraps_conv else first

    if conv.in_channels == 4:
        return

    new_conv = torch.nn.Conv2d(
        in_channels=4,
        out_channels=conv.out_channels,
        kernel_size=conv.kernel_size,
        stride=conv.stride,
        padding=conv.padding,
        dilation=conv.dilation,
        bias=(conv.bias is not None),
        padding_mode=conv.padding_mode,
        # Create the layer where the original lives so the copies below need no
        # conversion and the model keeps a single dtype/device.
        device=conv.weight.device,
        dtype=conv.weight.dtype,
    )

    with torch.no_grad():
        new_conv.weight[:, : conv.in_channels] = conv.weight
        # init the extra channel(s) as mean of the original input-channel weights
        new_conv.weight[:, conv.in_channels :] = conv.weight.mean(dim=1, keepdim=True)
        if conv.bias is not None:
            new_conv.bias[:] = conv.bias

    if wraps_conv:
        first.conv = new_conv
    else:
        det_model.model[0] = new_conv


In [25]:
# Dataset description consumed by Ultralytics (written by the %%writefile cell).
DATA = "/kaggle/working/ffb_localization_rgbd_train.yaml"

# Folder used for the image-count sanity check below.
# The 'images/' tree is built under DATASET_DIR (the writable working copy), so that
# is the folder to inspect. The read-only /kaggle/input path is only a fallback for
# the case that DATASET_DIR is not defined; checking it first would report 0 images
# whenever the input folder exists but has no 'images/' sub-folder.
# NOTE(review): DATA hard-codes /kaggle/working/ffb_localization_rgbd as dataset
# path — confirm it matches DATASET_DIR when the archive contains a wrapper folder.
if "DATASET_DIR" in globals():
    DATASET_ROOT = Path(str(DATASET_DIR))
else:
    DATASET_ROOT = Path("/kaggle/input/ffb-localization-rgbd-dataset/ffb_localization_rgbd")

# after the build step, Ultralytics reads from /images/*
train_dir = DATASET_ROOT / "images" / "train"
val_dir = DATASET_ROOT / "images" / "val"
test_dir = DATASET_ROOT / "images" / "test"

print("Train images:", len(list(train_dir.glob("*.png"))))
print("Val images:", len(list(val_dir.glob("*.png"))))
print("Test images:", len(list(test_dir.glob("*.png"))))

# Training hyper-parameters shared by all seeds
IMGSZ = 640
EPOCHS = 100
BATCH = 16
DEVICE = "0"  # set "cpu" if no GPU

# Where the run folders are expected to be written (used when zipping results)
RUNS_DIR = Path("/kaggle/working/runs/detect")


def zip_dir(dir_path: Path, zip_path: Path) -> Path:
    """Pack the contents of ``dir_path`` into a zip archive.

    The parent folder of ``zip_path`` is created if needed. ``shutil.make_archive``
    appends ``.zip`` on its own, so a trailing ``.zip`` (any letter case) is removed
    from ``zip_path`` first; any other suffix is kept and ``.zip`` is added after it.

    Args:
        dir_path: Directory whose contents become the archive root.
        zip_path: Desired archive location.

    Returns:
        Path of the archive that was written.
    """
    zip_path.parent.mkdir(parents=True, exist_ok=True)
    has_zip_ext = zip_path.suffix.lower() == ".zip"
    archive_base = zip_path.with_suffix("") if has_zip_ext else zip_path
    return Path(shutil.make_archive(str(archive_base), "zip", root_dir=str(dir_path)))


def run_seed(seed: int):
    """Train and test one RGB-D model for the given random seed, then zip both run folders.

    Steps: load ``yolo11n.pt``, widen its first convolution to 4 input channels,
    train with ``RGBDTrainer`` (HSV augmentation switched off), evaluate on the
    ``test`` split with ``RGBDValidator`` and archive ``RUNS_DIR/<run name>`` for
    the training and the test run.

    Args:
        seed: Random seed passed to training; also part of the experiment name.

    Returns:
        Dict with the keys ``"metrics"`` (result of ``model.val``), ``"train_zip"``
        and ``"test_zip"`` (archive paths as strings).
    """
    exp_name = f"exp_a3_rgbd_seed{seed}"
    test_name = f"test_{exp_name}"

    # Model with a 4-channel stem; the stored config is updated to match.
    model = YOLO("yolo11n.pt")
    adapt_first_conv_to_4ch(model.model)
    model.model.yaml["ch"] = 4

    # Train
    train_kwargs = dict(
        data=DATA,
        imgsz=IMGSZ,
        epochs=EPOCHS,
        batch=BATCH,
        seed=seed,
        device=DEVICE,
        name=exp_name,
        exist_ok=True,
        hsv_h=0.0,
        hsv_s=0.0,
        hsv_v=0.0,
        trainer=RGBDTrainer,
    )
    model.train(**train_kwargs)

    # Test evaluation (split=test)
    metrics = model.val(
        data=DATA,
        split="test",
        device=DEVICE,
        name=test_name,
        exist_ok=True,
        validator=RGBDValidator,
    )

    # Zip outputs for easy download (train first, then test)
    archives = {}
    for tag, run_name in (("train", exp_name), ("test", test_name)):
        archives[tag] = zip_dir(RUNS_DIR / run_name, Path(f"/kaggle/working/{exp_name}_{tag}.zip"))

    for tag in ("train", "test"):
        print("Zipped:", archives[tag])

    return {
        "metrics": metrics,
        "train_zip": str(archives["train"]),
        "test_zip": str(archives["test"]),
    }


Train images: 280
Val images: 80
Test images: 40


In [26]:
# Full train + test cycle for two seeds, run one after the other (long-running).
# Each call returns a dict with the test metrics and the paths of the zipped run folders.
out_seed42 = run_seed(42)
out_seed123 = run_seed(123)

# Report per-seed results once both runs have finished.
print("Seed 42 metrics:", out_seed42["metrics"])
print("Seed 42 zips:", out_seed42["train_zip"], out_seed42["test_zip"])

print("Seed 123 metrics:", out_seed123["metrics"])
print("Seed 123 zips:", out_seed123["train_zip"], out_seed123["test_zip"])


[KDownloading https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo11n.pt to 'yolo11n.pt': 100% ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ 5.4MB 67.2MB/s 0.1s
Ultralytics 8.4.6 üöÄ Python-3.12.12 torch-2.8.0+cu126 CUDA:0 (Tesla T4, 15095MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, angle=1.0, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=/kaggle/working/ffb_localization_rgbd_train.yaml, degrees=0.0, deterministic=True, device=0, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.0, hsv_s=0.0, hsv_v=0.0, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train,

: 