In [1]:
import os
import cv2
import glob
import torch
import random
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm
from scipy.io import loadmat
from diffusers import MarigoldDepthPipeline
from transformers import AutoImageProcessor, AutoModelForDepthEstimation, ZoeDepthForDepthEstimation

2026-01-09 13:48:01.239952: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767966481.432502      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767966481.489800      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767966481.956455      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767966481.956496      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767966481.956499      55 computation_placer.cc:177] computation placer alr

In [12]:
IBIMS_PATH = "/kaggle/input/ibims-1/iBims-1"
NYU_PATH = "/kaggle/input/nyu-depth-v2/nyu_data/data/nyu2_test"

DEVICE = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
print(DEVICE)

cuda


In [18]:
# https://www.asg.ed.tum.de/lmf/ibims1/

class IBimsLoader:
    def __init__(self, root_dir=IBIMS_PATH):
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "rgb", "*.png")))
        self.depth_files = sorted(glob.glob(os.path.join(root_dir, "ibims1_core_mat", "*.mat")))
        
        if len(self.rgb_files) != len(self.depth_files):
            print("Hmm something is wrong with the dataset...")

    def __len__(self):
        return len(self.rgb_files)

    def get_item(self, idx):
        img_path = self.rgb_files[idx]
        img = cv2.imread(img_path)
        #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        depth_path = self.depth_files[idx]
        mat_data = loadmat(depth_path)

        gt_depth = mat_data['data']['depth'][0][0]

        mask = gt_depth > 0.001
        
        return img, gt_depth, mask,img_path

class NyuLoader:
    def __init__(self, root_dir=NYU_PATH, samples_size=100, seed=42):
        self.samples = []
        
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "*_colors.png"), recursive=True))
        print(f"Found {len(self.rgb_files)} RGB candidates. Matching with Depth...")

        for rgb_path in self.rgb_files:
            depth_path = rgb_path.replace("_colors.png", "_depth.png")
            if os.path.exists(depth_path):
                self.samples.append((rgb_path, depth_path))

        if samples_size is not None and len(self.samples) > samples_size:
            random.seed(seed)
            self.samples = random.sample(self.samples, samples_size)
    
    def __len__(self):
        return len(self.samples)

    def get_item(self, idx):
        img_path, depth_path = self.samples[idx]
        
        img = cv2.imread(img_path)
        depth_png = cv2.imread(depth_path, -1)
        if depth_png is None:
            raise ValueError(f"Failed to load depth: {depth_path}")

        gt_depth = depth_png.astype(np.float32) / 1000.0
        mask = gt_depth > 0.001

        return img, gt_depth, mask, img_path

In [5]:
#  https://huggingface.co/blog/Isayoften/monocular-depth-estimation-guide

def get_absrel(gt, pred):
    return np.mean(np.abs(gt-pred)/gt)

def get_delta(gt, pred, exponent=1):
    inlier = np.maximum((gt/pred), (pred/gt))
    return np.mean(inlier < 1.25**exponent)

def align_depth_least_square(gt_arr, pred_arr, valid_mask_arr):
    ori_shape = pred_arr.shape

    gt = gt_arr.squeeze()  # [H, W]
    pred = pred_arr.squeeze()
    valid_mask = valid_mask_arr.squeeze()
    gt_masked = gt[valid_mask].reshape((-1, 1))
    pred_masked = pred[valid_mask].reshape((-1, 1))

    # numpy solver
    _ones = np.ones_like(pred_masked)
    A = np.concatenate([pred_masked, _ones], axis=-1)
    X = np.linalg.lstsq(A, gt_masked, rcond=None)[0]
    scale, shift = X

    aligned_pred = pred_arr * scale + shift

    # restore dimensions
    aligned_pred = aligned_pred.reshape(ori_shape)

    return aligned_pred, scale, shift

In [6]:
class ModelWrapper:
    def __init__(self, choice):
        self.choice = choice
        
        if "marigold" in choice.lower():
            self.model = MarigoldDepthPipeline.from_pretrained(choice, variant="fp16").to(DEVICE)
            self.model.set_progress_bar_config(disable=True)
            print("found marigold model")
        elif "zoedepth" in choice.lower():
            self.processor = AutoImageProcessor.from_pretrained(choice)
            self.model = ZoeDepthForDepthEstimation.from_pretrained(choice).to(DEVICE)
            print("found zoedepth model")
        else:
            self.processor = AutoImageProcessor.from_pretrained(choice)
            self.model = AutoModelForDepthEstimation.from_pretrained(choice).to(DEVICE)

    def infer(self, image_path, marigold_steps=4):
        if "marigold" in self.choice.lower():
            image = Image.open(image_path).convert("RGB")
            w0, h0 = image.size
            
            with torch.no_grad():
                pipe_out = self.model(
                    image, 
                    num_inference_steps=marigold_steps, 
                    output_type="pt"
                )
            prediction = pipe_out.prediction

            # Check if resize is needed
            if prediction.shape[-2:] != (h0, w0):
                prediction = torch.nn.functional.interpolate(
                    prediction,
                    size=(h0, w0),
                    mode="bicubic",
                    align_corners=False
                )

            return prediction.squeeze().cpu().numpy()

        # Transformer-based models inference
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        inputs = self.processor(images=image, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth
        
        # Resize to original image size
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.shape[:2],
            mode="bicubic",
            align_corners=False,
        )
        
        return prediction.squeeze().cpu().numpy()

In [75]:
# Check if dataset gt is in mm
# dataset = ...
for i in range(len(dataset)):
    _, gt, _, _ = dataset.get_item(i)
    if np.median(gt) > 100:
        print("maybe that measurement is in millimeters... needs to change!")

In [19]:
ibims = IBimsLoader()
nyu = NyuLoader()

def benchmark(model, dataset, relative=True):
    absrel_list = []
    delta_list = []
    
    for i in tqdm(range(len(dataset))):
        img, gt, mask, image_path = dataset.get_item(i)
        #image_path = f"{IBIMS_PATH}/rgb/{name}"
            
        prediction = model.infer(image_path)
        if relative:
            depth_trans, _, _ = align_depth_least_square(gt, prediction, mask)
            absrel = get_absrel(gt[mask], depth_trans[mask])
            delta = get_delta(gt[mask], depth_trans[mask], 1)
        else:
            absrel = get_absrel(gt[mask], prediction[mask])
            delta = get_delta(gt[mask], prediction[mask], 1)
    
        absrel_list.append(absrel)
        delta_list.append(delta)
    
    print(f"Average Abs Rel: {np.mean(absrel_list)}")
    print(f"Average d_1: {np.mean(delta_list)}")

Found 654 RGB candidates. Matching with Depth...


In [22]:
# model : [relative, ibims, nyu]
configurations = {
    "depth-anything/Depth-Anything-V2-Large-hf": [True, True, True],
    "depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf": [False, True, True],
    "Intel/zoedepth-nyu-kitti": [False, True, True],
    "Intel/dpt-large": [True, True, True],
    "Intel/dpt-beit-large-512": [True, True, True],
    "prs-eth/marigold-depth-v1-1": [True, True, True]
}

for model in configurations.keys():
    config = configurations[model]
    relative = config[0]
    print(f"{'='*20}[ {model} ]{'='*20}\n")
    if config[1]:
        print("IBims-1 dataset:")
        benchmark(ModelWrapper(model), ibims, relative=relative)
    if config[2]:
        print("NYU Depth V2 dataset:")
        benchmark(ModelWrapper(model), nyu, relative=relative)
    print(f"{'='*20}[ {model} ]{'='*20}\n")


IBims-1 dataset:


100%|██████████| 100/100 [00:38<00:00,  2.61it/s]


Average Abs Rel: 0.1252612876460376
Average d_1: 0.8572297850934841
NYU Depth V2 dataset:


100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


Average Abs Rel: 0.1659458577632904
Average d_1: 0.7952931966145833


IBims-1 dataset:


100%|██████████| 100/100 [00:37<00:00,  2.69it/s]


Average Abs Rel: 0.1264849104849234
Average d_1: 0.886137028246952
NYU Depth V2 dataset:


100%|██████████| 100/100 [00:35<00:00,  2.84it/s]

Average Abs Rel: 0.22496400773525238
Average d_1: 0.6607259114583335


IBims-1 dataset:





preprocessor_config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

found zoedepth model


100%|██████████| 100/100 [00:30<00:00,  3.26it/s]


Average Abs Rel: 0.20092700231621144
Average d_1: 0.6119498442326409
NYU Depth V2 dataset:
found zoedepth model


100%|██████████| 100/100 [00:27<00:00,  3.69it/s]

Average Abs Rel: 0.14556682109832764
Average d_1: 0.8202607096354169


IBims-1 dataset:





preprocessor_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:16<00:00,  5.90it/s]


Average Abs Rel: 0.13541643258733274
Average d_1: 0.8394842777713843
NYU Depth V2 dataset:


Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:14<00:00,  6.80it/s]


Average Abs Rel: 0.1611204892396927
Average d_1: 0.7898171223958331


IBims-1 dataset:


preprocessor_config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:43<00:00,  2.30it/s]


Average Abs Rel: 0.12628050729642812
Average d_1: 0.8533818513122847
NYU Depth V2 dataset:


100%|██████████| 100/100 [00:40<00:00,  2.49it/s]


Average Abs Rel: 0.137671560049057
Average d_1: 0.8435399739583335


IBims-1 dataset:


model_index.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/1.73G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

found marigold model


100%|██████████| 100/100 [02:40<00:00,  1.61s/it]


Average Abs Rel: 0.05358334528379531
Average d_1: 0.9676189089772491
NYU Depth V2 dataset:


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

found marigold model


100%|██████████| 100/100 [02:38<00:00,  1.59s/it]

Average Abs Rel: 0.09601327776908875
Average d_1: 0.8955147786458332






In [8]:
benchmark(ModelWrapper("depth-anything/Depth-Anything-V2-Large-hf"), ibims)

preprocessor_config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:42<00:00,  2.34it/s]

Average Abs Rel: 0.1252612876460376
Average d_1: 0.8572297850934841





In [9]:
benchmark(ModelWrapper("depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf"), ibims, relative=False)

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:38<00:00,  2.60it/s]

Average Abs Rel: 0.1264849104849234
Average d_1: 0.886137028246952





In [17]:
benchmark(ModelWrapper("Intel/zoedepth-nyu-kitti"), ibims, relative=False)

found zoedepth model


100%|██████████| 100/100 [00:36<00:00,  2.71it/s]

Average Abs Rel: 0.20092700231619312
Average d_1: 0.6119498442326409





In [18]:
# MiDaS 3.0
benchmark(ModelWrapper("Intel/dpt-large"), ibims)

preprocessor_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 100/100 [00:18<00:00,  5.44it/s]

Average Abs Rel: 0.13541643258733274
Average d_1: 0.8394842777713843





In [19]:
# MiDaS 3.1
benchmark(ModelWrapper("Intel/dpt-beit-large-512"), ibims)

preprocessor_config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:55<00:00,  1.82it/s]

Average Abs Rel: 0.12628050729642812
Average d_1: 0.8533818513122847





In [8]:
# Marigold with 4 iterations
benchmark(ModelWrapper("prs-eth/marigold-depth-v1-1"), ibims)

model_index.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/1.73G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

found marigold model


100%|██████████| 100/100 [02:43<00:00,  1.63s/it]

Average Abs Rel: 0.053644879019413894
Average d_1: 0.9681597840673872





In [20]:
benchmark(ModelWrapper("depth-anything/Depth-Anything-V2-Large-hf"), nyu)

100%|██████████| 100/100 [00:40<00:00,  2.47it/s]

Average Abs Rel: 0.1659458577632904
Average d_1: 0.7952931966145833





In [21]:
benchmark(ModelWrapper("depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf"), ibims, relative=False)

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:41<00:00,  2.40it/s]

Average Abs Rel: 0.1264849104849234
Average d_1: 0.886137028246952



