In [2]:
import os
import cv2
import glob
import torch
import random
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from tqdm import tqdm
from scipy.io import loadmat
from scipy.stats import spearmanr
from diffusers import MarigoldDepthPipeline
from transformers import AutoImageProcessor, AutoModelForDepthEstimation, ZoeDepthForDepthEstimation

2026-01-09 21:59:40.873372: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767995981.061379      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767995981.112813      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767995981.565590      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767995981.565630      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767995981.565633      55 computation_placer.cc:177] computation placer alr

In [3]:
IBIMS_PATH = "/kaggle/input/ibims-1/iBims-1"
NYU_PATH = "/kaggle/input/nyu-depth-v2/nyu_data/data/nyu2_test"

DEVICE = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
print(DEVICE)

cuda


In [4]:
# https://www.asg.ed.tum.de/lmf/ibims1/

class IBimsLoader:
    def __init__(self, root_dir=IBIMS_PATH):
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "rgb", "*.png")))
        self.depth_files = sorted(glob.glob(os.path.join(root_dir, "ibims1_core_mat", "*.mat")))
        
        if len(self.rgb_files) != len(self.depth_files):
            print("Hmm something is wrong with the dataset...")

    def __len__(self):
        return len(self.rgb_files)

    def get_item(self, idx):
        img_path = self.rgb_files[idx]
        img = cv2.imread(img_path)
        #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        depth_path = self.depth_files[idx]
        mat_data = loadmat(depth_path)

        gt_depth = mat_data['data']['depth'][0][0]

        mask = gt_depth > 0.001
        
        return img, gt_depth, mask,img_path

class NyuLoader:
    def __init__(self, root_dir=NYU_PATH, samples_size=100, seed=42):
        self.samples = []
        
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "*_colors.png"), recursive=True))
        print(f"Found {len(self.rgb_files)} RGB candidates. Matching with Depth...")

        for rgb_path in self.rgb_files:
            depth_path = rgb_path.replace("_colors.png", "_depth.png")
            if os.path.exists(depth_path):
                self.samples.append((rgb_path, depth_path))

        if samples_size is not None and len(self.samples) > samples_size:
            random.seed(seed)
            self.samples = random.sample(self.samples, samples_size)
    
    def __len__(self):
        return len(self.samples)

    def get_item(self, idx):
        img_path, depth_path = self.samples[idx]
        
        img = cv2.imread(img_path)
        depth_png = cv2.imread(depth_path, -1)
        if depth_png is None:
            raise ValueError(f"Failed to load depth: {depth_path}")

        gt_depth = depth_png.astype(np.float32) / 1000.0
        mask = gt_depth > 0.001

        return img, gt_depth, mask, img_path

In [5]:
#  https://huggingface.co/blog/Isayoften/monocular-depth-estimation-guide

def get_absrel(gt, pred):
    return np.mean(np.abs(gt-pred)/gt)

def get_delta(gt, pred, exponent=1):
    inlier = np.maximum((gt/pred), (pred/gt))
    return np.mean(inlier < 1.25**exponent)

def get_silog(gt, pred):
    """
    Computes Scale Invariant Logarithmic Error.
    Lower is better.
    """
    pred = np.maximum(pred, 1e-5)
    gt = np.maximum(gt, 1e-5)

    err = np.log(pred) - np.log(gt)

    silog = np.sqrt(np.mean(err ** 2) - (np.mean(err)) ** 2)
    
    return silog * 100

def align_depth_least_square(gt_arr, pred_arr, valid_mask_arr):
    ori_shape = pred_arr.shape

    gt = gt_arr.squeeze()  # [H, W]
    pred = pred_arr.squeeze()
    valid_mask = valid_mask_arr.squeeze()
    gt_masked = gt[valid_mask].reshape((-1, 1))
    pred_masked = pred[valid_mask].reshape((-1, 1))

    # numpy solver
    _ones = np.ones_like(pred_masked)
    A = np.concatenate([pred_masked, _ones], axis=-1)
    X = np.linalg.lstsq(A, gt_masked, rcond=None)[0]
    scale, shift = X

    aligned_pred = pred_arr * scale + shift

    # restore dimensions
    aligned_pred = aligned_pred.reshape(ori_shape)

    return aligned_pred, scale, shift

In [6]:
class ModelWrapper:
    def __init__(self, choice):
        self.choice = choice
        
        if "marigold" in choice.lower():
            self.model = MarigoldDepthPipeline.from_pretrained(choice, variant="fp16").to(DEVICE)
            self.model.set_progress_bar_config(disable=True)
            print("found marigold model")
        elif "zoedepth" in choice.lower():
            self.processor = AutoImageProcessor.from_pretrained(choice)
            self.model = ZoeDepthForDepthEstimation.from_pretrained(choice).to(DEVICE)
            print("found zoedepth model")
        else:
            self.processor = AutoImageProcessor.from_pretrained(choice)
            self.model = AutoModelForDepthEstimation.from_pretrained(choice).to(DEVICE)

    def infer(self, image_path, marigold_steps=4):
        if "marigold" in self.choice.lower():
            image = Image.open(image_path).convert("RGB")
            w0, h0 = image.size
            
            with torch.no_grad():
                pipe_out = self.model(
                    image, 
                    num_inference_steps=marigold_steps, 
                    output_type="pt"
                )
            prediction = pipe_out.prediction

            # Check if resize is needed
            if prediction.shape[-2:] != (h0, w0):
                prediction = torch.nn.functional.interpolate(
                    prediction,
                    size=(h0, w0),
                    mode="bicubic",
                    align_corners=False
                )

            return prediction.squeeze().cpu().numpy()

        # Transformer-based models inference
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        inputs = self.processor(images=image, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth
        
        # Resize to original image size
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.shape[:2],
            mode="bicubic",
            align_corners=False,
        )
        
        return prediction.squeeze().cpu().numpy()

In [75]:
# Check if dataset gt is in mm
# dataset = ...
for i in range(len(dataset)):
    _, gt, _, _ = dataset.get_item(i)
    if np.median(gt) > 100:
        print("maybe that measurement is in millimeters... needs to change!")

In [7]:
ibims_loader = IBimsLoader()
nyu_loader = NyuLoader()

class ModelConfig:
    def __init__(self, model_choice, display_name, relative=True, ibims=ibims_loader, nyu=nyu_loader):
        self.model = ModelWrapper(model_choice)
        self.display_name = display_name
        self.relative = relative
        self.ibims = ibims
        self.nyu = nyu
        self.absrel = {}
        self.d1 = {}
        self.silog = {}

    def set_results(self, dataset, absrel, d1, silog):
        self.absrel[dataset] = absrel;
        self.d1[dataset] = d1
        self.silog[dataset] = silog

    def get_absrel_result(self, dataset):
        return self.absrel[dataset]

    def get_d1_result(self, dataset):
        return self.d1[dataset]

    def get_silog_result(self, dataset):
        return self.silog[dataset]

    def __check_if_disparity(self, dataset, num_samples=5):
        """
        Checks if model output is Disparity (needs 1/x) or Depth.
        Returns True if the output is Disparity (Negative Correlation with GT).
        """
        correlations = []
        indices = np.linspace(0, len(dataset)-1, num_samples, dtype=int)
        
        for i in indices:
            _, gt, mask, image_path = dataset.get_item(i)
            prediction = self.model.infer(image_path)
            
            if mask.sum() == 0: continue
            
            val_pred = prediction[mask]
            val_gt = gt[mask]
            
            if len(val_gt) > 5000:
                val_pred = val_pred[::100]
                val_gt = val_gt[::100]

            # Spearman Correlation (rank-based, robust to scale)
            corr, _ = spearmanr(val_pred, val_gt)
            correlations.append(corr)
        
        avg_corr = np.mean(correlations)
        
        # If correlation is negative, values decrease as distance increases -> Disparity
        return avg_corr < -0.1
    
    def __basic_benchmark(self, dataset, use_eigen_crop=False, debug=True):
        absrel_list = []
        delta_list = []
        silog_list = []
        
        ds = "ibims"
        if isinstance(dataset, NyuLoader):
            ds = "nyu"

        needs_inversion = self.__check_if_disparity(dataset)
        if debug and needs_inversion:
            print(f"\tModel outputs Disparity (negative correlation). Inverting predictions (1/x)...")
        
        for i in tqdm(range(len(dataset))):
            _, gt, mask, image_path = dataset.get_item(i)
            prediction = self.model.infer(image_path)
            
            if use_eigen_crop:
                # Standard Eigen Crop (Top, Bottom, Left, Right)
                # [45:471, 41:601] is the valid region for 640x480 images
                height, width = gt.shape
                crop_mask = np.zeros((height, width), dtype=bool)
                
                y1, y2 = 45, min(471, height)
                x1, x2 = 41, min(601, width)
                crop_mask[y1:y2, x1:x2] = True
                
                mask = mask & crop_mask

            # Skip invalid masks
            if mask.sum() == 0: continue
            
            if self.relative:
                if needs_inversion:
                    gt_disparsity = np.zeros_like(gt)
                    gt_disparsity[mask] = 1.0 / gt[mask]
                    aligned_disp, _, _ = align_depth_least_square(gt_disparsity, prediction, mask)
                    aligned_disp = np.maximum(aligned_disp, 1e-6)
                    pred_final = 1.0 / aligned_disp
                else:
                    aligned_depth, _, _ = align_depth_least_square(gt, prediction, mask)
                    pred_final = np.maximum(aligned_depth, 1e-6)
            else:
                pred_final = prediction

            gt_valid = gt[mask]
            pred_valid = pred_final[mask]
            absrel = get_absrel(gt_valid, pred_valid)
            delta = get_delta(gt_valid, pred_valid, 1)
            silog = get_silog(gt_valid, pred_valid)
            
            absrel_list.append(absrel)
            delta_list.append(delta)
            silog_list.append(silog)
            
        am = np.mean(absrel_list)
        dm = np.mean(delta_list)
        sm = np.mean(silog_list)
        self.set_results(ds, am, dm, sm)

        if debug:
            print(f"\tAverage AbsRel = {am:.4f}")
            print(f"\tAverage Delta1 = {dm:.4f}")
            print(f"\tAverage SILog  = {sm:.4f}")

    def benchmark(self, debug=True):
        print(f"{'='*20}[ {self.display_name} ]{'='*20}")
        if self.ibims is not None:
            print("IBims-1 dataset:")
            self.__basic_benchmark(self.ibims, debug=debug)
        if self.nyu is not None:
            print("NYU Depth V2 dataset:")
            self.__basic_benchmark(self.nyu, use_eigen_crop=True, debug=debug)
        print(f"{'='*20}[ {self.display_name} ]{'='*20}\n")

Found 654 RGB candidates. Matching with Depth...


In [11]:
configs = [
    ModelConfig("depth-anything/Depth-Anything-V2-Large-hf", "DAV2-Large"),
    ModelConfig("depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf", "DAV2-Indoor-Metric", relative=False),
    ModelConfig("Intel/zoedepth-nyu-kitti", "ZoeDepth", relative=False),
    ModelConfig("Intel/dpt-large", "MiDaS-3.0"),
    ModelConfig("Intel/dpt-beit-large-512", "MiDaS-3.1"),
    ModelConfig("prs-eth/marigold-depth-v1-1", "Marigold-1.1")
]

for config in configs:
    config.benchmark(debug=True)

found zoedepth model


preprocessor_config.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/942 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.37G [00:00<?, ?B/s]

Some weights of DPTForDepthEstimation were not initialized from the model checkpoint at Intel/dpt-large and are newly initialized: ['neck.fusion_stage.layers.0.residual_layer1.convolution1.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution1.weight', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.bias', 'neck.fusion_stage.layers.0.residual_layer1.convolution2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


preprocessor_config.json:   0%|          | 0.00/426 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

config.json: 0.00B [00:00, ?B/s]

scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

text_encoder/model.fp16.safetensors:   0%|          | 0.00/681M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

unet/diffusion_pytorch_model.fp16.safete(…):   0%|          | 0.00/1.73G [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

vae/diffusion_pytorch_model.fp16.safeten(…):   0%|          | 0.00/167M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

found marigold model
IBims-1 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:38<00:00,  2.57it/s]


	Average AbsRel = 0.1335
	Average Delta1 = 0.9783
	Average SILog  = 7.0373
NYU Depth V2 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:36<00:00,  2.76it/s]


	Average AbsRel = 0.0541
	Average Delta1 = 0.9644
	Average SILog  = 8.5579

IBims-1 dataset:


100%|██████████| 100/100 [00:37<00:00,  2.67it/s]


	Average AbsRel = 0.1265
	Average Delta1 = 0.8861
	Average SILog  = 7.5893
NYU Depth V2 dataset:


100%|██████████| 100/100 [00:35<00:00,  2.84it/s]


	Average AbsRel = 0.2196
	Average Delta1 = 0.6744
	Average SILog  = 9.5989

IBims-1 dataset:


100%|██████████| 100/100 [00:29<00:00,  3.41it/s]


	Average AbsRel = 0.2009
	Average Delta1 = 0.6119
	Average SILog  = 16.8612
NYU Depth V2 dataset:


100%|██████████| 100/100 [00:27<00:00,  3.67it/s]


	Average AbsRel = 0.1464
	Average Delta1 = 0.8267
	Average SILog  = 17.2433

IBims-1 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:17<00:00,  5.74it/s]


	Average AbsRel = 53.1020
	Average Delta1 = 0.9413
	Average SILog  = 12.1237
NYU Depth V2 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:14<00:00,  6.75it/s]


	Average AbsRel = 0.1028
	Average Delta1 = 0.9007
	Average SILog  = 13.6986

IBims-1 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:41<00:00,  2.40it/s]


	Average AbsRel = 1.0422
	Average Delta1 = 0.9661
	Average SILog  = 8.9308
NYU Depth V2 dataset:
	Model outputs Disparity (negative correlation). Inverting predictions (1/x)...


100%|██████████| 100/100 [00:38<00:00,  2.58it/s]


	Average AbsRel = 0.0521
	Average Delta1 = 0.9694
	Average SILog  = 8.1113

IBims-1 dataset:


100%|██████████| 100/100 [02:40<00:00,  1.61s/it]


	Average AbsRel = 0.0543
	Average Delta1 = 0.9677
	Average SILog  = 8.1879
NYU Depth V2 dataset:


100%|██████████| 100/100 [02:38<00:00,  1.58s/it]

	Average AbsRel = 0.0702
	Average Delta1 = 0.9425
	Average SILog  = 9.9901




