In [1]:
!pip install -q transformers diffusers imageio scipy timm accelerate

In [2]:
import os
import cv2
import glob
import torch
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.io import loadmat
from transformers import AutoImageProcessor, AutoModelForDepthEstimation

2026-01-08 12:52:03.868758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767876724.106527      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767876724.174237      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767876724.730660      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767876724.730704      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767876724.730707      55 computation_placer.cc:177] computation placer alr

In [3]:
# Root folder of the iBims-1 dataset (Kaggle input mount).
IBIMS_PATH = "/kaggle/input/ibims-1/iBims-1"

# `is_available` has to be *called*: the bare function object is always truthy,
# which would select "cuda" even on a CPU-only machine.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [4]:
# https://www.asg.ed.tum.de/lmf/ibims1/

class IBimsLoader:
    """Pairs iBims-1 RGB images with their ground-truth depth ``.mat`` files.

    Files are globbed from ``<root_dir>/rgb/*.png`` and
    ``<root_dir>/ibims1_core_mat/*.mat``, sorted, and paired by their position
    in the two sorted lists.
    """

    def __init__(self, root_dir=IBIMS_PATH):
        """Collect the sorted RGB and depth file lists found under ``root_dir``."""
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "rgb", "*.png")))
        self.depth_files = sorted(glob.glob(os.path.join(root_dir, "ibims1_core_mat", "*.mat")))

        # Samples are paired purely by index, so differing counts mean that at
        # least some image/depth pairs cannot be trusted.
        if len(self.rgb_files) != len(self.depth_files):
            print("Hmm something is wrong with the dataset...")

    def __len__(self):
        """Return the number of RGB images found."""
        return len(self.rgb_files)

    def get_item(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            tuple: ``(img, gt_depth, mask, name)`` where ``img`` is the array
            returned by ``cv2.imread`` (OpenCV's default BGR channel order, no
            colour conversion applied), ``gt_depth`` is the ``depth`` field of
            the ``data`` struct in the ``.mat`` file, ``mask`` is True where
            ``gt_depth > 0.001`` and ``name`` is the RGB file's base name.

        Raises:
            OSError: if OpenCV could not read the image file.
        """
        img_path = self.rgb_files[idx]
        img = cv2.imread(img_path)
        # cv2.imread reports failure by returning None instead of raising.
        if img is None:
            raise OSError(f"Could not read image: {img_path}")

        depth_path = self.depth_files[idx]
        mat_data = loadmat(depth_path)

        # loadmat exposes the MATLAB struct as a nested array; [0][0] unwraps
        # the `depth` field (presumably a 1x1 struct -- verify against the data).
        gt_depth = mat_data['data']['depth'][0][0]

        # Pixels at or below 0.001 are excluded from evaluation.
        mask = gt_depth > 0.001

        return img, gt_depth, mask, os.path.basename(img_path)

In [5]:
def get_absrel(gt, pred):
    """Mean absolute relative error: mean of ``|gt - pred| / gt``."""
    relative_error = np.abs(gt - pred) / gt
    return np.mean(relative_error)

def get_delta(gt, pred, exponent=1):
    """Fraction of elements with ``max(gt/pred, pred/gt) < 1.25**exponent``."""
    threshold = 1.25 ** exponent
    worst_ratio = np.maximum(gt / pred, pred / gt)
    return np.mean(worst_ratio < threshold)

def align_depth_least_square(gt_arr, pred_arr, valid_mask_arr):
    """Fit ``scale * pred + shift`` to the ground truth by least squares.

    The fit uses only the elements selected by ``valid_mask_arr``; the
    resulting scale and shift are then applied to the whole prediction.

    Returns:
        tuple: ``(aligned_pred, scale, shift)``. ``aligned_pred`` has the same
        shape as ``pred_arr``; ``scale`` and ``shift`` are the two rows of the
        least-squares solution (one-element arrays).
    """
    input_shape = pred_arr.shape

    # Drop singleton dimensions, then keep valid samples as column vectors.
    keep = valid_mask_arr.squeeze()
    target = gt_arr.squeeze()[keep].reshape((-1, 1))
    source = pred_arr.squeeze()[keep].reshape((-1, 1))

    # Design matrix [pred, 1]: the first column solves for scale, the second for shift.
    design = np.hstack([source, np.ones_like(source)])
    solution, *_ = np.linalg.lstsq(design, target, rcond=None)
    scale, shift = solution

    # Apply the fitted transform to every element and restore the input shape.
    aligned_pred = (pred_arr * scale + shift).reshape(input_shape)
    return aligned_pred, scale, shift

In [6]:
class TransformerModelWrapper:
    """Thin wrapper around a Hugging Face depth-estimation checkpoint."""

    def __init__(self, choice):
        """Load the image processor and model for checkpoint ``choice``.

        The model is moved to the module-level ``DEVICE``.
        """
        self.processor = AutoImageProcessor.from_pretrained(choice)
        depth_model = AutoModelForDepthEstimation.from_pretrained(choice)
        self.model = depth_model.to(DEVICE)

    def infer(self, image_path):
        """Predict a depth map for the image stored at ``image_path``.

        The image is read with OpenCV, converted from BGR to RGB, run through
        the processor and model, and the output is bicubically resized to the
        image's height and width.

        Returns:
            numpy.ndarray: the resized prediction with singleton dimensions
            squeezed out.
        """
        bgr = cv2.imread(image_path)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        height_width = rgb.shape[:2]

        batch = self.processor(images=rgb, return_tensors="pt").to(DEVICE)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            raw_depth = self.model(**batch).predicted_depth

        # Resize to original image size; interpolate needs an explicit
        # channel dimension, hence the unsqueeze.
        resized = torch.nn.functional.interpolate(
            raw_depth.unsqueeze(1),
            size=height_width,
            mode="bicubic",
            align_corners=False,
        )

        return resized.squeeze().cpu().numpy()

In [75]:
# Check if dataset gt is in mm
# The loader is built here so this cell also runs on a fresh kernel: no
# notebook-level `dataset` variable is defined by the visible cells.
ibims = IBimsLoader()
for i in range(len(ibims)):
    _, gt, _, _ = ibims.get_item(i)
    # A median above 100 is taken as a sign that the depth map is not in metres.
    if np.median(gt) > 100:
        print("maybe that measurement is in millimeters... needs to change!")

In [7]:
ibims = IBimsLoader()

def benchmark(model, dataset, relative=True):
    """Print the dataset-averaged AbsRel and delta_1 of ``model`` on ``dataset``.

    Args:
        model: object exposing ``infer(image_path)`` that returns a depth map
            indexable with the dataset's validity mask.
        dataset: loader exposing ``__len__``, ``get_item(idx)`` and
            ``rgb_files`` (e.g. ``IBimsLoader``).
        relative: if True, each prediction is first fitted to the ground truth
            with a least-squares scale and shift over the valid pixels; if
            False, the raw prediction is scored directly.

    Metrics are computed per image over valid pixels and then averaged with
    equal weight per image. Nothing is returned; the averages are printed.
    """
    absrel_list = []
    delta_list = []

    for i in tqdm(range(len(dataset))):
        # The RGB array from the loader is not needed: the model reads the
        # image from disk itself.
        _, gt, mask, _ = dataset.get_item(i)
        # Use the path the loader actually indexed instead of rebuilding it
        # from the global IBIMS_PATH, which is wrong for a non-default root_dir.
        image_path = dataset.rgb_files[i]

        prediction = model.infer(image_path)
        if relative:
            # NOTE(review): the fit is done directly on the model output. If the
            # relative checkpoint predicts inverse depth (disparity) rather than
            # depth, the alignment should be done in disparity space -- confirm
            # against the model card.
            prediction, _, _ = align_depth_least_square(gt, prediction, mask)

        absrel_list.append(get_absrel(gt[mask], prediction[mask]))
        delta_list.append(get_delta(gt[mask], prediction[mask], 1))

    print(f"Average Abs Rel: {np.mean(absrel_list)}")
    print(f"Average d_1: {np.mean(delta_list)}")

In [8]:
# `relative` defaults to True: predictions are scale/shift-aligned to the
# ground truth by least squares before the metrics are computed.
benchmark(TransformerModelWrapper("depth-anything/Depth-Anything-V2-Large-hf"), ibims)

preprocessor_config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:42<00:00,  2.34it/s]

Average Abs Rel: 0.1252612876460376
Average d_1: 0.8572297850934841





In [9]:
# relative=False: the metric checkpoint's predictions are compared to the
# ground truth directly, without any scale/shift alignment.
benchmark(TransformerModelWrapper("depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf"), ibims, relative=False)

preprocessor_config.json:   0%|          | 0.00/437 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

100%|██████████| 100/100 [00:38<00:00,  2.60it/s]

Average Abs Rel: 0.1264849104849234
Average d_1: 0.886137028246952



