In [6]:
!pip install -q transformers diffusers imageio scipy timm accelerate

In [29]:
import os
import cv2
import glob
import torch
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from scipy.io import loadmat
from diffusers import MarigoldDepthPipeline
from transformers import AutoImageProcessor, AutoModelForDepthEstimation, ZoeDepthForDepthEstimation

In [19]:
IBIMS_PATH = "/kaggle/input/ibims-1/iBims-1"

DEVICE = torch.device("cuda") if torch.cuda.is_available else torch.device("cpu")
print(DEVICE)

cuda


In [67]:
# https://www.asg.ed.tum.de/lmf/ibims1/

class IBimsLoader:
    def __init__(self, root_dir=IBIMS_PATH):
        self.rgb_files = sorted(glob.glob(os.path.join(root_dir, "rgb", "*.png")))
        self.depth_files = sorted(glob.glob(os.path.join(root_dir, "ibims1_core_mat", "*.mat")))
        
        if len(self.rgb_files) != len(self.depth_files):
            print("Hmm something is wrong with the dataset...")

    def __len__(self):
        return len(self.rgb_files)

    def get_item(self, idx):
        img_path = self.rgb_files[idx]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        depth_path = self.depth_files[idx]
        mat_data = loadmat(depth_path)
        
        key = [k for k in mat_data.keys() if not k.startswith('_')][0]
        data_struct = mat_data[key][0, 0]
        
        gt_depth = data_struct['depth']
        gt_depth = gt_depth.astype(np.float32)

        invalid_mask = data_struct['mask_invalid'].astype(bool)
        
        return img, gt_depth, invalid_mask, os.path.basename(img_path)

In [77]:
class DepthMetrics:
    def __init__(self):
        pass

    def align_scale_shift(self, pred, target):
        """
        MiDaS paper - p.5
        Aligns prediction to target using Least Squares (Scale & Shift).
        Used for Relative Depth Models (MiDaS, Depth Anything Base).
        Formula: s*, t* = argmin || s * pred + t - target ||^2
        """
        mask = (target > 0)
        target_masked = target[mask]
        pred_masked = pred[mask]

        if len(pred_masked) < 10: return pred, 1.0, 0.0
        
        slope, intercept = np.polyfit(pred_masked, target_masked, 1)
        
        pred_aligned = pred * slope + intercept
        return pred_aligned, slope, intercept

    def align_median(self, pred, target):
        """
        Simple Median Scaling. often used for Metric models to correct global scale drift.
        """
        mask = (target > 0)
        scale = np.median(target[mask]) / np.median(pred[mask])
        return pred * scale

    def compute(self, pred, target, invalid_mask=None, align_type="none"):
        """
        Calculates: AbsRel, RMSE, Delta1 (a1).
        align_type: 'none' (for Metric League), 'least_squares' (for Relative League)
        """
        valid_mask = invalid_mask if invalid_mask is not None else np.ones_like(target, dtype=bool)
        
        # GT > 0.1 ensures we don't divide by tiny numbers
        # GT < 80.0 removes infinite sky/sensor errors
        valid_mask = valid_mask & (target > 0.1) & (target < 80.0)
        
        valid_mask = valid_mask & (~np.isnan(target)) & (~np.isnan(pred))
        if valid_mask.sum() == 0: return None

        pred_valid = pred[valid_mask]
        target_valid = target[valid_mask]

        if align_type == "least_squares":
            pred_valid, _, _ = self.align_scale_shift(pred_valid, target_valid)
        elif align_type == "median":
            scale = np.median(target_valid) / np.median(pred_valid)
            pred_valid = pred_valid * scale

        pred_valid = np.clip(pred_valid, 0.001, 80.0)

        # AbsRel: |pred - gt| / gt
        abs_rel = np.mean(np.abs(pred_valid - target_valid) / target_valid)

        # RMSE
        rmse = np.sqrt(np.mean((pred_valid - target_valid) ** 2))

        # Delta Accuracy: max(pred/gt, gt/pred) < 1.25
        thresh = np.maximum((target_valid / pred_valid), (pred_valid / target_valid))
        a1 = (thresh < 1.25).mean()

        return {"abs_rel": abs_rel, "rmse": rmse, "a1": a1}

In [48]:
class TransformerModelWrapper:
    def __init__(self, choice):
        self.processor = AutoImageProcessor.from_pretrained(choice)
        self.model = AutoModelForDepthEstimation.from_pretrained(choice).to(DEVICE)

    def infer(self, image_path):
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        inputs = self.processor(images=image, return_tensors="pt").to(DEVICE)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            predicted_depth = outputs.predicted_depth
        
        # Resize to original image size
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze().cpu().numpy()
        
        return prediction

class ModelRunner:
    def __init__(self, device="cuda"):
        self.device = device
        self.models = {}
        self.processors = {}
    
    def load_depth_anything_v2(self, variant="metric"):
        if variant == "metric":
            mid = "depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf"
        else:
            mid = "depth-anything/Depth-Anything-V2-Small-hf"
            
        print(f"Loading {mid}...")
        self.processors["da_v2"] = AutoImageProcessor.from_pretrained(mid)
        self.models["da_v2"] = AutoModelForDepthEstimation.from_pretrained(mid).to(self.device)
    
    def load_zoedepth(self):
        print("Loading ZoeDepth...")
        mid = "intel-isl/ZoeD_M12_N"
        self.processors["zoe"] = AutoImageProcessor.from_pretrained(mid)
        self.models["zoe"] = ZoeDepthForDepthEstimation.from_pretrained(mid).to(self.device)

    def load_marigold(self):
        print("Loading Marigold (Diffusion)...")
        pipe = MarigoldDepthPipeline.from_pretrained(
            "prs-eth/marigold-v1-0", torch_dtype=torch.float16
        )
        pipe.to(self.device)
        self.models["marigold"] = pipe

    def infer(self, model_name, image_path):
        """
        Generic inference wrapper
        """
        image = cv2.imread(image_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        
        if model_name == "marigold":
            from PIL import Image
            pil_img = Image.fromarray(image)
            pipe_out = self.models["marigold"](pil_img, num_inference_steps=10) # 10 is fast, 50 is precise
            depth = pipe_out.depth_np
            return depth

        processor = self.processors[model_name]
        model = self.models[model_name]
        
        inputs = processor(images=image, return_tensors="pt").to(self.device)
        
        with torch.no_grad():
            outputs = model(**inputs)
            predicted_depth = outputs.predicted_depth
        
        # Resize to original image size
        prediction = torch.nn.functional.interpolate(
            predicted_depth.unsqueeze(1),
            size=image.shape[:2],
            mode="bicubic",
            align_corners=False,
        ).squeeze().cpu().numpy()
        
        return prediction

In [61]:
def get_final_results(results):
    if len(results) > 0:
        print("\n" + "="*40)
        
        avg_results = {}
        for key in results[0].keys():
            avg_results[key] = np.mean([res[key] for res in results])
    
        print(f"AbsRel (Lower is better):  {avg_results['abs_rel']:.4f}")
        print(f"RMSE   (Lower is better):  {avg_results['rmse']:.4f}")
        print(f"Delta1 (Higher is better): {avg_results['a1']:.4f}")
        print("="*40)
        return avg_results
    else:
        print("No valid results found.")
    return None

def run_transformer_over_dataset(model, dataset, metrics_calc, align_type="none", debug=False):
    results = []
    for i in tqdm(range(len(dataset))):
        img, gt, mask, name = dataset.get_item(i)
        #print(img.shape, gt.shape, name)
        image_path = f"{IBIMS_PATH}/rgb/{name}"
        
        prediction = dav2_base.infer(image_path)
        metrics = metrics_calc.compute(prediction, gt, invalid_mask=mask, align_type=align_type)
        if metrics is not None:
            results.append(metrics)
            if debug: 
                print(f"{name}\tAbsRel: {metrics['abs_rel']:.3f}\tRMSE: {metrics['rmse']:.3f}\tDelta1: {metrics['a1']:.3f}")
                print(f"Pred Median: {np.median(prediction):.2f}, GT Median: {np.median(gt):.2f}")
        else:
            print(f"Something went wrong with {name}. Skipping...")
    
    return get_final_results(results)

In [78]:
dataset = IBimsLoader()
metrics_calc = DepthMetrics()

In [73]:
img, gt, invalid_mask, name = dataset.get_item(12)
print(np.where(invalid_mask == True))

(array([  0,   0,   0, ..., 479, 479, 479]), array([  0,   1,   2, ..., 637, 638, 639]))


In [75]:
# Check if dataset gt is in mm
for i in range(len(dataset)):
    img, gt, invalid_mask, name = dataset.get_item(i)
    if np.median(gt) > 100:
        print("maybe that measurement is in millimeters... needs to change!")
    #print(np.where(invalid_mask == False))

In [79]:
dav2_base = TransformerModelWrapper("depth-anything/Depth-Anything-V2-Small-hf")
run_transformer_over_dataset(dav2_base, dataset, metrics_calc, align_type="least_squares")

100%|██████████| 100/100 [00:13<00:00,  7.44it/s]


AbsRel (Lower is better):  0.1192
RMSE   (Lower is better):  0.5311
Delta1 (Higher is better): 0.8536





{'abs_rel': np.float64(0.11924478010784752),
 'rmse': np.float64(0.5311465352130827),
 'a1': np.float64(0.8535762321108398)}

In [81]:
dav2_metric = TransformerModelWrapper("depth-anything/Depth-Anything-V2-Metric-Indoor-Large-hf")
run_transformer_over_dataset(dav2_metric, dataset, metrics_calc, align_type="none", debug=False)

#print("\nLet's apply some scaling to test what changes")
#run_transformer_over_dataset(dav2_metric, dataset, metrics_calc, align_type="median")

100%|██████████| 100/100 [00:11<00:00,  8.81it/s]


AbsRel (Lower is better):  0.9662
RMSE   (Lower is better):  3.0890
Delta1 (Higher is better): 0.1500





{'abs_rel': np.float32(0.96617407),
 'rmse': np.float32(3.0889754),
 'a1': np.float64(0.14998329868036855)}

In [82]:
run_transformer_over_dataset(dav2_metric, dataset, metrics_calc, align_type="median")

100%|██████████| 100/100 [00:11<00:00,  8.58it/s]


AbsRel (Lower is better):  1.2479
RMSE   (Lower is better):  4.2113
Delta1 (Higher is better): 0.2292





{'abs_rel': np.float32(1.2478518),
 'rmse': np.float32(4.2112966),
 'a1': np.float64(0.22922321199906034)}