In [2]:
# Optional allocator setting for better memory efficiency.
# It lives in its own cell so it runs ahead of the cell that imports torch.
import os

os.environ.update({"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"})

In [29]:
# Required imports
import cv2
import numpy as np
import scipy.io as sio
import torch
from mapanything.models import MapAnything
from mapanything.utils.image import load_images

In [4]:
# Inference device (pinned to the CPU in this notebook)
device = "cpu"

# Build the model first, then move it to the chosen device.
# Loading the weights requires internet access or a pre-downloaded huggingface hub cache.
# For Apache 2.0 license model, use "facebook/map-anything-apache"
model = MapAnything.from_pretrained("facebook/map-anything")
model = model.to(device)

# Input images: either a folder (used here) or an explicit list of paths
# (see the commented-out single-image alternative below).
# NOTE(review): a later cell writes its .jpg/.mat results into this same folder;
# presumably a re-run would then load those generated images too - confirm.
#images = ["/Users/jpc/Nextcloud/IST/Escolaridade/PIV2025/Datasets/TaagPIV/20251028_165814.jpg" ] # or ["path/to/img1.jpg", "path/to/img2.jpg", ...]
images = "/Users/jpc/Nextcloud/IST/Escolaridade/PIV2025/AulasPraticas/aula7_3D_registration/plondres" # or ["path/to/img1.jpg", "path/to/img2.jpg", ...]
views = load_images(images)

Loading pretrained dinov2_vitl14 from torch hub


Using cache found in /Users/jpc/.cache/torch/hub/facebookresearch_dinov2_main


In [5]:
# Run inference
predictions = model.infer(
    views,                            # Input views
    memory_efficient_inference=False, # Trades off speed for more views (up to 2000 views on 140 GB)
    use_amp=True,                     # Use mixed precision inference (recommended)
    amp_dtype="bf16",                 # bf16 inference (recommended; falls back to fp16 if bf16 not supported)
    apply_mask=True,                  # Apply masking to dense geometry outputs
    mask_edges=True,                  # Remove edge artifacts by using normals and depth
    apply_confidence_mask=False,      # Filter low-confidence regions
    confidence_percentile=10,         # Remove bottom 10 percentile confidence pixels
)

# Access results for each view - Complete list of metric outputs.
# This loop only illustrates the available keys: every name below is rebound on
# each iteration, so after the loop they refer to the last view.
for i, pred in enumerate(predictions):
    # Geometry outputs
    pts3d = pred["pts3d"]                     # 3D points in world coordinates (B, H, W, 3)
    pts3d_cam = pred["pts3d_cam"]             # 3D points in camera coordinates (B, H, W, 3)
    depth_z = pred["depth_z"]                 # Z-depth in camera frame (B, H, W, 1)
    depth_along_ray = pred["depth_along_ray"] # Depth along ray in camera frame (B, H, W, 1)

    # Camera outputs
    ray_directions = pred["ray_directions"]   # Ray directions in camera frame (B, H, W, 3)
    intrinsics = pred["intrinsics"]           # Recovered pinhole camera intrinsics (B, 3, 3)
    camera_poses = pred["camera_poses"]       # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world poses in world frame (B, 4, 4)
    cam_trans = pred["cam_trans"]             # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world translation in world frame (B, 3)
    cam_quats = pred["cam_quats"]             # OpenCV (+X - Right, +Y - Down, +Z - Forward) cam2world quaternion in world frame (B, 4)

    # Quality and masking
    confidence = pred["conf"]                 # Per-pixel confidence scores (B, H, W)
    mask = pred["mask"]                       # Combined validity mask (B, H, W, 1)
    non_ambiguous_mask = pred["non_ambiguous_mask"]                # Non-ambiguous regions (B, H, W)
    non_ambiguous_mask_logits = pred["non_ambiguous_mask_logits"]  # Mask logits (B, H, W)

    # Scaling
    metric_scaling_factor = pred["metric_scaling_factor"]  # Applied metric scaling (B,)

    # Original input
    img_no_norm = pred["img_no_norm"]         # Denormalized input images for visualization (B, H, W, 3)




In [21]:
def predictions_to_numpy(predictions):
    """
    Convert a dict of tensors to a dict of squeezed numpy arrays.

    Each tensor is detached from the autograd graph and moved to the CPU
    before conversion. bfloat16 tensors are upcast to float32 first, because
    NumPy has no bfloat16 dtype and ``Tensor.numpy()`` raises on them.

    Args:
        predictions (dict): Dictionary where values are torch tensors.

    Returns:
        dict: Dictionary with the same keys, values converted to numpy arrays
              with all size-1 dimensions removed.
    """
    converted = {}
    for key, value in predictions.items():
        tensor = value.detach().cpu()
        if tensor.dtype == torch.bfloat16:
            # Not representable in NumPy; float32 holds every bfloat16 value exactly.
            tensor = tensor.float()
        converted[key] = tensor.numpy().squeeze()
    return converted

In [23]:
# One dict of squeezed numpy arrays per predicted view, in the same order as `predictions`.
d = [predictions_to_numpy(pred) for pred in predictions]


In [34]:



def save_preds_as_assignment_format(
    preds,
    output_dir,
    base_name="frame",
    start_index=0
):
    """
    Save preds data in the assignment-required RGB-D format.

    Args:
        preds (list): List of dictionaries with prediction data.
        output_dir (str): Directory where files will be saved.
        base_name (str): Base filename (default: "frame").
        start_index (int): Starting index for numbering (default: 0).
    """

    os.makedirs(output_dir, exist_ok=True)

    for i, pred in enumerate(preds):
        idx = start_index + i
        idx_str = f"{idx:04d}"

        # ---- Extract required fields ----
        depth = pred["depth_z"]
        K = pred["intrinsics"]
        img = pred["img_no_norm"]

        # ---- Sanity checks ----
        if depth.ndim != 2:
            raise ValueError(f"depth_z must be HxW, got {depth.shape}")

        if K.shape != (3, 3):
            raise ValueError(f"intrinsics must be 3x3, got {K.shape}")

       # ---- Handle image format ----        
        
        # CHW -> HWC if needed
        if img.ndim == 3 and img.shape[0] == 3:
            img = np.transpose(img, (1, 2, 0))
        
        # If image is float, rescale properly
        if img.dtype != np.uint8:
            img = img.astype(np.float32)
        
            # If values are in [0, 1], scale to [0, 255]
            if img.max() <= 1.0:
                img = img * 255.0
        
            img = np.clip(img, 0, 255).astype(np.uint8)
        
        # Convert RGB -> BGR for OpenCV
        img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
        # ---- File names ----
        img_filename = f"{base_name}_{idx_str}.jpg"
        depth_filename = f"{base_name}_{idx_str}.mat"

        img_path = os.path.join(output_dir, img_filename)
        depth_path = os.path.join(output_dir, depth_filename)

        # ---- Save files ----
        cv2.imwrite(img_path, img_bgr)

        sio.savemat(
            depth_path,
            {
                "depth": depth.astype(np.float32),
                "K": K.astype(np.float32),
            }
        )

    print(f"Saved {len(preds)} frames to {output_dir}")

In [35]:
# Destination folder for the per-frame .jpg/.mat pairs.
# NOTE(review): this is the folder the input images were loaded from, so the
# generated pl_XXXX.jpg files end up next to the originals - confirm this is intended.
output_dir = "/Users/jpc/Nextcloud/IST/Escolaridade/PIV2025/AulasPraticas/aula7_3D_registration/plondres/"

# `d` is the list of per-view numpy dictionaries built above
save_preds_as_assignment_format(d, output_dir, base_name="pl", start_index=0)

Saved 4 frames to /Users/jpc/Nextcloud/IST/Escolaridade/PIV2025/AulasPraticas/aula7_3D_registration/plondres/


In [27]:

# Dump every per-view prediction dict into one .mat file under the key "d".
# Uses the `sio` alias from the imports cell instead of re-importing savemat here.
sio.savemat("/Users/jpc/Nextcloud/IST/Escolaridade/PIV2025/AulasPraticas/aula7_3D_registration/plondres/plondres.mat" ,{"d":d})


In [6]:
# Drop the notebook's reference to the model so its memory can be reclaimed.
del model