In [None]:
import torch
import numpy as np
import cv2
from transformers import AutoImageProcessor, Dinov2Model
from PIL import Image
import matplotlib.pyplot as plt

# === Load DINOv2 ===
# Load the image processor and the backbone from the same checkpoint so the
# preprocessing matches what the model expects.
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
model = Dinov2Model.from_pretrained("facebook/dinov2-base")
model.eval()  # inference only: put the model in evaluation mode

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Using device: {device}")
model.to(device)

# === Load image ===
image_path = "../io/images/color_0000.png"
# The context manager releases the underlying file handle as soon as the
# RGB copy has been created.
with Image.open(image_path) as src_img:
    pil_img = src_img.convert("RGB")

# Preprocess and move every tensor to the same device as the model.
inputs = processor(images=pil_img, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

# === Extract features ===
with torch.no_grad():
    outputs = model(**inputs)

# Get the last hidden state as patch embeddings
# Shape: (batch_size, num_patches+1, feature_dim)
patch_embeddings = outputs.last_hidden_state[0, 1:, :]  # skip [CLS] token

# Reshape to 2D feature map
num_patches, feature_dim = patch_embeddings.shape

# Derive the patch grid from the tensor that was actually fed to the model
# instead of assuming it is square, so non-square inputs are laid out
# correctly rather than being truncated.
patch_size = model.config.patch_size
grid_h = inputs["pixel_values"].shape[-2] // patch_size
grid_w = inputs["pixel_values"].shape[-1] // patch_size
if grid_h * grid_w != num_patches:
    # Token count does not match the expected grid; fall back to the
    # square-grid heuristic (extra tokens beyond the square are dropped).
    grid_h = grid_w = int(np.sqrt(num_patches))
grid_size = grid_h  # kept for later cells; equals the side length for square inputs
feature_map = patch_embeddings[: grid_h * grid_w].reshape(grid_h, grid_w, feature_dim)

# Compute a simple visualization: mean over feature_dim
feature_map_vis = feature_map.mean(dim=-1).cpu().numpy()

# Min-max normalise to [0, 1]; a constant map would otherwise divide by zero
# and turn into NaNs.
vis_min = feature_map_vis.min()
vis_range = feature_map_vis.max() - vis_min
if vis_range > 0:
    feature_map_vis = (feature_map_vis - vis_min) / vis_range
else:
    feature_map_vis = np.zeros_like(feature_map_vis)

# cv2.resize takes dsize as (width, height), which is what PIL's .size gives.
# NOTE(review): if the processor resizes/crops the image, the map covers only
# the model's input region and is stretched here to the full image — confirm
# whether that is acceptable for this visualization.
feature_map_vis = cv2.resize(feature_map_vis, pil_img.size)

# === Show original image and feature map ===
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(pil_img)
plt.axis("off")

plt.subplot(1, 2, 2)
plt.title("DINOv2 Feature Map")
plt.imshow(feature_map_vis, cmap="viridis")
plt.axis("off")

plt.show()