In [1]:
import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, AutoModel
from transformers import CLIPModel, CLIPProcessor
import torch
import os
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity 
from tqdm import tqdm
import pandas as pd
from PIL import Image

Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu126 for torchao version 0.13.0


In [2]:
# map
with open("../data/database/database_lite.json","r") as f:
    m_idx = json.load(f)
    m_imgs = np.array(m_idx["im_paths"])
    m_loc=np.array(m_idx["loc"])

# query
with open("../data/query/query_lite.json","r") as f:
    q_idx=json.load(f)
    q_imgs=np.array(q_idx["im_paths"])
    q_loc=np.array(q_idx["loc"])
    
# loading the relevance judgements
with h5py.File("../data/london_lite_gt.h5","r") as f:
   fovs = f["fov"][:]
   sim = f["sim"][:].astype(np.uint8)

In [3]:
from huggingface_hub import login
login(new_session=False)

In [4]:
def recall(ranks, pidx, ks):
    recall_at_k = np.zeros(len(ks))
    for qidx in range(ranks.shape[0]):
        for i, k in enumerate(ks):
            if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:
                recall_at_k[i:] += 1
                break

    recall_at_k /= ranks.shape[0]
    return recall_at_k

def apk(pidx, rank, k):
    if len(rank)>k:
        rank = rank[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(rank):
        if p in pidx and p not in rank[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(pidx), k)

def mapk(ranks, pidxs, k):

    return np.mean([apk(a,p,k) for a,p in zip(pidxs, ranks)])

def mapk_many(ranks, pidxs, ks):
    return np.array([mapk(ranks, pidxs, k) for k in ks], dtype=float)

def average_precision(relevant, retrieved):
   precisions = []
   rel = 0
   for i in range(0, len(retrieved)):
      if retrieved[i] in relevant:
         rel += 1
         precisions.append(rel/(i+1))
   return sum(precisions) / len(relevant)

def mean_average_precision(all_relevant, all_retrieved):
   total = 0
   count = 0
   for qid in all_relevant: 
      total += average_precision(all_relevant[qid], all_retrieved.get(qid, []))
      count += 1
   return total / count


def l2_normalize(x, axis=1, eps=1e-12):
   norm = np.linalg.norm(x, axis=axis, keepdims=True)
   return x / (norm + eps)

def get_relevant_images(gt_similarity_matrix, query_idx):
   return np.where(gt_similarity_matrix[query_idx, :] == 1)[0]

def get_retrieved_images(feature_matrix, query_idx):
   return np.argsort(-feature_matrix[query_idx])

def save_results_to_csv(model_name, map_value, recall_at_k, mAPs, csv_path="./results/feature_extraction_evaluation.csv"):
    results_dict = {
        "models_name": model_name,
        "MAP": map_value * 100,
        "Recall@1": recall_at_k[0] * 100,
        "Recall@5": recall_at_k[1] * 100,
        "Recall@10": recall_at_k[2] * 100,
        "Recall@20": recall_at_k[3] * 100,
        "mAP@1": mAPs[0] * 100,
        "mAP@5": mAPs[1] * 100,
        "mAP@10": mAPs[2] * 100,
        "mAP@20": mAPs[3] * 100
    }

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        if model_name in df['models_name'].values:
            df.loc[df['models_name'] == model_name] = pd.Series(results_dict)
        else:
            df = pd.concat([df, pd.DataFrame([results_dict])], ignore_index=True)
    else:
        df = pd.DataFrame([results_dict])
    
    df.to_csv(csv_path, index=False)
    return df


In [5]:
# map
with open("../data/database/database_lite.json","r") as f:
    m_idx = json.load(f)
    m_imgs = np.array(m_idx["im_paths"])
    m_loc=np.array(m_idx["loc"])

# query
with open("../data/query/query_lite.json","r") as f:
    q_idx=json.load(f)
    q_imgs=np.array(q_idx["im_paths"])
    q_loc=np.array(q_idx["loc"])
    
# loading the relevance judgements
with h5py.File("../data/london_lite_gt.h5","r") as f:
   fovs = f["fov"][:]
   sim = f["sim"][:].astype(np.uint8)

In [6]:
from huggingface_hub import login
login(new_session=False)

In [7]:
def recall(ranks, pidx, ks):
    recall_at_k = np.zeros(len(ks))
    for qidx in range(ranks.shape[0]):
        for i, k in enumerate(ks):
            if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:
                recall_at_k[i:] += 1
                break

    recall_at_k /= ranks.shape[0]
    return recall_at_k

def apk(pidx, rank, k):
    if len(rank)>k:
        rank = rank[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(rank):
        if p in pidx and p not in rank[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(pidx), k)

def mapk(ranks, pidxs, k):

    return np.mean([apk(a,p,k) for a,p in zip(pidxs, ranks)])

def mapk_many(ranks, pidxs, ks):
    return np.array([mapk(ranks, pidxs, k) for k in ks], dtype=float)

def average_precision(relevant, retrieved):
   precisions = []
   rel = 0
   for i in range(0, len(retrieved)):
      if retrieved[i] in relevant:
         rel += 1
         precisions.append(rel/(i+1))
   return sum(precisions) / len(relevant)

def mean_average_precision(all_relevant, all_retrieved):
   total = 0
   count = 0
   for qid in all_relevant: 
      total += average_precision(all_relevant[qid], all_retrieved.get(qid, []))
      count += 1
   return total / count


def l2_normalize(x, axis=1, eps=1e-12):
   norm = np.linalg.norm(x, axis=axis, keepdims=True)
   return x / (norm + eps)

def get_relevant_images(gt_similarity_matrix, query_idx):
   return np.where(gt_similarity_matrix[query_idx, :] == 1)[0]

def get_retrieved_images(feature_matrix, query_idx):
   return np.argsort(-feature_matrix[query_idx])

def save_results_to_csv(model_name, map_value, recall_at_k, mAPs, csv_path="./results/feature_extraction_evaluation.csv"):
    results_dict = {
        "models_name": model_name,
        "MAP": map_value * 100,
        "Recall@1": recall_at_k[0] * 100,
        "Recall@5": recall_at_k[1] * 100,
        "Recall@10": recall_at_k[2] * 100,
        "Recall@20": recall_at_k[3] * 100,
        "mAP@1": mAPs[0] * 100,
        "mAP@5": mAPs[1] * 100,
        "mAP@10": mAPs[2] * 100,
        "mAP@20": mAPs[3] * 100
    }

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        if model_name in df['models_name'].values:
            df.loc[df['models_name'] == model_name] = pd.Series(results_dict)
        else:
            df = pd.concat([df, pd.DataFrame([results_dict])], ignore_index=True)
    else:
        df = pd.DataFrame([results_dict])
    
    df.to_csv(csv_path, index=False)
    return df


In [8]:
DINOV3_MODELS = {
    # ViT models
    "facebook/dinov3-vith16plus-pretrain-lvd1689m": 1280,
}
CLIP_MODELS = {
    "openai/clip-vit-base-patch32": 768,
}

In [9]:
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
device = "cuda:1" if torch.cuda.is_available() else "cpu"
model = model.to(device)

m_feats_gem_clip = np.zeros((len(m_imgs), 768), dtype=np.float32)
p = 3.0

for i, img_name in enumerate(tqdm(m_imgs)):
    img = Image.open(os.path.join('../data/', img_name))
    inputs = processor(images=img, return_tensors="pt").to(device)

    with torch.inference_mode():
        vision_outputs = model.vision_model(**inputs)

    m_feats_gem_clip[i] = vision_outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()

q_feats_gem_clip = np.zeros((len(q_imgs), 768), dtype=np.float32)

for i, img_name in enumerate(tqdm(q_imgs)):
    img = Image.open(os.path.join('../data/', img_name))
    inputs = processor(images=img, return_tensors="pt").to(device)

    with torch.inference_mode():
        vision_outputs = model.vision_model(**inputs)

    q_feats_gem_clip[i] = vision_outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()

m_feats_gem_clip = l2_normalize(m_feats_gem_clip, axis=1)
q_feats_gem_clip = l2_normalize(q_feats_gem_clip, axis=1)

del model, processor
if torch.cuda.is_available(): torch.cuda.empty_cache()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
100%|██████████| 1000/1000 [00:10<00:00, 94.05it/s]
100%|██████████| 500/500 [00:05<00:00, 95.84it/s] 


In [10]:
processor = AutoImageProcessor.from_pretrained("facebook/dinov3-vith16plus-pretrain-lvd1689m")
model = AutoModel.from_pretrained("facebook/dinov3-vith16plus-pretrain-lvd1689m") 
device = "cuda:1" if torch.cuda.is_available() else "cpu"
model = model.to(device)

m_feats_gem_dinov3 = np.zeros((len(m_imgs), 1280), dtype=np.float32)
p = 3.0

for i, img_name in enumerate(tqdm(m_imgs)):
    img = Image.open(os.path.join('../data/', img_name))
    inputs = processor(images=img, return_tensors="pt").to(device)

    with torch.inference_mode():
        outputs = model(**inputs)

    m_feats_gem_dinov3[i] = outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()

q_feats_gem_dinov3 = np.zeros((len(q_imgs), 1280), dtype=np.float32)

for i, img_name in enumerate(tqdm(q_imgs)):
    img = Image.open(os.path.join('../data/', img_name))
    inputs = processor(images=img, return_tensors="pt").to(device)

    with torch.inference_mode():
        outputs = model(**inputs)

    q_feats_gem_dinov3[i] = outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()

m_feats_gem_dinov3 = l2_normalize(m_feats_gem_dinov3, axis=1)
q_feats_gem_dinov3 = l2_normalize(q_feats_gem_dinov3, axis=1)

del model, processor
if torch.cuda.is_available(): torch.cuda.empty_cache()

100%|██████████| 1000/1000 [00:52<00:00, 19.02it/s]
100%|██████████| 500/500 [00:26<00:00, 18.55it/s]


In [11]:
m_feats_c = np.concatenate([m_feats_gem_clip, m_feats_gem_dinov3], axis=1)
q_feats_c = np.concatenate([q_feats_gem_clip, q_feats_gem_dinov3], axis=1)

In [13]:
similarities = cosine_similarity(q_feats_c, m_feats_c)

all_rel = {}
all_ret = {}
for query_idx in range(len(similarities)):
    all_rel[query_idx] = get_relevant_images(sim, query_idx)
    all_ret[query_idx] = get_retrieved_images(similarities, query_idx)

ranks = np.argsort(-similarities, axis=1) 

Q = similarities.shape[0]
pidx = [np.array(all_rel[q], dtype=int) for q in range(Q)]

ks = [1, 5, 10, 20]
recall_at_k = recall(ranks, pidx, ks)
mAPs = mapk_many(ranks, pidx, ks)
map_value = mean_average_precision(all_rel, all_ret)

print(f"MAP: {map_value*100:.2f}%")
for k, r, m in zip(ks, recall_at_k, mAPs):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")
print("\n")


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MAP: 37.45%
Recall@1: 44.80%   mAP@1: 44.80%
Recall@5: 66.60%   mAP@5: 32.70%
Recall@10: 76.00%   mAP@10: 32.23%
Recall@20: 85.00%   mAP@20: 34.03%




In [None]:
import sys
sys.exit(1)

In [None]:
MODEL_TO_USE = "dinov3-vith16plus-pretrain-lvd1689m+clip-vit-base-patch32"
df_main = save_results_to_csv(f"{MODEL_TO_USE}_GEM_pool", map_value, recall_at_k, mAPs)

In [None]:
df_final = pd.read_csv("./results/feature_extraction_evaluation.csv")
print(df_final[['models_name', 'MAP', 'Recall@1', 'Recall@20']].sort_values('Recall@1', ascending=False).to_string(index=False))