In [None]:
import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
import tqdm
from transformers import pipeline
from transformers.image_utils import load_image
from transformers import AutoImageProcessor, AutoModel
import torch
import os
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity 


In [None]:
# map
with open("../assignment2/FIR-02/data02/database/database_lite.json","r") as f:
    m_idx = json.load(f)
    m_imgs = np.array(m_idx["im_paths"])
    m_loc=np.array(m_idx["loc"])

# query
with open("../assignment2/FIR-02/data02/query/query_lite.json","r") as f:
    q_idx=json.load(f)
    q_imgs=np.array(q_idx["im_paths"])
    q_loc=np.array(q_idx["loc"])
    
# loading the relevance judgements
with h5py.File("../assignment2/FIR-02/data02/london_lite_gt.h5","r") as f:
   fovs = f["fov"][:]
   sim = f["sim"][:].astype(np.uint8)

In [None]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
def recall_at_k(all_relevant,all_retrieved, k):
   total = 0
   count = 0
   for qid in all_relevant:
      count+= sum(1 for r in all_retrieved[qid][:k] if r in all_relevant[qid]) 
      total+= len(all_relevant[qid])
   
   return count / total
 
def average_precision(relevant, retrieved):
   precisions = []
   rel = 0
   for i in range(0, len(retrieved)):
      if retrieved[i] in relevant:
         rel += 1
         precisions.append(rel/(i+1))
   return sum(precisions) / len(relevant)

def mean_average_precision(all_relevant, all_retrieved):
   total = 0
   count = 0
   for qid in all_relevant: 
      total += average_precision(all_relevant[qid], all_retrieved.get(qid, []))
      count += 1
   return total / count

def l2_normalize(x, axis=1, eps=1e-12):
   norm = np.linalg.norm(x, axis=axis, keepdims=True)
   return x / (norm + eps)

def get_relevant_images(gt_similarity_matrix, query_idx):
   return np.where(gt_similarity_matrix[query_idx, :] == 1)[0]

def get_retrieved_images(feature_matrix, query_idx):
   return np.argsort(feature_matrix[query_idx])

In [None]:
DINO_MODELS = [
   "facebook/dinov3-vits16-pretrain-lvd1689m",
   "facebook/dinov3-vits16plus-pretrain-lvd1689m",
   "facebook/dinov3-vitb16-pretrain-lvd1689m",
   "facebook/dinov3-vitl16-pretrain-lvd1689m",
   "facebook/dinov3-vith16plus-pretrain-lvd1689m",
   "facebook/dinov3-vit7b16-pretrain-lvd1689m",
   "facebook/dinov3-convnext-base-pretrain-lvd1689m",
   "facebook/dinov3-convnext-large-pretrain-lvd1689m",
   "facebook/dinov3-convnext-small-pretrain-lvd1689m",
   "facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
   "facebook/dinov3-vitl16-pretrain-sat493m",
   "facebook/dinov3-vit7b16-pretrain-sat493m"
]

MODEL_TO_USE = "facebook/dinov3-convnext-tiny-pretrain-lvd1689m"

In [None]:
feature_extractor = pipeline(
    model="facebook/dinov3-convnext-tiny-pretrain-lvd1689m",
    task="image-feature-extraction", 
)

pretrained_model_name = "facebook/dinov3-convnext-tiny-pretrain-lvd1689m"
processor = AutoImageProcessor.from_pretrained(pretrained_model_name)
model = AutoModel.from_pretrained(
    pretrained_model_name, 
    device_map="auto", 
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/111M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


preprocessor_config.json:   0%|          | 0.00/585 [00:00<?, ?B/s]

Device set to use cpu


In [None]:
m_feats = np.zeros((len(m_imgs), 768), dtype=np.float32)
for i,img_name in enumerate(m_imgs):
   img = plt.imread(os.path.join('../assignment2/FIR-02/data02/', img_name))
   img = Image.fromarray(img)
   # m_feats[i] = np.array(feature_extractor(img)[0])
   inputs = processor(images=img, return_tensors="pt").to(model.device)
   with torch.inference_mode():
      outputs = model(**inputs)
   m_feats[i] = outputs.pooler_output[0]

In [None]:
q_feats = np.zeros((len(m_imgs), 768), dtype=np.float32)
for i,img_name in enumerate(q_imgs):
   img = plt.imread(os.path.join('../assignment2/FIR-02/data02/', img_name))
   img = Image.fromarray(img)
   # m_feats[i] = np.array(feature_extractor(img)[0])
   inputs = processor(images=img, return_tensors="pt").to(model.device)
   with torch.inference_mode():
      outputs = model(**inputs)
   q_feats[i] = outputs.pooler_output[0]

In [63]:
similarities = np.zeros((len(q_feats), len(m_feats)))
for i, q_feat in enumerate(q_feats):
   for j,m_feat in enumerate(m_feats):
      similarities[i][j] = cosine_similarity(q_feat,m_feat.T)

In [None]:
all_rel, all_ret = {},{}
for query_idx in range(len(similarities)):
   all_rel[query_idx] = get_relevant_images(sim, query_idx)
   all_ret[query_idx] = get_retrieved_images(similarities,query_idx )

In [97]:
map = mean_average_precision(all_rel, all_ret)
recall_at_1 = recall_at_k(all_rel, all_ret, 1)
recall_at_5 = recall_at_k(all_rel, all_ret, 5)
recall_at_10 = recall_at_k(all_rel, all_ret, 10)
recall_at_20 = recall_at_k(all_rel, all_ret, 20)

In [98]:
print(f"MAP : {map*100:.2f}%")
print(f"recall_at_1 : {recall_at_1}")
print(f"recall_at_5 : {recall_at_1}")
print(f"recall_at_10 : {recall_at_1}")
print(f"recall_at_20 : {recall_at_1}")


MAP : 60.11%
recall_at_1 : 0.6666666666666666
recall_at_5 : 0.6666666666666666
recall_at_10 : 0.6666666666666666
recall_at_20 : 0.6666666666666666
