In [1]:
import os
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

In [30]:
import json
import numpy as np
import h5py
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import pipeline
from transformers import AutoImageProcessor, AutoModel
from transformers import CLIPModel, CLIPProcessor
import torch
import os
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity 
import pandas as pd

In [3]:
# map
with open("../data02/database/database_lite.json","r") as f:
    m_idx = json.load(f)
    m_imgs = np.array(m_idx["im_paths"])
    m_loc=np.array(m_idx["loc"])

# query
with open("../data02/query/query_lite.json","r") as f:
    q_idx=json.load(f)
    q_imgs=np.array(q_idx["im_paths"])
    q_loc=np.array(q_idx["loc"])
    
# loading the relevance judgements
with h5py.File("../data02/london_lite_gt.h5","r") as f:
   fovs = f["fov"][:]
   sim = f["sim"][:].astype(np.uint8)

In [4]:
from huggingface_hub import login
login(new_session=False)

In [33]:
def recall(ranks, pidx, ks):
    recall_at_k = np.zeros(len(ks))
    for qidx in range(ranks.shape[0]):
        for i, k in enumerate(ks):
            if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:
                recall_at_k[i:] += 1
                break

    recall_at_k /= ranks.shape[0]
    return recall_at_k

def apk(pidx, rank, k):
    if len(rank)>k:
        rank = rank[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(rank):
        if p in pidx and p not in rank[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(pidx), k)

def mapk(ranks, pidxs, k):

    return np.mean([apk(a,p,k) for a,p in zip(pidxs, ranks)])

def mapk_many(ranks, pidxs, ks):
    return np.array([mapk(ranks, pidxs, k) for k in ks], dtype=float)

def average_precision(relevant, retrieved):
   precisions = []
   rel = 0
   for i in range(0, len(retrieved)):
      if retrieved[i] in relevant:
         rel += 1
         precisions.append(rel/(i+1))
   return sum(precisions) / len(relevant)

def mean_average_precision(all_relevant, all_retrieved):
   total = 0
   count = 0
   for qid in all_relevant: 
      total += average_precision(all_relevant[qid], all_retrieved.get(qid, []))
      count += 1
   return total / count


def l2_normalize(x, axis=1, eps=1e-12):
   norm = np.linalg.norm(x, axis=axis, keepdims=True)
   return x / (norm + eps)

def get_relevant_images(gt_similarity_matrix, query_idx):
   return np.where(gt_similarity_matrix[query_idx, :] == 1)[0]

def get_retrieved_images(feature_matrix, query_idx):
   return np.argsort(-feature_matrix[query_idx])

def save_results_to_csv(model_name, map_value, recall_at_k, mAPs, csv_path="./results/feature_extraction_evaluation.csv"):
    results_dict = {
        "models_name": model_name,
        "MAP": map_value * 100,
        "Recall@1": recall_at_k[0] * 100,
        "Recall@5": recall_at_k[1] * 100,
        "Recall@10": recall_at_k[2] * 100,
        "Recall@20": recall_at_k[3] * 100,
        "mAP@1": mAPs[0] * 100,
        "mAP@5": mAPs[1] * 100,
        "mAP@10": mAPs[2] * 100,
        "mAP@20": mAPs[3] * 100
    }

    os.makedirs(os.path.dirname(csv_path), exist_ok=True)
    
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        if model_name in df['models_name'].values:
            df.loc[df['models_name'] == model_name] = pd.Series(results_dict)
        else:
            df = pd.concat([df, pd.DataFrame([results_dict])], ignore_index=True)
    else:
        df = pd.DataFrame([results_dict])
    
    df.to_csv(csv_path, index=False)
    return df


In [42]:
CLIP_MODELS = {
    "geolocal/StreetCLIP": 1024,
    "openai/clip-vit-large-patch14": 1024,
    "openai/clip-vit-base-patch16": 768,
    "openai/clip-vit-base-patch32": 768,
}

In [44]:
for MODEL_TO_USE, feat_dim in CLIP_MODELS.items():
    print(f"\n{'='*80}")
    print(f"Loading model: {MODEL_TO_USE}")
    print(f"{'='*80}\n")
    
    try:
        model = CLIPModel.from_pretrained(MODEL_TO_USE)
        processor = CLIPProcessor.from_pretrained(MODEL_TO_USE)
        device = "cuda:1" if torch.cuda.is_available() else "cpu"
        model = model.to(device)
        
    except Exception as e:
        print(e)
        continue
    
    m_feats_cls = np.zeros((len(m_imgs), feat_dim), dtype=np.float32)
    m_feats_mean = np.zeros((len(m_imgs), feat_dim), dtype=np.float32)
    m_feats_mean_no_cls = np.zeros((len(m_imgs), feat_dim), dtype=np.float32)
    m_feats_max = np.zeros((len(m_imgs), feat_dim), dtype=np.float32)
    m_feats_gem = np.zeros((len(m_imgs), feat_dim), dtype=np.float32)
    
    p = 3.0
    
    for i, img_name in enumerate(tqdm(m_imgs)):
        img = Image.open(os.path.join('../data02/', img_name))
        inputs = processor(images=img, return_tensors="pt").to(device)
        
        with torch.inference_mode():
            vision_outputs = model.vision_model(**inputs)
        
        m_feats_cls[i] = vision_outputs.pooler_output[0].cpu().numpy()
        m_feats_mean[i] = vision_outputs.last_hidden_state.mean(dim=1)[0].cpu().numpy()
        m_feats_mean_no_cls[i] = vision_outputs.last_hidden_state[:, 1:, :].mean(dim=1)[0].cpu().numpy()
        m_feats_max[i] = vision_outputs.last_hidden_state.max(dim=1)[0][0].cpu().numpy()
        m_feats_gem[i] = vision_outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()
    
    q_feats_cls = np.zeros((len(q_imgs), feat_dim), dtype=np.float32)
    q_feats_mean = np.zeros((len(q_imgs), feat_dim), dtype=np.float32)
    q_feats_mean_no_cls = np.zeros((len(q_imgs), feat_dim), dtype=np.float32)
    q_feats_max = np.zeros((len(q_imgs), feat_dim), dtype=np.float32)
    q_feats_gem = np.zeros((len(q_imgs), feat_dim), dtype=np.float32)
    
    for i, img_name in enumerate(tqdm(q_imgs)):
        img = Image.open(os.path.join('../data02/', img_name))
        inputs = processor(images=img, return_tensors="pt").to(device)
        
        with torch.inference_mode():
            vision_outputs = model.vision_model(**inputs)
        
        q_feats_cls[i] = vision_outputs.pooler_output[0].cpu().numpy()
        q_feats_mean[i] = vision_outputs.last_hidden_state.mean(dim=1)[0].cpu().numpy()
        q_feats_mean_no_cls[i] = vision_outputs.last_hidden_state[:, 1:, :].mean(dim=1)[0].cpu().numpy()
        q_feats_max[i] = vision_outputs.last_hidden_state.max(dim=1)[0][0].cpu().numpy()
        q_feats_gem[i] = vision_outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()
    
    m_feats_cls = l2_normalize(m_feats_cls, axis=1)
    m_feats_mean = l2_normalize(m_feats_mean, axis=1)
    m_feats_mean_no_cls = l2_normalize(m_feats_mean_no_cls, axis=1)
    m_feats_max = l2_normalize(m_feats_max, axis=1)
    m_feats_gem = l2_normalize(m_feats_gem, axis=1)
    
    q_feats_cls = l2_normalize(q_feats_cls, axis=1)
    q_feats_mean = l2_normalize(q_feats_mean, axis=1)
    q_feats_mean_no_cls = l2_normalize(q_feats_mean_no_cls, axis=1)
    q_feats_max = l2_normalize(q_feats_max, axis=1)
    q_feats_gem = l2_normalize(q_feats_gem, axis=1)
    
    pooling_strategies = {
        "CLS_token": (q_feats_cls, m_feats_cls),
        "Mean_pooling": (q_feats_mean, m_feats_mean),
        "Mean_no_CLS": (q_feats_mean_no_cls, m_feats_mean_no_cls),
        "Max_pooling": (q_feats_max, m_feats_max),
        "GeM_pooling": (q_feats_gem, m_feats_gem),
    }
    
    all_results = []
    best = None
    best_map = 0.0
    best_recall_at_k = None
    best_mAPs = None
    
    for pooling_name, (q_feats, m_feats) in pooling_strategies.items():
        print(f"{'='*60}")
        print(f"Evaluating: {pooling_name}")
        print(f"{'='*60}")
        
        similarities = cosine_similarity(q_feats, m_feats)
        
        all_rel = {}
        all_ret = {}
        for query_idx in range(len(similarities)):
            all_rel[query_idx] = get_relevant_images(sim, query_idx)
            all_ret[query_idx] = get_retrieved_images(similarities, query_idx)

        ranks = np.argsort(-similarities, axis=1) 
        
        Q = similarities.shape[0]
        pidx = [np.array(all_rel[q], dtype=int) for q in range(Q)]
        
        ks = [1, 5, 10, 20]
        recall_at_k = recall(ranks, pidx, ks)
        mAPs = mapk_many(ranks, pidx, ks)
        map_value = mean_average_precision(all_rel, all_ret)
        
        if best is None or map_value > best_map:
            best_map = map_value
            best = pooling_name
            best_recall_at_k = recall_at_k
            best_mAPs = mAPs
        
        print(f"MODEL: {MODEL_TO_USE}")
        print(f"Pooling: {pooling_name}")
        print(f"MAP: {map_value*100:.2f}%")
        for k, r, m in zip(ks, recall_at_k, mAPs):
            print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")
        print("\n")
        
        result_dict = {
            "models_name": f"{MODEL_TO_USE}_{pooling_name}",
            "pooling_strategy": pooling_name,
            "MAP": map_value * 100,
            "Recall@1": recall_at_k[0] * 100,
            "Recall@5": recall_at_k[1] * 100,
            "Recall@10": recall_at_k[2] * 100,
            "Recall@20": recall_at_k[3] * 100,
            "mAP@1": mAPs[0] * 100,
            "mAP@5": mAPs[1] * 100,
            "mAP@10": mAPs[2] * 100,
            "mAP@20": mAPs[3] * 100
        }
        all_results.append(result_dict)
    
    df_results = pd.DataFrame(all_results)
    CSV_PATH = f"./results/pooling_comparison/{MODEL_TO_USE.replace('/', '_')}_pooling_comparison.csv"
    os.makedirs(os.path.dirname(CSV_PATH), exist_ok=True)
    df_results.to_csv(CSV_PATH, index=False)
    
    df_main = save_results_to_csv(f"{MODEL_TO_USE}_{best}", best_map, best_recall_at_k, best_mAPs)
    
    del model
    if torch.cuda.is_available(): torch.cuda.empty_cache()
    
    print("\n")


Loading model: geolocal/StreetCLIP



100%|██████████| 1000/1000 [01:55<00:00,  8.67it/s]
100%|██████████| 500/500 [00:59<00:00,  8.37it/s]
  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


Evaluating: CLS_token
MODEL: geolocal/StreetCLIP
Pooling: CLS_token
MAP: 21.04%
Recall@1: 27.00%   mAP@1: 27.00%
Recall@5: 46.00%   mAP@5: 15.24%
Recall@10: 60.00%   mAP@10: 15.17%
Recall@20: 72.80%   mAP@20: 17.05%


Evaluating: Mean_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: geolocal/StreetCLIP
Pooling: Mean_pooling
MAP: 25.73%
Recall@1: 30.00%   mAP@1: 30.00%
Recall@5: 55.20%   mAP@5: 20.68%
Recall@10: 65.00%   mAP@10: 20.28%
Recall@20: 73.80%   mAP@20: 22.10%


Evaluating: Mean_no_CLS


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: geolocal/StreetCLIP
Pooling: Mean_no_CLS
MAP: 25.71%
Recall@1: 30.00%   mAP@1: 30.00%
Recall@5: 55.40%   mAP@5: 20.71%
Recall@10: 65.00%   mAP@10: 20.26%
Recall@20: 73.80%   mAP@20: 22.09%


Evaluating: Max_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: geolocal/StreetCLIP
Pooling: Max_pooling
MAP: 16.05%
Recall@1: 25.20%   mAP@1: 25.20%
Recall@5: 47.40%   mAP@5: 14.42%
Recall@10: 60.60%   mAP@10: 13.17%
Recall@20: 74.60%   mAP@20: 13.41%


Evaluating: GeM_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: geolocal/StreetCLIP
Pooling: GeM_pooling
MAP: 25.49%
Recall@1: 32.60%   mAP@1: 32.60%
Recall@5: 55.80%   mAP@5: 21.34%
Recall@10: 66.00%   mAP@10: 20.83%
Recall@20: 74.60%   mAP@20: 22.14%





Loading model: openai/clip-vit-large-patch14



100%|██████████| 1000/1000 [01:05<00:00, 15.28it/s]
100%|██████████| 500/500 [00:33<00:00, 15.05it/s]
  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


Evaluating: CLS_token
MODEL: openai/clip-vit-large-patch14
Pooling: CLS_token
MAP: 19.11%
Recall@1: 22.00%   mAP@1: 22.00%
Recall@5: 46.40%   mAP@5: 13.46%
Recall@10: 60.60%   mAP@10: 13.61%
Recall@20: 74.60%   mAP@20: 15.10%


Evaluating: Mean_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-large-patch14
Pooling: Mean_pooling
MAP: 28.64%
Recall@1: 32.40%   mAP@1: 32.40%
Recall@5: 55.40%   mAP@5: 22.51%
Recall@10: 67.20%   mAP@10: 22.22%
Recall@20: 80.40%   mAP@20: 24.78%


Evaluating: Mean_no_CLS


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-large-patch14
Pooling: Mean_no_CLS
MAP: 28.64%
Recall@1: 32.60%   mAP@1: 32.60%
Recall@5: 55.20%   mAP@5: 22.50%
Recall@10: 67.00%   mAP@10: 22.24%
Recall@20: 80.40%   mAP@20: 24.80%


Evaluating: Max_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-large-patch14
Pooling: Max_pooling
MAP: 17.24%
Recall@1: 25.60%   mAP@1: 25.60%
Recall@5: 49.00%   mAP@5: 15.10%
Recall@10: 58.80%   mAP@10: 14.16%
Recall@20: 70.40%   mAP@20: 14.49%


Evaluating: GeM_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-large-patch14
Pooling: GeM_pooling
MAP: 26.55%
Recall@1: 30.20%   mAP@1: 30.20%
Recall@5: 52.60%   mAP@5: 21.45%
Recall@10: 65.40%   mAP@10: 20.88%
Recall@20: 81.20%   mAP@20: 22.77%





Loading model: openai/clip-vit-base-patch16



100%|██████████| 1000/1000 [00:18<00:00, 55.52it/s]
100%|██████████| 500/500 [00:28<00:00, 17.30it/s]
  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


Evaluating: CLS_token
MODEL: openai/clip-vit-base-patch16
Pooling: CLS_token
MAP: 20.70%
Recall@1: 26.60%   mAP@1: 26.60%
Recall@5: 49.00%   mAP@5: 17.27%
Recall@10: 60.80%   mAP@10: 16.23%
Recall@20: 72.00%   mAP@20: 17.11%


Evaluating: Mean_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch16
Pooling: Mean_pooling
MAP: 24.92%
Recall@1: 31.00%   mAP@1: 31.00%
Recall@5: 51.80%   mAP@5: 19.65%
Recall@10: 64.00%   mAP@10: 19.09%
Recall@20: 76.80%   mAP@20: 20.89%


Evaluating: Mean_no_CLS


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch16
Pooling: Mean_no_CLS
MAP: 24.87%
Recall@1: 31.00%   mAP@1: 31.00%
Recall@5: 51.40%   mAP@5: 19.60%
Recall@10: 64.20%   mAP@10: 19.05%
Recall@20: 76.60%   mAP@20: 20.84%


Evaluating: Max_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch16
Pooling: Max_pooling
MAP: 21.02%
Recall@1: 28.00%   mAP@1: 28.00%
Recall@5: 49.80%   mAP@5: 16.99%
Recall@10: 66.20%   mAP@10: 16.63%
Recall@20: 79.40%   mAP@20: 17.48%


Evaluating: GeM_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch16
Pooling: GeM_pooling
MAP: 21.92%
Recall@1: 28.40%   mAP@1: 28.40%
Recall@5: 48.80%   mAP@5: 16.95%
Recall@10: 64.80%   mAP@10: 16.76%
Recall@20: 76.40%   mAP@20: 18.12%





Loading model: openai/clip-vit-base-patch32



100%|██████████| 1000/1000 [00:10<00:00, 95.56it/s]
100%|██████████| 500/500 [00:05<00:00, 95.93it/s]
  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


Evaluating: CLS_token
MODEL: openai/clip-vit-base-patch32
Pooling: CLS_token
MAP: 20.10%
Recall@1: 25.20%   mAP@1: 25.20%
Recall@5: 47.20%   mAP@5: 16.26%
Recall@10: 63.40%   mAP@10: 15.90%
Recall@20: 72.00%   mAP@20: 16.57%


Evaluating: Mean_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch32
Pooling: Mean_pooling
MAP: 25.37%
Recall@1: 35.60%   mAP@1: 35.60%
Recall@5: 55.60%   mAP@5: 20.79%
Recall@10: 65.60%   mAP@10: 19.86%
Recall@20: 75.00%   mAP@20: 21.27%


Evaluating: Mean_no_CLS


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch32
Pooling: Mean_no_CLS
MAP: 25.44%
Recall@1: 35.40%   mAP@1: 35.40%
Recall@5: 55.60%   mAP@5: 20.86%
Recall@10: 65.40%   mAP@10: 19.93%
Recall@20: 75.00%   mAP@20: 21.31%


Evaluating: Max_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch32
Pooling: Max_pooling
MAP: 22.22%
Recall@1: 32.60%   mAP@1: 32.60%
Recall@5: 58.40%   mAP@5: 18.76%
Recall@10: 70.40%   mAP@10: 17.68%
Recall@20: 82.00%   mAP@20: 18.53%


Evaluating: GeM_pooling


  if np.sum(np.in1d(ranks[qidx,:k], pidx[qidx])) > 0:


MODEL: openai/clip-vit-base-patch32
Pooling: GeM_pooling
MAP: 28.10%
Recall@1: 38.80%   mAP@1: 38.80%
Recall@5: 61.40%   mAP@5: 24.62%
Recall@10: 72.80%   mAP@10: 23.33%
Recall@20: 82.20%   mAP@20: 24.15%






In [45]:
df_final = pd.read_csv("./results/feature_extraction_evaluation.csv")
print("\n=== FINAL MODEL COMPARISON (Best pooling for each) ===")
print(df_final[['models_name', 'MAP', 'Recall@1', 'Recall@20']].sort_values('MAP', ascending=False).to_string(index=False))


=== FINAL MODEL COMPARISON (Best pooling for each) ===
                                                 models_name       MAP  Recall@1  Recall@20
    facebook/dinov3-vith16plus-pretrain-lvd1689m_GeM_pooling 36.461502      44.0       82.4
        facebook/dinov3-vitb16-pretrain-lvd1689m_GeM_pooling 34.103124      41.4       85.4
        facebook/dinov3-vitl16-pretrain-lvd1689m_Max_pooling 34.050443      40.4       80.2
          facebook/dinov3-vits16-pretrain-lvd1689m_CLS_token 30.767844      38.0       77.6
      facebook/dinov3-vits16plus-pretrain-lvd1689m_CLS_token 29.901573      33.6       73.6
                   openai/clip-vit-large-patch14_Mean_no_CLS 28.641709      32.6       80.4
                    openai/clip-vit-base-patch32_GeM_pooling 28.097766      38.8       82.2
                            geolocal/StreetCLIP_Mean_pooling 25.725327      30.0       73.8
                   openai/clip-vit-base-patch16_Mean_pooling 24.915458      31.0       76.8
 facebook/dinov3-convnex