## Setup: Upgrade Transformers (Run Once)

# GPS-Enhanced Image Retrieval

In [1]:
import json
import numpy as np
import h5py
import torch
import os
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModel
from transformers import CLIPModel, CLIPProcessor
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pickle
from gps_helpers import cluster_locations, compute_location_centroids, get_cluster_members, compute_gps_distances

Skipping import of cpp extensions due to incompatible torch version 2.9.0+cu126 for torchao version 0.13.0


## Load Data

In [2]:
# Root directory (relative to the notebook's working directory) that holds the
# database/query JSON metadata and the ground-truth HDF5 file read below.
DATA_PATH = "../data"

In [3]:
# Database split: parse the JSON first, then build the arrays once the file is closed.
with open(f"{DATA_PATH}/database/database_lite.json", "r") as f:
    db_data = json.load(f)
db_imgs = np.array(db_data["im_paths"])
db_loc = np.array(db_data["loc"])

# Query split: same two keys as the database file.
with open(f"{DATA_PATH}/query/query_lite.json", "r") as f:
    query_data = json.load(f)
query_imgs = np.array(query_data["im_paths"])
query_loc = np.array(query_data["loc"])

# Ground truth: the "sim" dataset is read fully into memory and cast to uint8.
with h5py.File(f"{DATA_PATH}/london_lite_gt.h5", "r") as f:
    gt_sim = f["sim"][:].astype(np.uint8)

print(f"Database: {len(db_imgs)} images")
print(f"Query: {len(query_imgs)} images")

Database: 1000 images
Query: 500 images


## Evaluation Functions

In [4]:
def recall(ranks, pidx, ks):
    """Fraction of queries with at least one positive among the top-k results.

    Args:
        ranks: 2-D array; row ``q`` lists retrieved database indices for query ``q``.
        pidx: per-query collection of positive database indices (indexed by ``q``).
        ks: cutoffs to evaluate. A hit at ``ks[i]`` is also credited to every
            later entry of ``ks``, so the cutoffs are expected in ascending order.

    Returns:
        Array of length ``len(ks)`` holding recall at each cutoff.
    """
    n_queries = ranks.shape[0]
    hit_counts = np.zeros(len(ks))
    for q in range(n_queries):
        positives = pidx[q]
        # Position (within ks) of the smallest cutoff that already contains a positive.
        first_hit = next(
            (
                pos
                for pos, cutoff in enumerate(ks)
                if np.isin(ranks[q, :cutoff], positives).any()
            ),
            None,
        )
        if first_hit is not None:
            hit_counts[first_hit:] += 1
    return hit_counts / n_queries


def apk(pidx, rank, k):
    """Average precision at cutoff ``k`` for a single query.

    Args:
        pidx: collection of positive (relevant) database indices.
        rank: ranked list/array of retrieved database indices, best first.
        k: cutoff; only the first ``k`` entries of ``rank`` are scored.

    Returns:
        Sum of precision values at each new hit, divided by ``min(len(pidx), k)``.
        Returns 0.0 when there are no positives or ``k`` is not positive, which
        matches ``average_precision`` instead of raising ``ZeroDivisionError``.
    """
    denominator = min(len(pidx), k)
    if denominator <= 0:
        # Nothing can be retrieved correctly; avoid dividing by zero below.
        return 0.0

    if len(rank) > k:
        rank = rank[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(rank):
        # Count a positive only the first time it appears in the ranking.
        if p in pidx and p not in rank[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / denominator


def mapk(ranks, pidxs, k):
    """Mean of ``apk`` over all queries at a single cutoff ``k``.

    ``ranks`` and ``pidxs`` are iterated in lockstep: one ranking and one set of
    positives per query.
    """
    per_query_scores = []
    for positives, ranking in zip(pidxs, ranks):
        per_query_scores.append(apk(positives, ranking, k))
    return np.mean(per_query_scores)


def mapk_many(ranks, pidxs, ks):
    """Evaluate ``mapk`` at every cutoff in ``ks`` and return a float array."""
    scores_by_cutoff = (mapk(ranks, pidxs, cutoff) for cutoff in ks)
    return np.fromiter(scores_by_cutoff, dtype=float)


def average_precision(relevant, retrieved):
    """Average precision of one ranking over its full length.

    Args:
        relevant: collection of relevant item ids.
        retrieved: ranked sequence of retrieved item ids, best first.

    Returns:
        Sum of precision-at-hit values divided by ``len(relevant)``, or 0 when
        ``relevant`` is empty.
    """
    hits = 0
    precision_total = 0
    for position in range(len(retrieved)):
        if retrieved[position] in relevant:
            hits += 1
            precision_total += hits / (position + 1)
    if len(relevant) > 0:
        return precision_total / len(relevant)
    return 0


def mean_average_precision(all_relevant, all_retrieved):
    """Mean of ``average_precision`` over every query id in ``all_relevant``.

    Both arguments are mappings keyed by query id; a query missing from
    ``all_retrieved`` is scored against an empty ranking.
    """
    summed = sum(
        average_precision(all_relevant[qid], all_retrieved.get(qid, []))
        for qid in all_relevant
    )
    return summed / len(all_relevant)


def l2_normalize(x, axis=1, eps=1e-12):
    """Scale ``x`` to unit L2 norm along ``axis``.

    ``eps`` is added to the norm before dividing so all-zero vectors stay zero
    instead of producing NaNs.
    """
    lengths = np.linalg.norm(x, axis=axis, keepdims=True)
    safe_lengths = lengths + eps
    return x / safe_lengths


def get_relevant_images(gt_similarity_matrix, query_idx):
    """Return the column indices whose ground-truth entry equals 1 for ``query_idx``."""
    query_row = gt_similarity_matrix[query_idx, :]
    is_relevant = query_row == 1
    return np.nonzero(is_relevant)[0]

## Extract Features

In [5]:
# Backbone checkpoint used for the single-model experiments.
MODEL_NAME = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
# Descriptor width used when allocating feature and centroid arrays below.
# Presumably the backbone's hidden size — TODO confirm against model.config.
FEAT_DIM = 1280
# Label only: POOLING is not read by any visible cell.
POOLING = "GeM"
# Exponent passed as `p` to extract_features_gem.
GEM_P = 3.0

# Use the first CUDA device when available, otherwise run on CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# NOTE(review): the original note claimed DINOv3 checkpoints are compatible with
# the DINOv2 processor — confirm. The processor class is resolved from the checkpoint.
processor = AutoImageProcessor.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
print(f"Loaded {MODEL_NAME}")

Using device: cuda:0
Loaded facebook/dinov3-vith16plus-pretrain-lvd1689m


In [6]:
def extract_features_gem(image_paths, model, processor, device, p=3.0, data_root="data/"):
    """Extract one L2-normalised GeM-pooled descriptor per image.

    Args:
        image_paths: sequence of image paths relative to ``data_root``.
        model: backbone whose output exposes ``last_hidden_state``.
        processor: image processor producing the model inputs.
        device: torch device the inputs are moved to.
        p: GeM exponent (tokens are clamped to >= 1e-6, raised to ``p``,
            averaged over the token axis, then raised to ``1 / p``).
        data_root: directory the paths are joined onto; defaults to the
            previously hard-coded ``"data/"``.

    Returns:
        ``float32`` array of shape ``(len(image_paths), D)`` with unit-norm rows.
        ``D`` is taken from the model output, so the function no longer depends
        on the current value of the global ``FEAT_DIM`` (only the empty-input
        case still falls back to it to keep the original output shape).
    """
    features = None
    for i, img_path in enumerate(tqdm(image_paths)):
        # Close the file handle as soon as the processor has read the pixels.
        with Image.open(os.path.join(data_root, img_path)) as img:
            inputs = processor(images=img, return_tensors="pt").to(device)
        with torch.inference_mode():
            outputs = model(**inputs)
        # Token 0 is dropped before pooling.
        # NOTE(review): if the backbone emits extra non-patch tokens (e.g. register
        # tokens) after token 0, they are pooled too — confirm this is intended.
        gem_feat = outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1.0 / p)[0]
        if features is None:
            # Allocate lazily so the row width always matches the model output.
            features = np.zeros((len(image_paths), gem_feat.shape[-1]), dtype=np.float32)
        features[i] = gem_feat.cpu().numpy()
    if features is None:
        # No images: keep the original (0, FEAT_DIM) result shape.
        features = np.zeros((0, FEAT_DIM), dtype=np.float32)
    return l2_normalize(features, axis=1)

In [7]:
# Run the backbone over both splits with the same GeM exponent so query and
# database descriptors are directly comparable.
print("Extracting database features...")
db_features = extract_features_gem(db_imgs, model, processor, device, p=GEM_P)

print("Extracting query features...")
query_features = extract_features_gem(query_imgs, model, processor, device, p=GEM_P)

# Shape check: one row per image.
print(f"Database features: {db_features.shape}")
print(f"Query features: {query_features.shape}")

Extracting database features...


100%|██████████| 1000/1000 [00:54<00:00, 18.44it/s]


Extracting query features...


100%|██████████| 500/500 [00:26<00:00, 18.63it/s]

Database features: (1000, 1280)
Query features: (500, 1280)





## GPS Clustering

In [8]:
# Parameters forwarded to cluster_locations.
# NOTE(review): the names suggest DBSCAN-style radius / minimum cluster size;
# the unit of EPS depends on gps_helpers — confirm.
EPS = 50
MIN_SAMPLES = 2

# One cluster label per database image, computed from the database locations only.
db_clusters = cluster_locations(db_loc, eps=EPS, min_samples=MIN_SAMPLES)

# Labels >= 0 are counted as clusters; label -1 is counted as noise.
n_clusters = len(np.unique(db_clusters[db_clusters >= 0]))
n_noise = np.sum(db_clusters == -1)

print(f"Number of location clusters: {n_clusters}")
print(f"Noise points: {n_noise}")


Number of location clusters: 32
Noise points: 0


## Compute Location Centroids

In [9]:
# centroids is iterated as cluster id -> feature vector; cluster_members is later
# indexed by cluster id to obtain database indices (see Approach A).
centroids, cluster_members = compute_location_centroids(db_features, db_clusters)

# Pack the centroids into a dense matrix and keep both directions of the
# cluster-id <-> row-index mapping for later lookups.
centroid_matrix = np.zeros((len(centroids), FEAT_DIM), dtype=np.float32)
cluster_id_to_idx = {}
idx_to_cluster_id = {}
for idx, (cluster_id, centroid) in enumerate(centroids.items()):
    centroid_matrix[idx] = centroid
    cluster_id_to_idx[cluster_id] = idx
    idx_to_cluster_id[idx] = cluster_id

# Rescale each centroid row to unit length.
centroid_matrix = l2_normalize(centroid_matrix, axis=1)
print(f"Computed {len(centroids)} location centroids")

Computed 32 location centroids


## Prepare Ground Truth

In [10]:
# Number of queries.
Q = len(query_imgs)
# Query index -> database indices marked 1 in gt_sim (dict form for mean_average_precision).
all_rel = {q: get_relevant_images(gt_sim, q) for q in range(Q)}
# Same positives as a list of int arrays (form expected by recall / mapk_many).
pidx = [np.array(all_rel[q], dtype=int) for q in range(Q)]
# Cutoffs for Recall@k and mAP@k; kept ascending because recall() credits a hit to all later cutoffs.
ks = [1, 5, 10, 20]

## Baseline: Standard Retrieval (No GPS)

In [11]:
# Query-vs-database cosine similarity: row q scores query q against every database image.
similarities = cosine_similarity(query_features, db_features)
# Negate so the ascending argsort yields most-similar-first database indices.
ranks = np.argsort(-similarities, axis=1)

# Dict form (query index -> ranking) consumed by mean_average_precision.
all_ret = {q: ranks[q] for q in range(Q)}
recall_baseline = recall(ranks, pidx, ks)
mAPs_baseline = mapk_many(ranks, pidx, ks)
# MAP over the full ranking (no cutoff), unlike the mAP@k values above.
map_baseline = mean_average_precision(all_rel, all_ret)

print("=" * 60)
print("BASELINE: Standard Retrieval (No GPS)")
print("=" * 60)
print(f"MAP: {map_baseline*100:.2f}%")
for k, r, m in zip(ks, recall_baseline, mAPs_baseline):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

BASELINE: Standard Retrieval (No GPS)
MAP: 36.46%
Recall@1: 44.00%   mAP@1: 44.00%
Recall@5: 64.80%   mAP@5: 31.74%
Recall@10: 73.40%   mAP@10: 31.21%
Recall@20: 82.40%   mAP@20: 33.09%


## Approach A: Two-Stage Retrieval

In [12]:
# Cosine similarity between every query descriptor and every location centroid
# (one column per row of centroid_matrix).
centroid_similarities = cosine_similarity(query_features, centroid_matrix)

In [13]:
# Stage 1: per query, the centroid column with the highest visual similarity.
best_clusters = np.argmax(centroid_similarities, axis=1)

all_db_indices = np.arange(len(db_imgs))
ranks_a = np.zeros((Q, len(db_imgs)), dtype=int)

# Stage 2: members of the chosen cluster come first (ranked by image similarity),
# followed by every remaining database image (also ranked by image similarity).
for q_idx in range(Q):
    query_sims = similarities[q_idx]
    cluster_id = idx_to_cluster_id[best_clusters[q_idx]]
    cluster_member_indices = np.array(cluster_members[cluster_id])

    members_ranked = cluster_member_indices[np.argsort(-query_sims[cluster_member_indices])]

    all_other_indices = np.setdiff1d(all_db_indices, cluster_member_indices)
    others_ranked = all_other_indices[np.argsort(-query_sims[all_other_indices])]

    ranks_a[q_idx] = np.concatenate((members_ranked, others_ranked))

all_ret_a = dict(enumerate(ranks_a))
recall_a = recall(ranks_a, pidx, ks)
mAPs_a = mapk_many(ranks_a, pidx, ks)
map_a = mean_average_precision(all_rel, all_ret_a)

banner = "=" * 60
print(banner)
print("APPROACH A: Two-Stage Retrieval")
print(banner)
print(f"MAP: {map_a*100:.2f}%")
for k, r, m in zip(ks, recall_a, mAPs_a):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH A: Two-Stage Retrieval
MAP: 40.07%
Recall@1: 43.20%   mAP@1: 43.20%
Recall@5: 61.60%   mAP@5: 32.13%
Recall@10: 70.00%   mAP@10: 32.79%
Recall@20: 77.20%   mAP@20: 35.79%


## Approach B: Weighted Similarity (Grid Search)

In [14]:
# NOTE: this cell repeats the GPS clustering, centroid construction and
# centroid-similarity steps from the earlier cells with the same parameters and
# inputs, so Approach B can be run on its own after feature extraction.
EPS = 50
MIN_SAMPLES = 2

db_clusters = cluster_locations(db_loc, eps=EPS, min_samples=MIN_SAMPLES)

# Labels >= 0 are counted as clusters; label -1 is counted as noise.
n_clusters = len(np.unique(db_clusters[db_clusters >= 0]))
n_noise = np.sum(db_clusters == -1)

print(f"Number of location clusters: {n_clusters}")
print(f"Noise points: {n_noise}")


centroids, cluster_members = compute_location_centroids(db_features, db_clusters)

# Dense centroid matrix plus both directions of the cluster-id <-> row-index mapping.
centroid_matrix = np.zeros((len(centroids), FEAT_DIM), dtype=np.float32)
cluster_id_to_idx = {}
idx_to_cluster_id = {}
for idx, (cluster_id, centroid) in enumerate(centroids.items()):
    centroid_matrix[idx] = centroid
    cluster_id_to_idx[cluster_id] = idx
    idx_to_cluster_id[idx] = cluster_id

# Rescale each centroid row to unit length.
centroid_matrix = l2_normalize(centroid_matrix, axis=1)
print(f"Computed {len(centroids)} location centroids")
# Query-vs-centroid cosine similarity, one column per centroid row.
centroid_similarities = cosine_similarity(query_features, centroid_matrix)


Number of location clusters: 32
Noise points: 0
Computed 32 location centroids


In [15]:
# Candidate weights for the centroid term: score = alpha * centroid_sim + (1 - alpha) * image_sim.
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results_b = []

# The centroid term does not depend on alpha, so build it once, vectorized:
# entry (q, d) is the similarity of query q to the centroid of database image d's
# cluster, and stays 0 for images labelled as noise (cluster id < 0).
db_cluster_ids = np.asarray(db_clusters)
has_cluster = db_cluster_ids >= 0
centroid_term = np.zeros_like(similarities)
if has_cluster.any():
    centroid_cols = np.array(
        [cluster_id_to_idx[cid] for cid in db_cluster_ids[has_cluster]], dtype=int
    )
    centroid_term[:, has_cluster] = centroid_similarities[:, centroid_cols]

for alpha in tqdm(alphas, desc="Grid search over alpha"):
    # Same blend as the former per-element loop, computed on whole arrays.
    combined_similarities = alpha * centroid_term + (1 - alpha) * similarities

    ranks_b = np.argsort(-combined_similarities, axis=1)
    all_ret_b = {q: ranks_b[q] for q in range(Q)}
    recall_b = recall(ranks_b, pidx, ks)
    mAPs_b = mapk_many(ranks_b, pidx, ks)
    map_b = mean_average_precision(all_rel, all_ret_b)

    results_b.append({
        "alpha": alpha,
        "MAP": map_b,
        "Recall@1": recall_b[0],
        "Recall@5": recall_b[1],
        "Recall@10": recall_b[2],
        "Recall@20": recall_b[3],
        "mAP@1": mAPs_b[0],
        "mAP@5": mAPs_b[1],
        "mAP@10": mAPs_b[2],
        "mAP@20": mAPs_b[3],
    })

# NOTE(review): alpha is chosen by Recall@1 on the same queries the metrics are
# reported on, so the "best alpha" numbers are optimistic; a held-out split
# would give an unbiased estimate.
best_result_b = max(results_b, key=lambda x: x["Recall@1"])
best_alpha = best_result_b["alpha"]

print("=" * 60)
print("APPROACH B: Weighted Similarity (Best Alpha)")
print("=" * 60)
print(f"Best alpha: {best_alpha}")
print(f"MAP: {best_result_b['MAP']*100:.2f}%")
print(f"Recall@1: {best_result_b['Recall@1']*100:.2f}%   mAP@1: {best_result_b['mAP@1']*100:.2f}%")
print(f"Recall@5: {best_result_b['Recall@5']*100:.2f}%   mAP@5: {best_result_b['mAP@5']*100:.2f}%")
print(f"Recall@10: {best_result_b['Recall@10']*100:.2f}%   mAP@10: {best_result_b['mAP@10']*100:.2f}%")
print(f"Recall@20: {best_result_b['Recall@20']*100:.2f}%   mAP@20: {best_result_b['mAP@20']*100:.2f}%")

Grid search over alpha: 100%|██████████| 9/9 [00:16<00:00,  1.83s/it]

APPROACH B: Weighted Similarity (Best Alpha)
Best alpha: 0.5
MAP: 39.23%
Recall@1: 44.40%   mAP@1: 44.40%
Recall@5: 65.60%   mAP@5: 32.65%
Recall@10: 75.60%   mAP@10: 32.96%
Recall@20: 81.80%   mAP@20: 35.45%





## Approach C: GPS-Filtered Retrieval

In [16]:
# Distance threshold compared against compute_gps_distances output.
# NOTE(review): unit depends on gps_helpers (presumably metres) — confirm.
GPS_RADIUS = 100

# Pairwise query-to-database distances; row q is indexed per database image below.
# NOTE(review): this approach uses the query's own location (query_loc). If gt_sim
# was derived from geographic proximity, these scores are an upper bound rather
# than a like-for-like comparison with the other approaches — confirm.
gps_distances = compute_gps_distances(query_loc, db_loc)
ranks_c = np.zeros((Q, len(db_imgs)), dtype=int)

for q_idx in range(Q):
    # Split the database into images within the radius and all the rest.
    nearby_mask = gps_distances[q_idx] <= GPS_RADIUS
    nearby_indices = np.where(nearby_mask)[0]
    far_indices = np.where(~nearby_mask)[0]
    
    # Rank each group by visual similarity, most similar first.
    nearby_sims = similarities[q_idx, nearby_indices]
    sorted_nearby = nearby_indices[np.argsort(-nearby_sims)]
    
    far_sims = similarities[q_idx, far_indices]
    sorted_far = far_indices[np.argsort(-far_sims)]
    
    # Nearby images always precede far ones, so every row is a full permutation of the database.
    ranks_c[q_idx] = np.concatenate([sorted_nearby, sorted_far])

all_ret_c = {q: ranks_c[q] for q in range(Q)}
recall_c = recall(ranks_c, pidx, ks)
mAPs_c = mapk_many(ranks_c, pidx, ks)
map_c = mean_average_precision(all_rel, all_ret_c)

print("=" * 60)
print(f"APPROACH C: GPS-Filtered Retrieval (radius={GPS_RADIUS})")
print("=" * 60)
print(f"MAP: {map_c*100:.2f}%")
for k, r, m in zip(ks, recall_c, mAPs_c):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH C: GPS-Filtered Retrieval (radius=100)
MAP: 60.85%
Recall@1: 63.20%   mAP@1: 63.20%
Recall@5: 87.80%   mAP@5: 49.13%
Recall@10: 97.20%   mAP@10: 51.41%
Recall@20: 100.00%   mAP@20: 57.30%


## Results Comparison

In [17]:
def _metrics_row(label, map_score, recalls, maps_at_k):
    """Build one comparison-table row with every metric scaled to percent.

    Columns are emitted in the order Approach, MAP, Recall@k..., mAP@k...,
    with one Recall/mAP column per cutoff in ``ks``.
    """
    row = {"Approach": label, "MAP": map_score * 100}
    for k, value in zip(ks, recalls):
        row[f"Recall@{k}"] = value * 100
    for k, value in zip(ks, maps_at_k):
        row[f"mAP@{k}"] = value * 100
    return row


comparison = pd.DataFrame([
    _metrics_row("Baseline (No GPS)", map_baseline, recall_baseline, mAPs_baseline),
    _metrics_row("Approach A (Two-Stage)", map_a, recall_a, mAPs_a),
    # Approach B metrics live in the grid-search record of the selected alpha.
    _metrics_row(
        f"Approach B (Weighted, α={best_alpha})",
        best_result_b["MAP"],
        [best_result_b[f"Recall@{k}"] for k in ks],
        [best_result_b[f"mAP@{k}"] for k in ks],
    ),
    _metrics_row(f"Approach C (GPS-Filtered, r={GPS_RADIUS})", map_c, recall_c, mAPs_c),
])

print("\n" + "=" * 100)
print("FINAL COMPARISON")
print("=" * 100)
print(comparison.to_string(index=False))

# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail after the expensive cells have already run.
os.makedirs("results", exist_ok=True)
comparison.to_csv("results/gps_enhanced_comparison.csv", index=False)
print("\nResults saved to results/gps_enhanced_comparison.csv")


FINAL COMPARISON
                        Approach       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
               Baseline (No GPS) 36.461470      44.0      64.8       73.4       82.4   44.0 31.742111 31.205710 33.091992
          Approach A (Two-Stage) 40.071390      43.2      61.6       70.0       77.2   43.2 32.125611 32.789575 35.788479
    Approach B (Weighted, α=0.5) 39.227827      44.4      65.6       75.6       81.8   44.4 32.649000 32.963110 35.451960
Approach C (GPS-Filtered, r=100) 60.853399      63.2      87.8       97.2      100.0   63.2 49.133611 51.406071 57.300976

Results saved to results/gps_enhanced_comparison.csv


## Alpha Grid Search Results

In [18]:
# One row per alpha from the grid search; metrics are stored as fractions.
df_alpha = pd.DataFrame(results_b)
# Convert every metric column (all but "alpha") to percent in a single step.
metric_columns = ["MAP"] + [f"Recall@{k}" for k in ks] + [f"mAP@{k}" for k in ks]
df_alpha[metric_columns] = df_alpha[metric_columns] * 100

print("\nApproach B: Alpha Grid Search Results")
print(df_alpha.to_string(index=False))

# to_csv does not create missing directories; create the output folder first.
os.makedirs("results", exist_ok=True)
df_alpha.to_csv("results/approach_b_alpha_grid.csv", index=False)


Approach B: Alpha Grid Search Results
 alpha       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
   0.1 37.083606      44.2      65.6       74.2       81.8   44.2 32.030944 31.637477 33.611121
   0.2 37.682317      44.2      65.4       75.0       82.0   44.2 32.329944 32.071909 34.100365
   0.3 38.235874      44.2      65.8       75.2       82.0   44.2 32.452722 32.424211 34.516383
   0.4 38.794110      44.2      66.0       75.6       82.4   44.2 32.719111 32.676867 35.044726
   0.5 39.227827      44.4      65.6       75.6       81.8   44.4 32.649000 32.963110 35.451960
   0.6 39.552104      44.2      65.0       74.8       81.6   44.2 32.622444 33.051359 35.763928
   0.7 39.774517      43.8      63.8       73.6       80.8   43.8 32.593611 33.096785 35.870922
   0.8 39.756476      43.0      63.4       71.8       78.0   43.0 32.290278 32.889714 35.709419
   0.9 39.960322      43.6      62.6       71.0       76.8   43.6 32.349944 32.947992 35.857313


## Using 2 backbones (dinov3-vith16plus-pretrain-lvd1689m and openai/clip-vit-base-patch32) with approaches A, B, and C

### Extract features

In [19]:
# Checkpoints for the two-backbone experiment: index 0 is DINO, index 1 is CLIP.
MODEL_NAMES = [
    "facebook/dinov3-vith16plus-pretrain-lvd1689m",
    "openai/clip-vit-base-patch32"
]

# Width of the concatenated descriptor (1280 + 768).
# NOTE: this overwrites the earlier FEAT_DIM = 1280. extract_features_gem reads
# FEAT_DIM at call time, so re-running the single-backbone extraction after this
# cell would allocate rows of the wrong width.
FEAT_DIM = 1280 +768
# Label only: POOLING is not read by any visible cell.
POOLING = "GeM"
GEM_P = 3.0

device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# NOTE: this loads the same DINO checkpoint name as MODEL_NAME a second time;
# the earlier `model` / `processor` objects are not reused here.
dino_processor = AutoImageProcessor.from_pretrained(MODEL_NAMES[0])
dino_model = AutoModel.from_pretrained(MODEL_NAMES[0]).to(device)
print(f"Loaded {MODEL_NAMES[0]}")

clip_model = CLIPModel.from_pretrained(MODEL_NAMES[1]).to(device)
clip_processor = CLIPProcessor.from_pretrained(MODEL_NAMES[1])
print(f"Loaded {MODEL_NAMES[1]}")

Using device: cuda:0
Loaded facebook/dinov3-vith16plus-pretrain-lvd1689m


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded openai/clip-vit-base-patch32


In [20]:
def extract_features_gem_dino_clip(image_paths, dino_model, clip_model, 
                                    dino_processor, clip_processor, device, p=3.0,
                                    data_root="data/"):
    """Extract concatenated CLIP + DINO GeM descriptors, one row per image.

    Args:
        image_paths: sequence of image paths relative to ``data_root``.
        dino_model: DINO backbone whose output exposes ``last_hidden_state``.
        clip_model: CLIP model; only its ``vision_model`` is run.
        dino_processor: image processor for the DINO backbone.
        clip_processor: image processor for the CLIP vision tower.
        device: torch device both models and all inputs are moved to.
        p: GeM exponent.
        data_root: directory the paths are joined onto; defaults to the
            previously hard-coded ``"data/"``.

    Returns:
        ``float32`` array of shape ``(len(image_paths), clip_dim + dino_dim)``.
        Each half is L2-normalised separately before concatenation, CLIP first.
        The widths are taken from the model outputs; for empty input the
        original hard-coded widths (768 and 1280) are used.
    """
    num_images = len(image_paths)

    clip_model = clip_model.to(device)
    dino_model = dino_model.to(device)

    clip_model.eval()
    dino_model.eval()

    def _gem_pool(tokens):
        # GeM over the token axis: clamp, raise to p, average, take the p-th root.
        return tokens.clamp(min=1e-6).pow(p).mean(dim=1).pow(1./p)[0].cpu().numpy()

    clip_feats = None
    dino_feats = None

    for i, img_path in enumerate(tqdm(image_paths)):
        # Preprocess for both backbones while the file is open, then close it.
        with Image.open(os.path.join(data_root, img_path)) as img:
            clip_inputs = clip_processor(images=img, return_tensors="pt").to(device)
            dino_inputs = dino_processor(images=img, return_tensors="pt").to(device)

        with torch.inference_mode():
            vision_outputs = clip_model.vision_model(**clip_inputs)
            dino_outputs = dino_model(**dino_inputs)

        # Token 0 is dropped from both token sequences before pooling.
        clip_gem = _gem_pool(vision_outputs.last_hidden_state[:, 1:, :])
        dino_gem = _gem_pool(dino_outputs.last_hidden_state[:, 1:, :])

        if clip_feats is None:
            # Allocate lazily so the widths always match the model outputs.
            clip_feats = np.zeros((num_images, clip_gem.shape[-1]), dtype=np.float32)
            dino_feats = np.zeros((num_images, dino_gem.shape[-1]), dtype=np.float32)

        clip_feats[i] = clip_gem
        dino_feats[i] = dino_gem

    if clip_feats is None:
        # No images: keep the original (0, 768 + 1280) result shape.
        clip_feats = np.zeros((0, 768), dtype=np.float32)
        dino_feats = np.zeros((0, 1280), dtype=np.float32)

    clip_feats = l2_normalize(clip_feats, axis=1)
    dino_feats = l2_normalize(dino_feats, axis=1)

    combined_feats = np.concatenate([clip_feats, dino_feats], axis=1)

    return combined_feats

In [21]:
# NOTE: db_features / query_features are overwritten here with the concatenated
# CLIP + DINO descriptors; the single-backbone features from the earlier section
# are no longer available after this cell runs.
print("Extracting database features...")
db_features = extract_features_gem_dino_clip(db_imgs, dino_model,clip_model, dino_processor,clip_processor, device, p=GEM_P)

print("Extracting query features...")
query_features = extract_features_gem_dino_clip(query_imgs, dino_model,clip_model, dino_processor,clip_processor, device, p=GEM_P)

# Shape check: one row per image.
print(f"Database features: {db_features.shape}")
print(f"Query features: {query_features.shape}")

Extracting database features...


100%|██████████| 1000/1000 [00:59<00:00, 16.72it/s]


Extracting query features...


100%|██████████| 500/500 [00:29<00:00, 16.71it/s]

Database features: (1000, 2048)
Query features: (500, 2048)





### GPS clustering

In [22]:
# Same clustering parameters and input (db_loc) as in the single-backbone section.
# NOTE(review): unit of EPS depends on gps_helpers — confirm.
EPS = 50
MIN_SAMPLES = 2

db_clusters = cluster_locations(db_loc, eps=EPS, min_samples=MIN_SAMPLES)

# Labels >= 0 are counted as clusters; label -1 is counted as noise.
n_clusters = len(np.unique(db_clusters[db_clusters >= 0]))
n_noise = np.sum(db_clusters == -1)

print(f"Number of location clusters: {n_clusters}")
print(f"Noise points: {n_noise}")


Number of location clusters: 32
Noise points: 0


### Compute location centroids

In [23]:
# Centroids are recomputed from the current db_features (the concatenated
# descriptors), so FEAT_DIM must match their width.
centroids, cluster_members = compute_location_centroids(db_features, db_clusters)

# Dense centroid matrix plus both directions of the cluster-id <-> row-index mapping.
centroid_matrix = np.zeros((len(centroids), FEAT_DIM), dtype=np.float32)
cluster_id_to_idx = {}
idx_to_cluster_id = {}
for idx, (cluster_id, centroid) in enumerate(centroids.items()):
    centroid_matrix[idx] = centroid
    cluster_id_to_idx[cluster_id] = idx
    idx_to_cluster_id[idx] = cluster_id

# Rescale each centroid row to unit length.
centroid_matrix = l2_normalize(centroid_matrix, axis=1)
print(f"Computed {len(centroids)} location centroids")

Computed 32 location centroids


### Prepare Ground Truth

In [24]:
# Number of queries.
Q = len(query_imgs)
# Query index -> database indices marked 1 in gt_sim (dict form for mean_average_precision).
all_rel = {q: get_relevant_images(gt_sim, q) for q in range(Q)}
# Same positives as a list of int arrays (form expected by recall / mapk_many).
pidx = [np.array(all_rel[q], dtype=int) for q in range(Q)]
# Cutoffs for Recall@k and mAP@k; kept ascending because recall() credits a hit to all later cutoffs.
ks = [1, 5, 10, 20]

### Baseline: Standard Retrieval (No GPS)

In [25]:
# Query-vs-database cosine similarity on the current (concatenated) descriptors;
# this replaces the `similarities` matrix computed in the single-backbone section.
similarities = cosine_similarity(query_features, db_features)
# Negate so the ascending argsort yields most-similar-first database indices.
ranks = np.argsort(-similarities, axis=1)

# Dict form (query index -> ranking) consumed by mean_average_precision.
all_ret = {q: ranks[q] for q in range(Q)}
recall_baseline = recall(ranks, pidx, ks)
mAPs_baseline = mapk_many(ranks, pidx, ks)
# MAP over the full ranking (no cutoff), unlike the mAP@k values above.
map_baseline = mean_average_precision(all_rel, all_ret)

print("=" * 60)
print("BASELINE: Standard Retrieval (No GPS)")
print("=" * 60)
print(f"MAP: {map_baseline*100:.2f}%")
for k, r, m in zip(ks, recall_baseline, mAPs_baseline):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

BASELINE: Standard Retrieval (No GPS)
MAP: 37.45%
Recall@1: 44.80%   mAP@1: 44.80%
Recall@5: 66.60%   mAP@5: 32.70%
Recall@10: 76.00%   mAP@10: 32.23%
Recall@20: 85.00%   mAP@20: 34.03%


### Approach A: Two-Stage Retrieval

In [26]:
# Cosine similarity between every query descriptor and every location centroid
# (one column per row of centroid_matrix).
centroid_similarities = cosine_similarity(query_features, centroid_matrix)

In [27]:
# Stage 1: per query, the centroid column with the highest visual similarity.
best_clusters = np.argmax(centroid_similarities, axis=1)

all_db_indices = np.arange(len(db_imgs))
ranks_a = np.zeros((Q, len(db_imgs)), dtype=int)

# Stage 2: members of the chosen cluster come first (ranked by image similarity),
# followed by every remaining database image (also ranked by image similarity).
for q_idx in range(Q):
    query_sims = similarities[q_idx]
    cluster_id = idx_to_cluster_id[best_clusters[q_idx]]
    cluster_member_indices = np.array(cluster_members[cluster_id])

    members_ranked = cluster_member_indices[np.argsort(-query_sims[cluster_member_indices])]

    all_other_indices = np.setdiff1d(all_db_indices, cluster_member_indices)
    others_ranked = all_other_indices[np.argsort(-query_sims[all_other_indices])]

    ranks_a[q_idx] = np.concatenate((members_ranked, others_ranked))

all_ret_a = dict(enumerate(ranks_a))
recall_a = recall(ranks_a, pidx, ks)
mAPs_a = mapk_many(ranks_a, pidx, ks)
map_a = mean_average_precision(all_rel, all_ret_a)

banner = "=" * 60
print(banner)
print("APPROACH A: Two-Stage Retrieval")
print(banner)
print(f"MAP: {map_a*100:.2f}%")
for k, r, m in zip(ks, recall_a, mAPs_a):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH A: Two-Stage Retrieval
MAP: 40.89%
Recall@1: 44.40%   mAP@1: 44.40%
Recall@5: 62.00%   mAP@5: 33.46%
Recall@10: 70.40%   mAP@10: 33.83%
Recall@20: 77.00%   mAP@20: 36.69%


### Approach B: Weighted Similarity (Grid Search)

In [28]:
# NOTE: this cell repeats the GPS clustering, centroid construction and
# centroid-similarity steps from the preceding cells of this section with the
# same parameters and inputs, so Approach B can be run on its own.
EPS = 50
MIN_SAMPLES = 2

db_clusters = cluster_locations(db_loc, eps=EPS, min_samples=MIN_SAMPLES)

# Labels >= 0 are counted as clusters; label -1 is counted as noise.
n_clusters = len(np.unique(db_clusters[db_clusters >= 0]))
n_noise = np.sum(db_clusters == -1)

print(f"Number of location clusters: {n_clusters}")
print(f"Noise points: {n_noise}")


centroids, cluster_members = compute_location_centroids(db_features, db_clusters)

# Dense centroid matrix plus both directions of the cluster-id <-> row-index mapping.
centroid_matrix = np.zeros((len(centroids), FEAT_DIM), dtype=np.float32)
cluster_id_to_idx = {}
idx_to_cluster_id = {}
for idx, (cluster_id, centroid) in enumerate(centroids.items()):
    centroid_matrix[idx] = centroid
    cluster_id_to_idx[cluster_id] = idx
    idx_to_cluster_id[idx] = cluster_id

# Rescale each centroid row to unit length.
centroid_matrix = l2_normalize(centroid_matrix, axis=1)
print(f"Computed {len(centroids)} location centroids")
# Query-vs-centroid cosine similarity, one column per centroid row.
centroid_similarities = cosine_similarity(query_features, centroid_matrix)


Number of location clusters: 32
Noise points: 0
Computed 32 location centroids


In [29]:
# Candidate weights for the centroid term: score = alpha * centroid_sim + (1 - alpha) * image_sim.
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results_b = []

# The centroid term does not depend on alpha, so build it once, vectorized:
# entry (q, d) is the similarity of query q to the centroid of database image d's
# cluster, and stays 0 for images labelled as noise (cluster id < 0).
db_cluster_ids = np.asarray(db_clusters)
has_cluster = db_cluster_ids >= 0
centroid_term = np.zeros_like(similarities)
if has_cluster.any():
    centroid_cols = np.array(
        [cluster_id_to_idx[cid] for cid in db_cluster_ids[has_cluster]], dtype=int
    )
    centroid_term[:, has_cluster] = centroid_similarities[:, centroid_cols]

for alpha in tqdm(alphas, desc="Grid search over alpha"):
    # Same blend as the former per-element loop, computed on whole arrays.
    combined_similarities = alpha * centroid_term + (1 - alpha) * similarities

    ranks_b = np.argsort(-combined_similarities, axis=1)
    all_ret_b = {q: ranks_b[q] for q in range(Q)}
    recall_b = recall(ranks_b, pidx, ks)
    mAPs_b = mapk_many(ranks_b, pidx, ks)
    map_b = mean_average_precision(all_rel, all_ret_b)

    results_b.append({
        "alpha": alpha,
        "MAP": map_b,
        "Recall@1": recall_b[0],
        "Recall@5": recall_b[1],
        "Recall@10": recall_b[2],
        "Recall@20": recall_b[3],
        "mAP@1": mAPs_b[0],
        "mAP@5": mAPs_b[1],
        "mAP@10": mAPs_b[2],
        "mAP@20": mAPs_b[3],
    })

# NOTE(review): alpha is chosen by Recall@1 on the same queries the metrics are
# reported on, so the "best alpha" numbers are optimistic; a held-out split
# would give an unbiased estimate.
best_result_b = max(results_b, key=lambda x: x["Recall@1"])
best_alpha = best_result_b["alpha"]

print("=" * 60)
print("APPROACH B: Weighted Similarity (Best Alpha)")
print("=" * 60)
print(f"Best alpha: {best_alpha}")
print(f"MAP: {best_result_b['MAP']*100:.2f}%")
print(f"Recall@1: {best_result_b['Recall@1']*100:.2f}%   mAP@1: {best_result_b['mAP@1']*100:.2f}%")
print(f"Recall@5: {best_result_b['Recall@5']*100:.2f}%   mAP@5: {best_result_b['mAP@5']*100:.2f}%")
print(f"Recall@10: {best_result_b['Recall@10']*100:.2f}%   mAP@10: {best_result_b['mAP@10']*100:.2f}%")
print(f"Recall@20: {best_result_b['Recall@20']*100:.2f}%   mAP@20: {best_result_b['mAP@20']*100:.2f}%")

Grid search over alpha: 100%|██████████| 9/9 [00:16<00:00,  1.80s/it]

APPROACH B: Weighted Similarity (Best Alpha)
Best alpha: 0.6
MAP: 40.88%
Recall@1: 46.40%   mAP@1: 46.40%
Recall@5: 66.60%   mAP@5: 34.36%
Recall@10: 76.20%   mAP@10: 34.57%
Recall@20: 81.80%   mAP@20: 36.98%





### Approach C: GPS-Filtered Retrieval

In [30]:
# Distance threshold compared against compute_gps_distances output.
# NOTE(review): unit depends on gps_helpers (presumably metres) — confirm.
GPS_RADIUS = 100

# Pairwise query-to-database distances; row q is indexed per database image below.
# NOTE(review): this approach uses the query's own location (query_loc). If gt_sim
# was derived from geographic proximity, these scores are an upper bound rather
# than a like-for-like comparison with the other approaches — confirm.
gps_distances = compute_gps_distances(query_loc, db_loc)
ranks_c = np.zeros((Q, len(db_imgs)), dtype=int)

for q_idx in range(Q):
    # Split the database into images within the radius and all the rest.
    nearby_mask = gps_distances[q_idx] <= GPS_RADIUS
    nearby_indices = np.where(nearby_mask)[0]
    far_indices = np.where(~nearby_mask)[0]
    
    # Rank each group by visual similarity, most similar first.
    nearby_sims = similarities[q_idx, nearby_indices]
    sorted_nearby = nearby_indices[np.argsort(-nearby_sims)]
    
    far_sims = similarities[q_idx, far_indices]
    sorted_far = far_indices[np.argsort(-far_sims)]
    
    # Nearby images always precede far ones, so every row is a full permutation of the database.
    ranks_c[q_idx] = np.concatenate([sorted_nearby, sorted_far])

all_ret_c = {q: ranks_c[q] for q in range(Q)}
recall_c = recall(ranks_c, pidx, ks)
mAPs_c = mapk_many(ranks_c, pidx, ks)
map_c = mean_average_precision(all_rel, all_ret_c)

print("=" * 60)
print(f"APPROACH C: GPS-Filtered Retrieval (radius={GPS_RADIUS})")
print("=" * 60)
print(f"MAP: {map_c*100:.2f}%")
for k, r, m in zip(ks, recall_c, mAPs_c):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH C: GPS-Filtered Retrieval (radius=100)
MAP: 61.55%
Recall@1: 63.20%   mAP@1: 63.20%
Recall@5: 89.80%   mAP@5: 50.34%
Recall@10: 97.20%   mAP@10: 52.03%
Recall@20: 100.00%   mAP@20: 58.09%


### Results Comparison

In [31]:
def _metrics_row(label, map_score, recalls, maps_at_k):
    """Build one comparison-table row with every metric scaled to percent.

    Columns are emitted in the order Approach, MAP, Recall@k..., mAP@k...,
    with one Recall/mAP column per cutoff in ``ks``.
    """
    row = {"Approach": label, "MAP": map_score * 100}
    for k, value in zip(ks, recalls):
        row[f"Recall@{k}"] = value * 100
    for k, value in zip(ks, maps_at_k):
        row[f"mAP@{k}"] = value * 100
    return row


comparison = pd.DataFrame([
    _metrics_row("Baseline (No GPS)", map_baseline, recall_baseline, mAPs_baseline),
    _metrics_row("Approach A (Two-Stage)", map_a, recall_a, mAPs_a),
    # Approach B metrics live in the grid-search record of the selected alpha.
    _metrics_row(
        f"Approach B (Weighted, α={best_alpha})",
        best_result_b["MAP"],
        [best_result_b[f"Recall@{k}"] for k in ks],
        [best_result_b[f"mAP@{k}"] for k in ks],
    ),
    _metrics_row(f"Approach C (GPS-Filtered, r={GPS_RADIUS})", map_c, recall_c, mAPs_c),
])

print("\n" + "=" * 100)
print("FINAL COMPARISON")
print("=" * 100)
print(comparison.to_string(index=False))

# to_csv does not create missing directories; make sure the target exists so a
# fresh checkout does not fail after the expensive cells have already run.
os.makedirs("results", exist_ok=True)
comparison.to_csv("results/gps_enhanced_comparison_dino+clip.csv", index=False)


FINAL COMPARISON
                        Approach       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
               Baseline (No GPS) 37.447127      44.8      66.6       76.0       85.0   44.8 32.698889 32.232180 34.028459
          Approach A (Two-Stage) 40.886735      44.4      62.0       70.4       77.0   44.4 33.459833 33.831942 36.690555
    Approach B (Weighted, α=0.6) 40.884586      46.4      66.6       76.2       81.8   46.4 34.358167 34.570013 36.978218
Approach C (GPS-Filtered, r=100) 61.548770      63.2      89.8       97.2      100.0   63.2 50.338833 52.032332 58.094165


### Alpha Grid Search Results

In [32]:
# One row per alpha from the grid search; metrics are stored as fractions.
df_alpha = pd.DataFrame(results_b)
# Convert every metric column (all but "alpha") to percent in a single step.
metric_columns = ["MAP"] + [f"Recall@{k}" for k in ks] + [f"mAP@{k}" for k in ks]
df_alpha[metric_columns] = df_alpha[metric_columns] * 100

print("\nApproach B: Alpha Grid Search Results")
print(df_alpha.to_string(index=False))

# to_csv does not create missing directories; create the output folder first.
os.makedirs("results", exist_ok=True)
df_alpha.to_csv("results/approach_b_alpha_grid_dino+clip.csv", index=False)


Approach B: Alpha Grid Search Results
 alpha       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
   0.1 38.076366      45.0      66.4       76.8       85.2   45.0 33.006333 32.740402 34.621068
   0.2 38.786303      45.6      67.0       77.0       84.8   45.6 33.440889 33.205196 35.218386
   0.3 39.488197      45.8      67.4       77.0       83.6   45.8 33.945889 33.720074 35.874158
   0.4 40.114763      46.2      66.8       76.8       83.6   46.2 34.130389 34.194663 36.445780
   0.5 40.634004      46.2      67.2       76.2       82.6   46.2 34.428889 34.474736 36.835643
   0.6 40.884586      46.4      66.6       76.2       81.8   46.4 34.358167 34.570013 36.978218
   0.7 40.948265      46.2      65.6       74.6       80.4   46.2 34.180167 34.474785 36.945413
   0.8 40.967179      45.4      64.0       73.2       78.6   45.4 33.930833 34.267663 36.880895
   0.9 40.958091      44.8      63.6       72.4       77.0   44.8 33.719167 34.094798 36.873809
