## Setup: Upgrade Transformers (Run Once)

# GPS-Enhanced Image Retrieval

In [2]:
import json
import numpy as np
import h5py
import torch
import os
from PIL import Image
from tqdm import tqdm
from transformers import AutoImageProcessor, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import pickle
from gps_helpers import cluster_locations, compute_location_centroids, get_cluster_members, compute_gps_distances

2025-10-26 21:32:44.214629: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-26 21:32:44.255781: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Data

In [3]:
with open("data/database/database_lite.json", "r") as f:
    db_data = json.load(f)
    db_imgs = np.array(db_data["im_paths"])
    db_loc = np.array(db_data["loc"])

with open("data/query/query_lite.json", "r") as f:
    query_data = json.load(f)
    query_imgs = np.array(query_data["im_paths"])
    query_loc = np.array(query_data["loc"])

with h5py.File("data/london_lite_gt.h5", "r") as f:
    gt_sim = f["sim"][:].astype(np.uint8)

print(f"Database: {len(db_imgs)} images")
print(f"Query: {len(query_imgs)} images")

Database: 1000 images
Query: 500 images


## Evaluation Functions

In [4]:
def recall(ranks, pidx, ks):
    recall_at_k = np.zeros(len(ks))
    for qidx in range(ranks.shape[0]):
        for i, k in enumerate(ks):
            if np.sum(np.in1d(ranks[qidx, :k], pidx[qidx])) > 0:
                recall_at_k[i:] += 1
                break
    recall_at_k /= ranks.shape[0]
    return recall_at_k


def apk(pidx, rank, k):
    if len(rank) > k:
        rank = rank[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(rank):
        if p in pidx and p not in rank[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(pidx), k)


def mapk(ranks, pidxs, k):
    return np.mean([apk(a, p, k) for a, p in zip(pidxs, ranks)])


def mapk_many(ranks, pidxs, ks):
    return np.array([mapk(ranks, pidxs, k) for k in ks], dtype=float)


def average_precision(relevant, retrieved):
    precisions = []
    rel = 0
    for i in range(len(retrieved)):
        if retrieved[i] in relevant:
            rel += 1
            precisions.append(rel / (i + 1))
    return sum(precisions) / len(relevant) if len(relevant) > 0 else 0


def mean_average_precision(all_relevant, all_retrieved):
    total = 0
    for qid in all_relevant:
        total += average_precision(all_relevant[qid], all_retrieved.get(qid, []))
    return total / len(all_relevant)


def l2_normalize(x, axis=1, eps=1e-12):
    norm = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / (norm + eps)


def get_relevant_images(gt_similarity_matrix, query_idx):
    return np.where(gt_similarity_matrix[query_idx, :] == 1)[0]

## Extract Features

In [5]:
MODEL_NAME = "facebook/dinov3-vith16plus-pretrain-lvd1689m"
FEAT_DIM = 1280
POOLING = "GeM"
GEM_P = 3.0

device = "cuda:1" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# DINOv3 models use DINOv2 processor (compatible)
processor = AutoImageProcessor.from_pretrained("facebook/dinov2-large")
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
print(f"Loaded {MODEL_NAME}")

Using device: cuda:1


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loaded facebook/dinov3-vith16plus-pretrain-lvd1689m


In [6]:
def extract_features_gem(image_paths, model, processor, device, p=3.0):
    features = np.zeros((len(image_paths), FEAT_DIM), dtype=np.float32)
    for i, img_path in enumerate(tqdm(image_paths)):
        img = Image.open(os.path.join("data/", img_path))
        inputs = processor(images=img, return_tensors="pt").to(device)
        with torch.inference_mode():
            outputs = model(**inputs)
        gem_feat = outputs.last_hidden_state[:, 1:, :].clamp(min=1e-6).pow(p).mean(dim=1).pow(1.0 / p)[0]
        features[i] = gem_feat.cpu().numpy()
    return l2_normalize(features, axis=1)

In [7]:
print("Extracting database features...")
db_features = extract_features_gem(db_imgs, model, processor, device, p=GEM_P)

print("Extracting query features...")
query_features = extract_features_gem(query_imgs, model, processor, device, p=GEM_P)

print(f"Database features: {db_features.shape}")
print(f"Query features: {query_features.shape}")

Extracting database features...


100%|██████████| 1000/1000 [05:07<00:00,  3.26it/s]


Extracting query features...


100%|██████████| 500/500 [02:33<00:00,  3.26it/s]

Database features: (1000, 1280)
Query features: (500, 1280)





## GPS Clustering

In [8]:
EPS = 25
MIN_SAMPLES = 2

db_clusters = cluster_locations(db_loc, eps=EPS, min_samples=MIN_SAMPLES)

n_clusters = len(np.unique(db_clusters[db_clusters >= 0]))
n_noise = np.sum(db_clusters == -1)

print(f"Number of location clusters: {n_clusters}")
print(f"Noise points: {n_noise}")

Number of location clusters: 46
Noise points: 5


## Compute Location Centroids

In [9]:
centroids, cluster_members = compute_location_centroids(db_features, db_clusters)

centroid_matrix = np.zeros((len(centroids), FEAT_DIM), dtype=np.float32)
cluster_id_to_idx = {}
for idx, (cluster_id, centroid) in enumerate(centroids.items()):
    centroid_matrix[idx] = centroid
    cluster_id_to_idx[cluster_id] = idx

print(f"Computed {len(centroids)} location centroids")

Computed 46 location centroids


## Prepare Ground Truth

In [10]:
Q = len(query_imgs)
all_rel = {q: get_relevant_images(gt_sim, q) for q in range(Q)}
pidx = [np.array(all_rel[q], dtype=int) for q in range(Q)]
ks = [1, 5, 10, 20]

## Baseline: Standard Retrieval (No GPS)

In [11]:
similarities = cosine_similarity(query_features, db_features)
ranks = np.argsort(-similarities, axis=1)

all_ret = {q: ranks[q] for q in range(Q)}
recall_baseline = recall(ranks, pidx, ks)
mAPs_baseline = mapk_many(ranks, pidx, ks)
map_baseline = mean_average_precision(all_rel, all_ret)

print("=" * 60)
print("BASELINE: Standard Retrieval (No GPS)")
print("=" * 60)
print(f"MAP: {map_baseline*100:.2f}%")
for k, r, m in zip(ks, recall_baseline, mAPs_baseline):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

BASELINE: Standard Retrieval (No GPS)
MAP: 32.40%
Recall@1: 39.00%   mAP@1: 39.00%
Recall@5: 59.00%   mAP@5: 28.00%
Recall@10: 69.60%   mAP@10: 26.98%
Recall@20: 81.20%   mAP@20: 28.76%


## Approach A: Two-Stage Retrieval

In [12]:
centroid_similarities = cosine_similarity(query_features, centroid_matrix)
best_clusters = np.argmax(centroid_similarities, axis=1)

ranks_a = np.zeros((Q, len(db_imgs)), dtype=int)

for q_idx in range(Q):
    best_cluster_idx = best_clusters[q_idx]
    cluster_id = list(centroids.keys())[best_cluster_idx]
    
    cluster_member_indices = np.array(cluster_members[cluster_id])
    
    cluster_sims = similarities[q_idx, cluster_member_indices]
    sorted_cluster_indices = cluster_member_indices[np.argsort(-cluster_sims)]
    
    all_other_indices = np.setdiff1d(np.arange(len(db_imgs)), cluster_member_indices)
    other_sims = similarities[q_idx, all_other_indices]
    sorted_other_indices = all_other_indices[np.argsort(-other_sims)]
    
    ranks_a[q_idx] = np.concatenate([sorted_cluster_indices, sorted_other_indices])

all_ret_a = {q: ranks_a[q] for q in range(Q)}
recall_a = recall(ranks_a, pidx, ks)
mAPs_a = mapk_many(ranks_a, pidx, ks)
map_a = mean_average_precision(all_rel, all_ret_a)

print("=" * 60)
print("APPROACH A: Two-Stage Retrieval")
print("=" * 60)
print(f"MAP: {map_a*100:.2f}%")
for k, r, m in zip(ks, recall_a, mAPs_a):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH A: Two-Stage Retrieval
MAP: 35.38%
Recall@1: 34.40%   mAP@1: 34.40%
Recall@5: 49.40%   mAP@5: 27.06%
Recall@10: 60.00%   mAP@10: 28.15%
Recall@20: 68.40%   mAP@20: 31.23%


## Approach B: Weighted Similarity (Grid Search)

In [13]:
alphas = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
results_b = []

for alpha in tqdm(alphas, desc="Grid search over alpha"):
    combined_similarities = np.zeros_like(similarities)
    
    for q_idx in range(Q):
        for db_idx in range(len(db_imgs)):
            cluster_id = db_clusters[db_idx]
            
            if cluster_id >= 0:
                cluster_idx = cluster_id_to_idx[cluster_id]
                sim_to_centroid = centroid_similarities[q_idx, cluster_idx]
            else:
                sim_to_centroid = 0
            
            sim_to_image = similarities[q_idx, db_idx]
            combined_similarities[q_idx, db_idx] = alpha * sim_to_centroid + (1 - alpha) * sim_to_image
    
    ranks_b = np.argsort(-combined_similarities, axis=1)
    all_ret_b = {q: ranks_b[q] for q in range(Q)}
    recall_b = recall(ranks_b, pidx, ks)
    mAPs_b = mapk_many(ranks_b, pidx, ks)
    map_b = mean_average_precision(all_rel, all_ret_b)
    
    results_b.append({
        "alpha": alpha,
        "MAP": map_b,
        "Recall@1": recall_b[0],
        "Recall@5": recall_b[1],
        "Recall@10": recall_b[2],
        "Recall@20": recall_b[3],
        "mAP@1": mAPs_b[0],
        "mAP@5": mAPs_b[1],
        "mAP@10": mAPs_b[2],
        "mAP@20": mAPs_b[3],
    })

best_result_b = max(results_b, key=lambda x: x["MAP"])
best_alpha = best_result_b["alpha"]

print("=" * 60)
print("APPROACH B: Weighted Similarity (Best Alpha)")
print("=" * 60)
print(f"Best alpha: {best_alpha}")
print(f"MAP: {best_result_b['MAP']*100:.2f}%")
print(f"Recall@1: {best_result_b['Recall@1']*100:.2f}%   mAP@1: {best_result_b['mAP@1']*100:.2f}%")
print(f"Recall@5: {best_result_b['Recall@5']*100:.2f}%   mAP@5: {best_result_b['mAP@5']*100:.2f}%")
print(f"Recall@10: {best_result_b['Recall@10']*100:.2f}%   mAP@10: {best_result_b['mAP@10']*100:.2f}%")
print(f"Recall@20: {best_result_b['Recall@20']*100:.2f}%   mAP@20: {best_result_b['mAP@20']*100:.2f}%")

Grid search over alpha: 100%|██████████| 9/9 [00:25<00:00,  2.81s/it]

APPROACH B: Weighted Similarity (Best Alpha)
Best alpha: 0.8
MAP: 36.07%
Recall@1: 36.20%   mAP@1: 36.20%
Recall@5: 52.80%   mAP@5: 28.29%
Recall@10: 64.20%   mAP@10: 28.83%
Recall@20: 73.00%   mAP@20: 32.04%





## Approach C: GPS-Filtered Retrieval

In [14]:
GPS_RADIUS = 100

gps_distances = compute_gps_distances(query_loc, db_loc)
ranks_c = np.zeros((Q, len(db_imgs)), dtype=int)

for q_idx in range(Q):
    nearby_mask = gps_distances[q_idx] <= GPS_RADIUS
    nearby_indices = np.where(nearby_mask)[0]
    far_indices = np.where(~nearby_mask)[0]
    
    nearby_sims = similarities[q_idx, nearby_indices]
    sorted_nearby = nearby_indices[np.argsort(-nearby_sims)]
    
    far_sims = similarities[q_idx, far_indices]
    sorted_far = far_indices[np.argsort(-far_sims)]
    
    ranks_c[q_idx] = np.concatenate([sorted_nearby, sorted_far])

all_ret_c = {q: ranks_c[q] for q in range(Q)}
recall_c = recall(ranks_c, pidx, ks)
mAPs_c = mapk_many(ranks_c, pidx, ks)
map_c = mean_average_precision(all_rel, all_ret_c)

print("=" * 60)
print(f"APPROACH C: GPS-Filtered Retrieval (radius={GPS_RADIUS})")
print("=" * 60)
print(f"MAP: {map_c*100:.2f}%")
for k, r, m in zip(ks, recall_c, mAPs_c):
    print(f"Recall@{k}: {r*100:.2f}%   mAP@{k}: {m*100:.2f}%")

APPROACH C: GPS-Filtered Retrieval (radius=100)
MAP: 58.93%
Recall@1: 59.20%   mAP@1: 59.20%
Recall@5: 88.60%   mAP@5: 47.01%
Recall@10: 97.60%   mAP@10: 49.19%
Recall@20: 99.80%   mAP@20: 55.32%


## Results Comparison

In [15]:
comparison = pd.DataFrame([
    {
        "Approach": "Baseline (No GPS)",
        "MAP": map_baseline * 100,
        "Recall@1": recall_baseline[0] * 100,
        "Recall@5": recall_baseline[1] * 100,
        "Recall@10": recall_baseline[2] * 100,
        "Recall@20": recall_baseline[3] * 100,
        "mAP@1": mAPs_baseline[0] * 100,
        "mAP@5": mAPs_baseline[1] * 100,
        "mAP@10": mAPs_baseline[2] * 100,
        "mAP@20": mAPs_baseline[3] * 100,
    },
    {
        "Approach": "Approach A (Two-Stage)",
        "MAP": map_a * 100,
        "Recall@1": recall_a[0] * 100,
        "Recall@5": recall_a[1] * 100,
        "Recall@10": recall_a[2] * 100,
        "Recall@20": recall_a[3] * 100,
        "mAP@1": mAPs_a[0] * 100,
        "mAP@5": mAPs_a[1] * 100,
        "mAP@10": mAPs_a[2] * 100,
        "mAP@20": mAPs_a[3] * 100,
    },
    {
        "Approach": f"Approach B (Weighted, α={best_alpha})",
        "MAP": best_result_b["MAP"] * 100,
        "Recall@1": best_result_b["Recall@1"] * 100,
        "Recall@5": best_result_b["Recall@5"] * 100,
        "Recall@10": best_result_b["Recall@10"] * 100,
        "Recall@20": best_result_b["Recall@20"] * 100,
        "mAP@1": best_result_b["mAP@1"] * 100,
        "mAP@5": best_result_b["mAP@5"] * 100,
        "mAP@10": best_result_b["mAP@10"] * 100,
        "mAP@20": best_result_b["mAP@20"] * 100,
    },
    {
        "Approach": f"Approach C (GPS-Filtered, r={GPS_RADIUS})",
        "MAP": map_c * 100,
        "Recall@1": recall_c[0] * 100,
        "Recall@5": recall_c[1] * 100,
        "Recall@10": recall_c[2] * 100,
        "Recall@20": recall_c[3] * 100,
        "mAP@1": mAPs_c[0] * 100,
        "mAP@5": mAPs_c[1] * 100,
        "mAP@10": mAPs_c[2] * 100,
        "mAP@20": mAPs_c[3] * 100,
    },
])

print("\n" + "=" * 100)
print("FINAL COMPARISON")
print("=" * 100)
print(comparison.to_string(index=False))

comparison.to_csv("results/gps_enhanced_comparison.csv", index=False)
print("\nResults saved to results/gps_enhanced_comparison.csv")


FINAL COMPARISON
                        Approach       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
               Baseline (No GPS) 32.398688      39.0      59.0       69.6       81.2   39.0 28.000333 26.979112 28.762485
          Approach A (Two-Stage) 35.384893      34.4      49.4       60.0       68.4   34.4 27.060778 28.147414 31.228387
    Approach B (Weighted, α=0.8) 36.070755      36.2      52.8       64.2       73.0   36.2 28.294000 28.827496 32.040138
Approach C (GPS-Filtered, r=100) 58.931081      59.2      88.6       97.6       99.8   59.2 47.009333 49.191950 55.323949

Results saved to results/gps_enhanced_comparison.csv


## Alpha Grid Search Results

In [16]:
df_alpha = pd.DataFrame(results_b)
df_alpha["MAP"] = df_alpha["MAP"] * 100
df_alpha[[f"Recall@{k}" for k in ks]] = df_alpha[[f"Recall@{k}" for k in ks]] * 100
df_alpha[[f"mAP@{k}" for k in ks]] = df_alpha[[f"mAP@{k}" for k in ks]] * 100

print("\nApproach B: Alpha Grid Search Results")
print(df_alpha.to_string(index=False))

df_alpha.to_csv("results/approach_b_alpha_grid.csv", index=False)


Approach B: Alpha Grid Search Results
 alpha       MAP  Recall@1  Recall@5  Recall@10  Recall@20  mAP@1     mAP@5    mAP@10    mAP@20
   0.1 32.969579      39.0      58.2       70.0       81.2   39.0 28.256667 27.359787 29.310834
   0.2 33.733421      39.4      57.8       70.4       80.2   39.4 28.548667 28.025644 29.970669
   0.3 34.480290      39.2      57.4       71.0       79.8   39.2 28.831500 28.610519 30.645075
   0.4 35.054027      38.4      57.8       70.6       79.2   38.4 28.958222 28.967090 31.206080
   0.5 35.454336      38.0      56.8       68.8       78.2   38.0 28.735889 28.928123 31.542019
   0.6 35.842832      37.2      54.8       68.4       76.6   37.2 28.660444 28.928630 31.975539
   0.7 36.032316      36.6      54.2       65.6       75.4   36.6 28.569722 28.888478 32.057264
   0.8 36.070755      36.2      52.8       64.2       73.0   36.2 28.294000 28.827496 32.040138
   0.9 35.783587      35.0      50.6       61.6       69.6   35.0 27.533944 28.406957 31.627601
