A nearest-neighbors approach to predicting the coordinates of street view images. The image embeddings (image_embeddings.npy) must be generated before running this notebook.

In [None]:
import io
import random

import dask.dataframe as dd
import faiss
import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from PIL import Image
from torch import nn
from transformers import CLIPProcessor, CLIPModel

In [2]:
# coords and image_feats have corresponding indexes:
# row i of coordinates.csv is the location of embedding i in image_embeddings.npy

df = pd.read_csv('coordinates.csv')
coords = df[["latitude", "longitude"]].to_numpy(dtype=np.float32)
image_feats = np.load('image_embeddings.npy')

# Fail fast if the two files are out of sync; otherwise nearest-neighbor ids
# would be mapped to the wrong (or nonexistent) coordinates later on.
if len(coords) != len(image_feats):
    raise ValueError(
        f"coordinates.csv has {len(coords)} rows but image_embeddings.npy "
        f"has {len(image_feats)} embeddings; they must match one-to-one"
    )

In [30]:
# Load the StreetCLIP processor and model; the model is placed on the GPU when one is available

if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

street_clip_processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")
street_clip_model = CLIPModel.from_pretrained("geolocal/StreetCLIP").to(device)

In [31]:
def embed_image(byte_encoding):
    '''
    Generate an L2-normalized image embedding using StreetCLIP.

    args: the image byte encoding (raw encoded image bytes that PIL can open)
    returns: a CPU tensor of the image embedding, with the leading batch axis
             squeezed out and scaled to unit length
    '''
    # Load image and force 3-channel RGB
    image = Image.open(io.BytesIO(byte_encoding)).convert('RGB')

    # Process image and move every input tensor to the model's device
    inputs = street_clip_processor(images=image, return_tensors="pt", padding=True)
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        image_feat = street_clip_model.get_image_features(**inputs)
        # Unit-normalize so inner product against the index equals cosine similarity
        image_feat = image_feat / image_feat.norm(dim=-1, keepdim=True)

    # Return on the CPU: callers convert the result with .numpy(), which
    # raises for tensors that still live on a CUDA device.
    return image_feat.squeeze(0).cpu()

In [32]:
def haversine_distance(true_lat, true_lon, pred_lat, pred_lon):
    '''
    Computes haversine (great-circle) distance between two points.

    Inputs may be scalars or NumPy arrays of matching/broadcastable shape.
    All math is done in float64, so float32 coordinates do not lose precision.

    Parameters:
    - true_lat, true_lon: float (degrees)
    - pred_lat, pred_lon: float (degrees)

    Returns:
    - distance in kilometers (float, or an array when arrays are passed)
    '''
    R = 6371.0  # Earth radius in km

    # Convert degrees to radians (promoting to float64 first)
    true_lat_rad = np.deg2rad(np.asarray(true_lat, dtype=np.float64))
    true_lon_rad = np.deg2rad(np.asarray(true_lon, dtype=np.float64))
    pred_lat_rad = np.deg2rad(np.asarray(pred_lat, dtype=np.float64))
    pred_lon_rad = np.deg2rad(np.asarray(pred_lon, dtype=np.float64))

    # Differences
    dlat = pred_lat_rad - true_lat_rad
    dlon = pred_lon_rad - true_lon_rad

    # Haversine formula
    a = np.sin(dlat / 2)**2 + np.cos(true_lat_rad) * np.cos(pred_lat_rad) * np.sin(dlon / 2)**2
    # Rounding can push `a` a hair outside [0, 1] (e.g. antipodal points),
    # which would make sqrt(1 - a) NaN; clamp to the valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return R * c


In [33]:
def knn_predict(embedding, k):
    '''
    A nearest neighbors approach to predicting coordinates. Finds the k nearest neighbors
    in the FAISS index and returns the average of their coordinates.

    embedding: a 1-D NumPy array representing one image embedding (e.g. shape (768,))
    k: the number of nearest embeddings to retrieve (must be >= 1)
    Returns: a (2,) array (lat, lon) predicted by averaging the k nearest neighbors
    Raises: ValueError if k < 1 or the index yields no neighbors

    NOTE: latitude/longitude are averaged arithmetically, so neighbors that
    straddle the +/-180 degree meridian will average to a misleading longitude.
    '''
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    # FAISS needs a C-contiguous float32 matrix; shape it as a single query row [1, dim]
    query = np.ascontiguousarray(embedding, dtype=np.float32).reshape(1, -1)

    # Perform FAISS search; only the neighbor ids are needed, not the similarities
    _, neighbor_ids = index.search(query, k)  # neighbor_ids: (1, k)

    # FAISS pads the result with -1 when fewer than k vectors are indexed.
    # Drop those, otherwise coords[-1] (the last row) would be averaged in.
    neighbor_ids = neighbor_ids[0]
    neighbor_ids = neighbor_ids[neighbor_ids >= 0]
    if neighbor_ids.size == 0:
        raise ValueError("FAISS search returned no neighbors (is the index empty?)")

    # Average the coordinates of nearest neighbors
    pred_coord = coords[neighbor_ids].mean(axis=0)  # (2,)

    return pred_coord


In [34]:
# Build a FAISS (Facebook AI Similarity Search) index to efficiently search the image embeddings

# Cast the training embeddings to float32 and L2-normalize each row,
# so that the inner product between two rows is their cosine similarity
image_feats = F.normalize(
    torch.from_numpy(image_feats.astype(np.float32)),
    dim=1,
).numpy()

# Keep the coordinates in float32 as well
coords = coords.astype(np.float32)

# Inner-product index whose dimensionality matches the embedding width
index = faiss.IndexFlatIP(image_feats.shape[1])
index.add(image_feats)

In [8]:
# access HuggingFace streetview dataset to test KNN coordinate predictions
# https://huggingface.co/datasets/stochastic/random_streetview_images_pano_v0.0.2

# Reference all train-* parquet shards of the dataset through dask
test_images = dd.read_parquet("hf://datasets/stochastic/random_streetview_images_pano_v0.0.2/data/train-*-of-*.parquet")
# Despite its name, `row0` holds many rows: it is the head(1000) result as a Pandas DataFrame.
# NOTE(review): dask's head() presumably reads only the first partition by default, so fewer
# than 1000 rows may come back if that partition is small - confirm len(row0) before relying on it.
row0 = test_images.head(1000)  # returns a Pandas DataFrame with the first 1000 images

In [None]:
# pick one random test image; bound the draw by the rows actually loaded
# instead of assuming that exactly 1000 rows are present
rand = random.randrange(len(row0))
sample = row0.iloc[rand]

# parse HuggingFace dataset
true_lat = sample['latitude']
true_lon = sample['longitude']
img = sample['image'] # image is stored as a byte encoding
address = sample['address']

# generate image embedding
# (.cpu() keeps the NumPy conversion valid when the model runs on CUDA)
embedding = embed_image(img['bytes']).cpu().numpy()

# run KNN 
pred_coords = knn_predict(embedding, k=5)

# create map centered on the ground truth location
m = folium.Map(location=(true_lat, true_lon))

# add ground truth coordinates
folium.Marker(
    [true_lat, true_lon],
    popup="Ground Truth",
    icon=folium.Icon(color='blue', icon='info-sign')
).add_to(m)

# add predicted coordinates
folium.Marker(
    [pred_coords[0], pred_coords[1]],
    popup="Prediction",
    icon=folium.Icon(color='red', icon='info-sign')
).add_to(m)

print('error (km)', haversine_distance(float(true_lat), float(true_lon), pred_coords[0], pred_coords[1]))

# show the street view image that was embedded
streetview = Image.open(io.BytesIO(img['bytes'])).convert('RGB')
plt.figure(figsize=(20, 10))
plt.imshow(streetview)
plt.axis('off')  # Hide the axes
plt.show()

# display map (last expression of the cell, so the notebook renders it)
m

error (km) 619.1842173139548


In [36]:
# test KNN prediction on 10 random images from HuggingFace

# one slot per evaluated image; its length drives the loop so the sample
# count is defined in a single place
total_error = np.zeros(10)

for i in range(len(total_error)):
    # bound the draw by the rows actually loaded instead of assuming 1000
    rand = random.randrange(len(row0))
    sample = row0.iloc[rand]

    # parse HuggingFace dataset
    true_lat = sample['latitude']
    true_lon = sample['longitude']
    img = sample['image'] # image is stored as a byte encoding
    address = sample['address']

    # generate image embedding
    # (.cpu() keeps the NumPy conversion valid when the model runs on CUDA)
    embedding = embed_image(img['bytes']).cpu().numpy()

    # run KNN 
    pred_coords = knn_predict(embedding, k=5)

    print('pred',pred_coords[0],pred_coords[1])
    print('true',true_lat, true_lon)
    # pass ground truth first, prediction second, matching the function signature
    error = haversine_distance(float(true_lat), float(true_lon), pred_coords[0], pred_coords[1])
    total_error[i] = error
    print('error (km)',error)
    print('--------------')

print('average error (km)',np.mean(total_error))

pred 42.924744 50.984966
true 36.6539147 136.6930647
error (km) 7030.00596984181
--------------
pred 48.30068 9.78434
true 49.9520936 5.0535073
error (km) 390.0526911476776
--------------
pred 51.18178 10.669512
true 53.8892742 18.6314985
error (km) 616.4558109127753
--------------
pred 54.898186 37.290977
true 57.1266485 24.6971964
error (km) 819.7130599358642
--------------
pred 61.06428 20.939037
true 62.9644872 22.4732781
error (km) 225.934326784427
--------------
pred -20.169245 -32.90283
true -24.6792581 25.9321877
error (km) 6023.571295740278
--------------
pred -4.968458 108.39799
true -7.7118977 110.3517334
error (km) 373.72570562061645
--------------
pred -0.46595582 37.30435
true -27.3001802 31.8857682
error (km) 3039.392664177968
--------------
pred 8.781958 -5.9577026
true 11.0185206 103.7871266
error (km) 11938.966969136925
--------------
pred 46.515102 -0.43168992
true 43.2112054 26.8023411
error (km) 2166.465596381427
--------------
average error (km) 3262.428408967977
