In [4]:
import pandas as pd
import glob
from PIL import Image
import numpy as np
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
import natsort
import torch

In [5]:
# parse images and coordinates
img_dir = 'Streetview_Image_Dataset'
# Natural sort keeps image indexes paired with coordinate indexes
# (image i is assumed to correspond to row i of coordinates.csv).
image_paths = natsort.natsorted(glob.glob(f"{img_dir}/*.png"))

df = pd.read_csv('coordinates.csv')
coords = df[["latitude", "longitude"]].to_numpy(dtype=np.float32)

# The rest of the notebook pairs images and coordinates purely by position,
# so a count mismatch would silently mislabel every embedding.
assert len(image_paths) == len(coords), (
    f"found {len(image_paths)} images in {img_dir!r} but {len(coords)} coordinate rows"
)

# define StreetCLIP model and processor
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = CLIPModel.from_pretrained("geolocal/StreetCLIP").to(device)
processor = CLIPProcessor.from_pretrained("geolocal/StreetCLIP")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def embed_images(image_path):
    '''
    Compute the StreetCLIP embedding of a single image file.

    The image is opened, converted to RGB, run through the notebook-level
    `processor` and `model` (on `device`), and the resulting feature vector
    is scaled to unit L2 norm along its last dimension.

    args: image_path - path of the image file to embed
    returns: the normalised image embedding as a tensor, with the leading
             batch dimension squeezed out (the tensor stays on `device`)
    '''
    rgb_image = Image.open(image_path).convert("RGB")

    # Preprocess into model inputs and move every tensor to the target device
    processed = processor(images=rgb_image, return_tensors="pt", padding=True)
    model_inputs = {name: tensor.to(device) for name, tensor in processed.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        features = model.get_image_features(**model_inputs)
        unit_features = features / features.norm(dim=-1, keepdim=True)

    return unit_features.squeeze(0)

In [7]:
# generate image embeddings for each image using StreetCLIP, takes 5 hours on cpu
# The result is cached on disk: delete 'image_embeddings.npy' to force a recompute
# (e.g. after changing the image set without changing the number of images).

embeddings_path = 'image_embeddings.npy'

# Width of the projected image embedding, read from the model config
# instead of hard-coding it (768 for StreetCLIP).
embed_dim = model.config.projection_dim
expected_shape = (len(image_paths), embed_dim)

# Reuse previously saved embeddings when they exist
try:
    image_embeddings = np.load(embeddings_path)
except FileNotFoundError:
    image_embeddings = None

# Recompute if there is no cache or it does not match the current image list / model
if image_embeddings is None or image_embeddings.shape != expected_shape:
    image_embeddings = np.zeros(expected_shape, dtype=np.float32)

    for i, image_path in enumerate(tqdm(image_paths)):
        # embed_images returns a normalised tensor on `device`; copy it to host memory
        image_embeddings[i] = embed_images(image_path).cpu().numpy()

    # save image embeddings
    np.save(embeddings_path, image_embeddings)

  0%|          | 3/25229 [00:02<5:58:49,  1.17it/s]


KeyboardInterrupt: 