# Build a list of images to analyze

In [3]:
from pathlib import Path
# Directory that holds the photos to analyze
photos_path = Path("images/")

# Path.glob yields entries in arbitrary (filesystem-dependent) order. The batch
# cache further down identifies a batch only by its index, so the listing is
# sorted to make "batch i" refer to the same photos on every run.
photos_files = sorted(photos_path.glob("*.jpg"))
print(f"Photos found: {len(photos_files)}")

Photos found: 32765


# Define a function to compute embeddings

In [4]:
import clip
import torch
from PIL import Image
# Pick the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
# (Apple-silicon alternative: "mps" if torch.backends.mps.is_available() else "cpu")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Using device " + device)

# clip.load returns the model and the preprocessing callable that is applied
# to each photo before encoding.
model, preprocess = clip.load("ViT-B/32", device=device)

def compute_embedding(photos_batch):
    """Compute normalized CLIP image features for a batch of photo files.

    Args:
        photos_batch: iterable of paths (anything ``Image.open`` accepts) of
            the photos to encode.

    Returns:
        A numpy array holding the output of ``model.encode_image`` for the
        batch, with each feature vector divided by its norm along the last
        dimension.

    Raises:
        Whatever ``Image.open`` / ``preprocess`` raise for an unreadable or
        corrupt file; the caller decides how to handle a failed batch.
    """
    # Open, preprocess and close each photo in turn. Image.open keeps the
    # underlying file handle open, so the context manager releases it as soon
    # as the preprocessed tensor has been produced instead of holding one open
    # descriptor per photo.
    preprocessed = []
    for photo_file in photos_batch:
        with Image.open(photo_file) as photo:
            preprocessed.append(preprocess(photo))

    # Stack the per-photo tensors into one batch and move it to the model's device
    photos_preprocessed = torch.stack(preprocessed).to(device)

    with torch.no_grad():
        # Encode the photos batch to compute the feature vectors and normalize them
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)

    # Transfer the feature vectors back to the CPU and convert to numpy
    return photos_features.cpu().numpy()

Using device cuda


In [5]:
import math
import numpy as np
import pandas as pd
from pathlib import Path

batch_size = 64
#batch_size=8192

# Save the resulting embeddings here:
features_path = Path("features/")
# np.save / to_csv do not create missing directories; without this every batch
# would fail on a fresh checkout.
features_path.mkdir(parents=True, exist_ok=True)

# Compute the right number of batches:
batches = math.ceil(len(photos_files) / batch_size)

# Process each batch
for i in range(batches):
    print(f"Processing batch {i+1} of {batches}")

    batch_ids_path = features_path / f"{i:010d}.csv"
    batch_features_path = features_path / f"{i:010d}.npy"
    # Scratch file for the features; its name does not end in ".npy", so a
    # leftover can never be mistaken for a finished batch.
    batch_tmp_path = features_path / f"{i:010d}.npy.tmp"

    # Only do the processing if the batch wasn't processed yet.
    # The .npy file is the "batch done" marker.
    if batch_features_path.exists():
        continue

    try:
        # Select the photos for the current batch
        batch_files = photos_files[i*batch_size : (i+1)*batch_size]

        # Compute the features
        batch_features = compute_embedding(batch_files)

        # Save the photo IDs to a CSV file first ...
        photo_ids = [photo_file.name for photo_file in batch_files]
        photo_ids_data = pd.DataFrame(photo_ids, columns=['photo_id'])
        photo_ids_data.to_csv(batch_ids_path, index=False)

        # ... then write the features to the scratch file and rename it into
        # place, so the "done" marker only ever appears once both files are
        # complete. Passing an open file object stops np.save from appending
        # ".npy" to the scratch name.
        with open(batch_tmp_path, "wb") as tmp_file:
            np.save(tmp_file, batch_features)
        batch_tmp_path.replace(batch_features_path)
    except Exception as err:
        # Error logging - possibilities include corrupt jpg, wrong format file, etc.
        # Catching Exception (not a bare except) lets Ctrl-C / kernel interrupt
        # still stop the loop. The batch number matches the progress line above.
        print(f'Problem with batch {i+1} ({batch_features_path.name}): {err!r}')
    finally:
        # If the batch did not complete (error or interrupt), remove partial
        # output so the batch is retried cleanly on the next run.
        if not batch_features_path.exists():
            for leftover in (batch_tmp_path, batch_ids_path):
                if leftover.exists():
                    leftover.unlink()

Processing batch 1 of 512
Processing batch 2 of 512
Processing batch 3 of 512
Processing batch 4 of 512
Processing batch 5 of 512
Processing batch 6 of 512
Processing batch 7 of 512
Processing batch 8 of 512
Processing batch 9 of 512
Processing batch 10 of 512
Processing batch 11 of 512
Processing batch 12 of 512
Processing batch 13 of 512
Processing batch 14 of 512
Processing batch 15 of 512
Processing batch 16 of 512
Processing batch 17 of 512
Processing batch 18 of 512
Processing batch 19 of 512
Processing batch 20 of 512
Processing batch 21 of 512
Processing batch 22 of 512
Processing batch 23 of 512
Processing batch 24 of 512
Processing batch 25 of 512
Processing batch 26 of 512
Processing batch 27 of 512
Processing batch 28 of 512
Processing batch 29 of 512
Processing batch 30 of 512
Processing batch 31 of 512
Processing batch 32 of 512
Processing batch 33 of 512
Processing batch 34 of 512
Processing batch 35 of 512
Processing batch 36 of 512
Processing batch 37 of 512
Processing

# Combine the per-batch results into a single NumPy file and a single CSV file

In [6]:
import numpy as np
import pandas as pd

# Per-batch files are named with a zero-padded 10-digit index
# (e.g. 0000000000.npy / 0000000000.csv). Matching only that pattern keeps this
# cell's own outputs (features.npy, photo_ids.csv) out of the inputs, so the
# cell can be re-run without concatenating the aggregate into itself.
batch_name_pattern = "[0-9]" * 10

# Load the embeddings and photo IDs batch by batch, pairing the two files by
# name so that row k of the features always corresponds to row k of the IDs.
features_list = []
photo_ids_list = []
for features_file in sorted(features_path.glob(batch_name_pattern + ".npy")):
    ids_file = features_file.with_suffix(".csv")
    if not ids_file.exists():
        print(f"Skipping {features_file.name}: no matching {ids_file.name}")
        continue

    batch_features = np.load(features_file)
    batch_ids = pd.read_csv(ids_file)
    if len(batch_features) != len(batch_ids):
        print(f"Skipping {features_file.name}: {len(batch_features)} embeddings "
              f"but {len(batch_ids)} photo IDs")
        continue

    features_list.append(batch_features)
    photo_ids_list.append(batch_ids)

if not features_list:
    raise ValueError(f"No complete batches found in {features_path}")

# Store all of the embeddings in one big file
features = np.concatenate(features_list)
np.save(features_path / "features.npy", features)

# Write the metadata file
photo_ids = pd.concat(photo_ids_list, ignore_index=True)
photo_ids.to_csv(features_path / "photo_ids.csv", index=False)

# Ingest metadata

In [19]:
import pandas as pd

# Photo IDs in the same row order as the saved embeddings
existing_df = pd.read_csv('features/photo_ids.csv')
# External metadata; only its photo_id and description columns are used.
# NOTE(review): photo_ids.csv stores file names including ".jpg" - confirm
# that SULVRC.csv uses the same photo_id format, otherwise nothing will match.
new_data_df = pd.read_csv('SULVRC.csv')

# Keep a single description per photo_id. A left merge emits one output row per
# matching right-hand row, so duplicate IDs in the metadata would add rows and
# break the row-for-row correspondence with photo_ids.csv.
descriptions_df = new_data_df[['photo_id', 'description']].drop_duplicates(subset='photo_id')

merged_df = pd.merge(existing_df, descriptions_df, on='photo_id', how='left')
assert len(merged_df) == len(existing_df), "merge changed the number of rows"

# Photos without a matching description get a placeholder caption
merged_df['description'] = merged_df['description'].fillna('[No Caption]')


merged_df.to_csv('updated_file.csv', index=False)

