In [1]:
!pip install -q torch torchvision ftfy regex tqdm
!pip install -q git+https://github.com/openai/CLIP.git


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import json
import torch
import clip
import csv
from PIL import Image
import requests
from io import BytesIO
from tqdm import tqdm
from google.colab import files
from pathlib import Path

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Load the ViT-B/32 CLIP model together with the preprocessing callable
# that is applied to every image before it is encoded.
model, preprocess = clip.load("ViT-B/32", device=device)

# cooper_hewitt_formatted_data.json and met_formatted_data.json were added to the content/ directory in colab before running this
# This script was run twice, once for the cooper hewitt formatted data, and once for the met formatted data
# To run with cooper hewitt data, uncomment Cooper block and comment out Met block
# To run with met data, uncomment Met block and comment out Cooper block

# Exactly one of the two blocks below should be active; it decides which
# formatted museum dataset is parsed into `data`.

# Cooper block
data = json.loads(Path("cooper_hewitt_formatted_data.json").read_text())
# End cooper block

# Met block
# data = json.loads(Path("met_formatted_data.json").read_text())
# End met block

# Embed every object's image with CLIP, batch_size objects at a time.
# Each batch is written to its own JSON file; every object that fails
# (download, decode, encode, or missing required keys) is printed and
# recorded as a row in failed_images.csv instead of aborting the run.
batch_size = 50
total = len(data)

failed_log_path = Path("failed_images.csv")
# The log is opened in append mode and this script is run once per dataset,
# so only write the header when the file is new or still empty. Writing it
# unconditionally would insert extra header rows in the middle of the log.
needs_header = (
  not failed_log_path.exists() or failed_log_path.stat().st_size == 0
)

# The context manager guarantees the failure log is flushed and closed even
# if the run is interrupted or an error escapes the batch loop.
with failed_log_path.open("a", newline="") as log_file:
  csv_writer = csv.writer(log_file)
  if needs_header:
    csv_writer.writerow(["id", "image_url", "error_message"])

  for start_index in range(0, total, batch_size):
    end_index = min(start_index + batch_size, total)
    batch = data[start_index:end_index]
    results = []

    for obj in tqdm(batch, desc=f"Embedding batch {start_index}-{end_index}"):
      try:
        url = obj.get("image_url")
        response = requests.get(url, timeout=10)
        # Fail on HTTP error statuses here so the log records the real
        # cause rather than a misleading image-decoding error from PIL
        # trying to parse an error page.
        response.raise_for_status()
        image = Image.open(BytesIO(response.content)).convert("RGB")
        # preprocess() yields a single image tensor; unsqueeze(0) adds the
        # leading batch dimension before moving it to the model's device.
        image_input = preprocess(image).unsqueeze(0).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
          image_features = model.encode_image(image_input)

        # Take the only row of the batch and convert it to a plain list
        # so it can be serialised with json.
        embedding_vector = image_features[0].cpu().tolist()
        results.append({
          "id": obj["id"],
          "image_url": url,
          "embedding": embedding_vector,
          "text_fields": obj.get("embedding_text", ""),
          "metadata": {
              "title": obj["raw"].get("title", ""),
              "medium": obj["raw"].get("medium", ""),
              "date": obj["raw"].get("date", ""),
              "description": obj["raw"].get("description", ""),
              "object_url": obj.get("url", "")
          }
        })

      except Exception as e:
        # Deliberate best-effort: a single bad object must not stop the
        # run, so report it, log it, and move on to the next object.
        print(f"Error processing ID {obj.get('id')}: {e}")
        csv_writer.writerow([obj.get("id"), obj.get("image_url"), str(e)])

    batch_num = start_index // batch_size
    # Cooper block
    filename = f"cooper_hewitt_embeddings_batch_{batch_num}.json"
    # End cooper block

    # Met block
    # filename = f"met_embeddings_batch_{batch_num}.json"
    # End met block

    with open(filename, "w") as f:
      json.dump(results, f)

    # Persist this batch's failure rows together with its output file so
    # they are not lost if the runtime dies before the script finishes.
    log_file.flush()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>