In [2]:
!pip install kaggle



In [3]:
!kaggle datasets download -d hsankesara/flickr-image-dataset

Dataset URL: https://www.kaggle.com/datasets/hsankesara/flickr-image-dataset
License(s): CC0-1.0
Downloading flickr-image-dataset.zip to /workspace
100%|██████████████████████████████████████▉| 8.14G/8.16G [00:30<00:00, 346MB/s]
100%|███████████████████████████████████████| 8.16G/8.16G [00:30<00:00, 284MB/s]


In [None]:
!unzip flickr-image-dataset.zip -d /workspace/data/flickr30k

Archive:  flickr-image-dataset.zip
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000092795.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/10002456.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000268201.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000344755.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000366164.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000523639.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1000919630.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/10010052.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1001465944.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_images/1001545525.jpg  
  inflating: /workspace/data/flickr30k/flickr30k_images/flickr30k_ima

In [9]:
pip install -r requirements.txt

Collecting git+https://github.com/openai/CLIP.git (from -r requirements.txt (line 8))
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-1sctke9s
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-1sctke9s
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [10]:
import os
import torch
import clip
import open_clip
import numpy as np
from PIL import Image
from tqdm import tqdm
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd

In [11]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device)
model.eval()

100%|███████████████████████████████████████| 338M/338M [00:05<00:00, 65.5MiB/s]


CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (ln_pre): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): Sequential(
        (0): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          )
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=768, out_features=3072, bias=True)
            (gelu): QuickGELU()
            (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
        (1): ResidualAttentionBlock(
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
          

In [17]:
IMAGES_PATH = "./data/flickr30k/flickr30k_images/flickr30k_images"        # Folder containing 30000 images
CAPTIONS_PATH = "./data/flickr30k/flickr30k_images/results.csv"  # Caption file

In [15]:
captions = []

with open(CAPTIONS_PATH, "r", encoding="utf-8") as f:
    first = True
    for line in f:
        if first:        # 🔥 Skip header line
            first = False
            continue

        parts = [p.strip() for p in line.split("|")]

        # Expect at least 3 parts: img | index | caption
        if len(parts) < 3:
            continue

        img_name = parts[0]
        caption = " ".join(parts[2:])   # merge fragmented caption pieces

        caption = caption.replace("  ", " ").strip()
        captions.append((img_name, caption))

print("Total captions:", len(captions))

Total captions: 158914


In [None]:
image_features = {}

print("Extracting image embeddings...")
for img_name in tqdm(os.listdir(IMAGES_PATH)):
    img_path = os.path.join(IMAGES_PATH, img_name)

    try:
        image = Image.open(img_path).convert("RGB")
    except:
        continue

    image_input = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        emb = model.encode_image(image_input)
        emb = emb / emb.norm(dim=-1, keepdim=True)

    image_features[img_name] = emb.cpu()

torch.save(image_features, "image_features_flickr30k.pt")
print("Saved image features!")

Extracting image embeddings...


 56%|█████▌    | 17814/31785 [22:03<17:21, 13.41it/s]

In [21]:
caption_features = []
print("Extracting caption embeddings...")

for img_name, caption in tqdm(captions):
    text_input = clip.tokenize([caption], truncate=True).to(device)

    with torch.no_grad():
        emb = model.encode_text(text_input)
        emb = emb / emb.norm(dim=-1, keepdim=True)

    caption_features.append((img_name, caption, emb.cpu()))

torch.save(caption_features, "caption_features_flickr30k.pt")
print("Saved caption features!")

Extracting caption embeddings...


100%|██████████| 158914/158914 [15:29<00:00, 170.92it/s]


Saved caption features!
