# Evaluate Image Search

## Part 1: Loading the eval data

In [1]:
import json
from copy import deepcopy
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import pandas as pd
from PIL import Image
from tqdm.auto import tqdm

from src import meta_clip
from src import eval_retrieval

In [2]:
# Locate the evaluation dataset: an image folder plus the query and relevance-judgment (qrels) CSVs.
dataset_dir = Path("vehicle_image_search")
image_dir = dataset_dir / "images"
# Sort so the file order (and therefore the embedding row order) is deterministic across runs.
filenames = sorted(image_dir.glob("*.jpg"))
print(f"Found {len(filenames):,} image files")
df_queries = pd.read_csv(dataset_dir / "queries.csv")
display(df_queries.head().style.set_caption("queries"))
# Resolve qrels relative to dataset_dir so relocating the dataset only needs one edit.
df_qrels = pd.read_csv(dataset_dir / "qrels.csv")
display(df_qrels.head().style.set_caption("qrels"))

Found 68,154 image files


Unnamed: 0,query_id,query_text
0,10002456_caption_00,Several men in hard hats are operating a giant pulley system.
1,10002456_caption_01,Workers look down from up above on a piece of equipment.
2,10002456_caption_02,Two men working on a machine wearing hard hats.
3,10002456_caption_03,Four men on top of a tall structure.
4,10002456_caption_04,Three men on a large rig.


Unnamed: 0,query_id,document_id,relevance
0,10002456_caption_00,10002456.jpg,1
1,10002456_caption_01,10002456.jpg,1
2,10002456_caption_02,10002456.jpg,1
3,10002456_caption_03,10002456.jpg,1
4,10002456_caption_04,10002456.jpg,1


## Part 2: Embed the data using the MetaCLIP 1.2 model

In [11]:
# Round-robin inference setup: one independent copy of the model per visible CUDA device.
# Number of items (images or queries) sent to a device in a single forward pass.
BATCH_SIZE = 128

loaded_clip = meta_clip.load_model()
model_on_devices = list(
    map(
        lambda gpu_index: deepcopy(loaded_clip.model).to(gpu_index),
        range(torch.cuda.device_count()),
    )
)

def round_robin_image_batches(filenames, batch_size: int):
    """Lazily yield preprocessed image batches tagged with a target device index.

    Consecutive slices of ``filenames`` (at most ``batch_size`` entries each) are
    read with ``meta_clip.read_image_batch`` and assigned to CUDA devices in
    rotating order.

    Args:
        filenames: Sliceable sequence of image file paths.
        batch_size: Maximum number of images per yielded batch.

    Yields:
        ``(image_batch, "image", device_index)`` tuples, the argument format
        consumed by ``embed_on_device``.
    """
    for batch_no, offset in enumerate(range(0, len(filenames), batch_size)):
        paths = filenames[offset:offset + batch_size]
        # Rotate through the devices so consecutive batches land on different GPUs.
        target_device = batch_no % torch.cuda.device_count()
        pixels = meta_clip.read_image_batch(
            image_filepaths=paths,
            image_processor_fn=loaded_clip.image_processor_fn,
        )
        yield pixels, "image", target_device

def round_robin_query_batches(queries: list[str], batch_size: int):
    """Lazily yield tokenized query batches tagged with a target device index.

    Consecutive slices of ``queries`` (at most ``batch_size`` strings each) are
    tokenized with ``loaded_clip.text_tokenizer_fn`` and assigned to CUDA
    devices in rotating order.

    Args:
        queries: Query strings to embed.
        batch_size: Maximum number of queries per yielded batch.

    Yields:
        ``(token_batch, "text", device_index)`` tuples, the argument format
        consumed by ``embed_on_device``.
    """
    for batch_no, offset in enumerate(range(0, len(queries), batch_size)):
        texts = queries[offset:offset + batch_size]
        # Rotate through the devices so consecutive batches land on different GPUs.
        target_device = batch_no % torch.cuda.device_count()
        tokens = loaded_clip.text_tokenizer_fn(texts)
        yield tokens, "text", target_device

@torch.inference_mode()
def embed_on_device(args: tuple[torch.Tensor, Literal["image", "text"], int]) -> np.ndarray:
    """Encode one batch on its assigned device and return normalized embeddings.

    Takes a single tuple so it can be mapped directly with ``ThreadPool.imap``.

    Args:
        args: ``(batch, kind, device_i)`` where ``batch`` is the input tensor,
            ``kind`` selects the image or text encoder, and ``device_i``
            indexes ``model_on_devices`` and names the target device.

    Returns:
        The embeddings, normalized along the last dimension, copied to the
        host as a NumPy array.
    """
    batch, kind, device_i = args
    replica = model_on_devices[device_i]
    device_batch = batch.to(device_i)
    assert kind in ("image", "text")
    if kind == "image":
        encoder = replica.encode_image
    else:
        encoder = replica.encode_text
    unit_embeddings = F.normalize(encoder(device_batch), dim=-1)
    return unit_embeddings.cpu().numpy()

In [12]:
# Embed images.
batch_iter = round_robin_image_batches(filenames, BATCH_SIZE)
res = []
with tqdm(total=len(filenames), unit="image", desc="embedding") as pbar, ThreadPool(torch.cuda.device_count()) as pool:
    # imap yields results in input order, so stacked rows line up with `filenames`.
    for chunk in pool.imap(embed_on_device, batch_iter):
        res.append(chunk)
        pbar.update(chunk.shape[0])
# np.vstack replaces np.row_stack, which is a deprecated alias of vstack as of NumPy 2.0.
res = np.vstack(res)
np.save("./clip_embedded_images.npy", res)

embedding:   0%|          | 0/68154 [00:00<?, ?image/s]

In [13]:
# Embed queries.
batch_iter = round_robin_query_batches(df_queries["query_text"].tolist(), BATCH_SIZE)
res = []
with tqdm(total=len(df_queries), unit="query", desc="embedding") as pbar, ThreadPool(torch.cuda.device_count()) as pool:
    # imap yields results in input order, so stacked rows line up with the rows of df_queries.
    for chunk in pool.imap(embed_on_device, batch_iter):
        res.append(chunk)
        pbar.update(chunk.shape[0])
# np.vstack replaces np.row_stack, which is a deprecated alias of vstack as of NumPy 2.0.
res = np.vstack(res)
np.save("./clip_embedded_queries.npy", res)

embedding:   0%|          | 0/10725 [00:00<?, ?query/s]

## Part 2B: Jina CLIP v2

In [4]:
!pip install --quiet einops timm

[0m

In [7]:
from transformers import AutoModel

# Load Jina CLIP v2 once, then place an independent copy on every visible CUDA device.
# NOTE(review): trust_remote_code=True runs model code fetched from the Hugging Face Hub.
jina_model = AutoModel.from_pretrained('jinaai/jina-clip-v2', trust_remote_code=True)
jina_on_devices = list(
    map(
        lambda gpu_index: deepcopy(jina_model).to(gpu_index),
        range(torch.cuda.device_count()),
    )
)

@torch.inference_mode()
def embed_on_device_jina(args: tuple[list[Path] | list[str], Literal["image", "text"], int]) -> np.ndarray:
    """Encode one batch with the Jina model copy assigned to a device.

    Takes a single tuple so it can be mapped directly with ``ThreadPool.imap``.

    Args:
        args: ``(batch, kind, device_i)`` where ``batch`` is the raw list of
            inputs handed straight to the model's own encode method, ``kind``
            selects ``encode_image`` or ``encode_text``, and ``device_i``
            indexes ``jina_on_devices``.

    Returns:
        Whatever the selected encode method returns, unmodified (annotated as
        a NumPy array; callers read ``.shape[0]`` from it).
    """
    batch, kind, device_i = args
    replica = jina_on_devices[device_i]
    assert kind in ("image", "text")
    if kind == "image":
        encoder = replica.encode_image
    else:
        encoder = replica.encode_text
    return encoder(batch)


# Pre-build the Jina work items as (payload, kind, device index) tuples.
# Image payloads are lists of string file paths; text payloads are lists of raw query strings.
image_batches = [
    (
        list(map(str, filenames[offset:offset + BATCH_SIZE])),
        "image",
        batch_no % torch.cuda.device_count(),
    )
    for batch_no, offset in enumerate(range(0, len(filenames), BATCH_SIZE))
]
query_texts = df_queries["query_text"].tolist()
text_batches = [
    (
        query_texts[offset:offset + BATCH_SIZE],
        "text",
        batch_no % torch.cuda.device_count(),
    )
    for batch_no, offset in enumerate(range(0, len(query_texts), BATCH_SIZE))
]

In [8]:
# Embed images with the Jina model; imap yields results in input order, so rows follow `filenames`.
res = []
with tqdm(total=len(filenames), unit="image", desc="embedding") as pbar, ThreadPool(torch.cuda.device_count()) as pool:
    for chunk in pool.imap(embed_on_device_jina, image_batches):
        res.append(chunk)
        pbar.update(chunk.shape[0])
# np.vstack replaces np.row_stack, which is a deprecated alias of vstack as of NumPy 2.0.
res = np.vstack(res)
np.save("./clip_embedded_images_jina.npy", res)

# Embed queries with the Jina model; rows follow `query_texts`.
res = []
with tqdm(total=len(query_texts), unit="query", desc="embedding") as pbar, ThreadPool(torch.cuda.device_count()) as pool:
    for chunk in pool.imap(embed_on_device_jina, text_batches):
        res.append(chunk)
        pbar.update(chunk.shape[0])
res = np.vstack(res)
np.save("./clip_embedded_queries_jina.npy", res)

embedding:   0%|          | 0/68154 [00:00<?, ?image/s]

embedding:   0%|          | 0/10725 [00:00<?, ?query/s]

tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

## Part 3: Score

In [11]:
# Reload the cached embeddings written by the embedding cells, so scoring can run
# without recomputing them.
emb_image, emb_text, emb_image_jina, emb_text_jina = map(
    np.load,
    (
        "./clip_embedded_images.npy",
        "./clip_embedded_queries.npy",
        "./clip_embedded_images_jina.npy",
        "./clip_embedded_queries_jina.npy",
    ),
)

In [12]:
# Row i of the query embeddings was computed from row i of df_queries, so the ids must
# come from df_queries (df_qrels is not guaranteed to share its row order or row count).
query_ids = df_queries["query_id"].tolist()
doc_ids = [filename.name.removeprefix("flickr30k_") for filename in filenames]

# Fail fast if the cached embeddings no longer line up with the current queries/images.
assert emb_text.shape[0] == emb_text_jina.shape[0] == len(query_ids), "query embeddings do not match df_queries"
assert emb_image.shape[0] == emb_image_jina.shape[0] == len(doc_ids), "image embeddings do not match filenames"

# Build {query_id: {document_id: relevance}}, accumulating every judged document per
# query rather than keeping only the last qrels row seen for that query.
qrels = {}
for qid, docid, relevance in df_qrels[["query_id", "document_id", "relevance"]].itertuples(index=False, name=None):
    qrels.setdefault(qid, {})[docid] = relevance

In [13]:
%%time
# MetaCLIP retrieval run: text-query embeddings against image embeddings.
# Row order of each embedding matrix is expected to match the corresponding id list.
run = eval_retrieval.dense_retrieval_run(
    query_ids=query_ids,
    emb_queries=emb_text,
    doc_ids=doc_ids,
    emb_docs=emb_image,
)

CPU times: user 45.4 s, sys: 25.8 s, total: 1min 11s
Wall time: 2.07 s


In [14]:
%%time
# Jina CLIP v2 retrieval run: same ids as the MetaCLIP run, different embeddings.
# Row order of each embedding matrix is expected to match the corresponding id list.
run_jina = eval_retrieval.dense_retrieval_run(
    query_ids=query_ids,
    emb_queries=emb_text_jina,
    doc_ids=doc_ids,
    emb_docs=emb_image_jina,
)

CPU times: user 45.8 s, sys: 30.2 s, total: 1min 15s
Wall time: 2.13 s


In [17]:
# Score the MetaCLIP run against the relevance judgments at cutoff k=10.
scores = eval_retrieval.evaluate_retrieval(
    results=run,
    qrels=qrels,
    k_values=[10],
)
scores

{'NDCG@10': 0.51109, 'MAP@10': 0.46715, 'R@10': 0.65128, 'P@10': 0.06513}

In [18]:
# Score the Jina CLIP v2 run against the same relevance judgments at cutoff k=10.
scores_jina = eval_retrieval.evaluate_retrieval(
    results=run_jina,
    qrels=qrels,
    k_values=[10],
)
scores_jina

{'NDCG@10': 0.38448, 'MAP@10': 0.33982, 'R@10': 0.52718, 'P@10': 0.05272}

## Conclusions

We find that although the Jina model is trained with search in mind, MetaCLIP is substantially stronger at text-to-image search on the moderately descriptive captions of Flickr30k (R@10 of 65% vs. 53%, and NDCG@10 of 0.51 vs. 0.38).