In [1]:
import transformers
import torch

from transformers import CLIPProcessor, CLIPModel, AutoTokenizer

# Choose the compute device in priority order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the processor, the model (moved onto the selected device) and the
# tokenizer from the same pretrained checkpoint.
model_name = "openai/clip-vit-base-patch32"
clip_processor = CLIPProcessor.from_pretrained(model_name)
clip_model = CLIPModel.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: mps


In [2]:
import json

# Read the whole article database (UTF-8 JSON) into memory.
with open("../dataset/database.json", mode="r", encoding="utf-8") as db_file:
    articles_json = json.loads(db_file.read())

# Show which fields a single article record carries.
print(articles_json["f8097c7d27a8aac6"].keys())

dict_keys(['url', 'date', 'title', 'images', 'content'])


In [3]:
import spacy

# Load the `en_core_web_sm` spaCy pipeline; `nlp(text).sents` is used later
# in this notebook to split article content into sentences.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Chunking parameters, kept in one place so they are easy to tune.
MAX_PASSAGE_WORDS = 120  # soft cap on whitespace-delimited words per passage
NUM_TEST_ARTICLES = 3    # only the first few articles are chunked while testing


def _chunk_sentences(sentences, max_words):
    """Greedily pack consecutive sentences into chunks of at most ``max_words`` words.

    Words are counted by whitespace splitting. Sentences are never split, so a
    single sentence longer than ``max_words`` becomes a chunk of its own and
    the cap is therefore a soft one.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_words: Word budget per chunk.

    Returns:
        A list of chunk strings; each is its sentences joined by single spaces.
        Empty when ``sentences`` is empty.
    """
    chunks = []
    current, length = [], 0
    for sentence in sentences:
        word_count = len(sentence.split())
        if length + word_count > max_words and current:
            # Adding this sentence would overflow the budget: close the
            # running chunk and start a new one with this sentence.
            chunks.append(" ".join(current))
            current, length = [sentence], word_count
        else:
            current.append(sentence)
            length += word_count
    if current:
        # Flush whatever is left after the last sentence.
        chunks.append(" ".join(current))
    return chunks


passage_table = []
for art_id, info in list(articles_json.items())[:NUM_TEST_ARTICLES]:
    # Sentence-split the article body, dropping sentences that are empty
    # after stripping surrounding whitespace.
    stripped = (sent.text.strip() for sent in nlp(info["content"]).sents)
    sents = [text for text in stripped if text]

    for chunk_text in _chunk_sentences(sents, MAX_PASSAGE_WORDS):
        # NOTE: the "-p<N>" suffix is the passage's row index in the whole
        # passage_table (it keeps counting across articles), not its index
        # within the article.
        passage_table.append({
            "passage_id": f"{art_id}-p{len(passage_table)}",
            "article_id": art_id,
            "text": chunk_text,
        })

print(f"Built a mini passage pool of size {len(passage_table)}.")
passage_table[1]

Built a mini passage pool of size 35.


{'passage_id': 'f8097c7d27a8aac6-p0',
 'article_id': 'f8097c7d27a8aac6',
 'text': "(CNN)Right now, there's a shortage of truck drivers in the US and worldwide, exacerbated by the e-commerce boom brought on by the pandemic. One solution to the problem is autonomous trucks, and several companies are in a race to be the first to launch one. Among them is San Diego-based TuSimple. Founded in 2015, TuSimple has completed about 2 million miles of road tests with its 70 prototype trucks across the US, China and Europe."}

In [5]:
# Display the passage at index 1 (the second entry) of passage_table.
passage_table[1]

{'passage_id': 'f8097c7d27a8aac6-p1',
 'article_id': 'f8097c7d27a8aac6',
 'text': "Although these are simply commercially available trucks retrofitted with its technology, TuSimple has deals in place with two of the world's largest truck manufacturers -- Navistar in the US and Traton, Volkswagen's trucking business, in Europe -- to design and build fully autonomous models, which it hopes to launch by 2024. Photos: The Yara Birkeland is what its builders call the world's first zero-emission, autonomous cargo ship. The ship is scheduled to make its first journey between two Norwegian towns before the end of the year. Click through to see more forms of transport set to transform the future."}

In [5]:
# 4) Text‐encode all passages zero‐shot
import numpy as np
import faiss

# Collect the passage texts in table order so that embedding row k
# corresponds to passage_table[k].
all_texts = [entry["text"] for entry in passage_table]
batch_size = 4
text_embs = []

for start in range(0, len(all_texts), batch_size):
    text_batch = all_texts[start : start + batch_size]
    # Pad within the batch; anything longer than 77 tokens is truncated.
    encoded = tokenizer(
        text_batch,
        padding=True,
        truncation=True,
        max_length=77,
        return_tensors="pt",
    )
    encoded = encoded.to(device)
    # Inference only: no gradients are tracked.
    with torch.no_grad():
        features = clip_model.get_text_features(**encoded)
        # Scale every embedding to unit L2 norm along the feature axis.
        features = features / features.norm(dim=-1, keepdim=True)
    text_embs.append(features.cpu().numpy())

# Stack the per-batch arrays into one (num_passages, D) matrix.
text_embeddings = np.vstack(text_embs)
# In-place L2 normalisation by FAISS (rows were already normalised above).
faiss.normalize_L2(text_embeddings)

In [6]:
# Sanity check: number of passages, then the embedding matrix shape,
# each on its own line.
print(len(all_texts), text_embeddings.shape, sep="\n")

35
(35, 512)
