In [1]:
!pip install -q --no-cache-dir faiss-gpu-cu12 tqdm || pip install -q --no-cache-dir faiss-cpu tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

BASE_DIR          = "/content/drive/MyDrive/HotpotQA_snapshot"
SRC_JSON          = f"{BASE_DIR}/evidence_hotpotqa_entities.json"

Q_EMB_NPY         = f"{BASE_DIR}/questions_emb_fp16.npy"
E_EMB_NPY         = f"{BASE_DIR}/entities_emb_fp16.npy"
Q_TXT             = f"{BASE_DIR}/questions_ordered.txt"
E_TXT             = f"{BASE_DIR}/entities_ordered.txt"


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install -q -U "transformers>=4.51.0" "sentence-transformers>=2.7.0" accelerate bitsandbytes tqdm


In [4]:
import json, itertools
from pathlib import Path
from tqdm import tqdm

with Path(SRC_JSON).open(encoding="utf-8") as f:
    examples = json.load(f)

questions = [ex["question"] for ex in examples]           
entity_sets = [set(ex["entities"] or []) for ex in examples]
all_entities = sorted(set(itertools.chain.from_iterable(entity_sets))) 

print(f"Loaded {len(questions)} questions | {len(all_entities)} unique entities")


Loaded 100 questions | 222 unique entities


In [5]:
from sentence_transformers import SentenceTransformer

MODEL_ID   = "Qwen/Qwen3-Embedding-8B"
BATCH_SIZE = 32

print("⏳ Loading model …")
model = SentenceTransformer(
    MODEL_ID,
    model_kwargs={"device_map": "auto", "torch_dtype": "auto"},
    tokenizer_kwargs={"padding_side": "left"},
)
EMBED_DIM = model.get_sentence_embedding_dimension()
model.eval()
print("Model ready – embed dim:", EMBED_DIM)


⏳ Loading model …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model ready – embed dim: 4096


In [6]:
import numpy as np

q_emb = model.encode(
    questions,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
).astype("float16")    

print("Questions  →", q_emb.shape, q_emb.dtype)     


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Questions  → (100, 4096) float16


In [7]:
e_emb = model.encode(
    all_entities,
    batch_size=BATCH_SIZE,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True,
).astype("float16")

print("Entities   →", e_emb.shape, e_emb.dtype)            


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Entities   → (222, 4096) float16


In [8]:
np.save(Q_EMB_NPY, q_emb)
np.save(E_EMB_NPY, e_emb)

with open(Q_TXT, "w", encoding="utf-8") as f:
    f.writelines(q + "\n" for q in questions)

with open(E_TXT, "w", encoding="utf-8") as f:
    f.writelines(ent + "\n" for ent in all_entities)

print("✅  Saved:")
print(" •", Q_EMB_NPY)
print(" •", E_EMB_NPY)
print(" •", Q_TXT)
print(" •", E_TXT)


✅  Saved:
 • /content/drive/MyDrive/HotpotQA_snapshot/questions_emb_fp16.npy
 • /content/drive/MyDrive/HotpotQA_snapshot/entities_emb_fp16.npy
 • /content/drive/MyDrive/HotpotQA_snapshot/questions_ordered.txt
 • /content/drive/MyDrive/HotpotQA_snapshot/entities_ordered.txt


In [9]:
import faiss

index = faiss.IndexFlatIP(EMBED_DIM)
index = faiss.IndexIDMap(index)          
index.add_with_ids(e_emb.astype("float32"), np.arange(len(all_entities)))

import random, numpy as np
idx = random.randrange(len(questions))
query_vec = q_emb[idx : idx+1].astype("float32")

D, I = index.search(query_vec, 5)
print(f"\n🔎  Question: {questions[idx]}")
for rank, (eid, score) in enumerate(zip(I[0], D[0]), 1):
    print(f"{rank:>2}) {all_entities[eid]}   (cos={score:.4f})")



🔎  Question: Christopher Oscar Pena was recognized by an entertainment-industry brand aimed at what?
 1) Christopher Oscar Pena   (cos=0.4815)
 2) Academy Award   (cos=0.4627)
 3) Emmy Awards   (cos=0.4298)
 4) Peter Atencio   (cos=0.3979)
 5) Cruyff Football   (cos=0.3960)
