In [1]:
# Mount Google Drive at /content/drive; the snapshot folder referenced by
# BASE_DIR is read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:


# Install FAISS (CUDA 12 GPU wheel) and tqdm; if that install fails, the `||`
# branch installs the CPU-only wheel instead.
# NOTE(review): the index builder below calls faiss.StandardGpuResources and
# faiss.GpuIndexFlatIP, which the CPU-only wheel presumably does not provide —
# confirm the GPU wheel was actually installed before running the build cells.
!pip install -q --no-cache-dir faiss-gpu-cu12 tqdm || pip install -q --no-cache-dir faiss-cpu tqdm

In [3]:
import faiss, numpy as np, os
from tqdm import tqdm

# Root folder (on the mounted Drive) that holds the embedding memmaps and the
# ordered name lists.
BASE_DIR = "/content/drive/MyDrive/HotpotQA_snapshot"

# Number of components per embedding vector (second axis of both memmaps).
EMBED_DIM = 4096


def _snapshot_file(filename):
    """Return the path of `filename` inside BASE_DIR."""
    return os.path.join(BASE_DIR, filename)


# Entity artefacts: fp16 embedding matrix, one-name-per-line ordering, index file.
ENTITY_EMB_PATH = _snapshot_file("unique_entities_emb_4096.fp16.memmap")
ENTITY_ORDER_PATH = _snapshot_file("unique_entities_ordered.txt")
ENTITY_INDEX_PATH = _snapshot_file("faiss_entity_flatip_gpu.index")

# Relation artefacts, laid out the same way as the entity ones.
REL_EMB_PATH = _snapshot_file("unique_relations_emb_4096.fp16.memmap")
REL_ORDER_PATH = _snapshot_file("unique_relations_ordered.txt")
REL_INDEX_PATH = _snapshot_file("faiss_relation_flatip_gpu.index")

In [4]:
def to_float32_c(arr: np.ndarray) -> np.ndarray:
    """Return `arr` as a float32, C-contiguous plain ndarray.

    Intended to avoid the `swig_ptr` error when the result is handed to FAISS.
    An input that already is a float32, C-contiguous ndarray is returned as-is.
    """
    as_f32 = np.asarray(arr, dtype=np.float32)
    # Only pay for a copy when the memory layout is not already C-ordered.
    return as_f32 if as_f32.flags.c_contiguous else np.ascontiguousarray(as_f32)

In [5]:
def build_gpu_flatip_index(memmap: np.memmap,
                           dim: int,
                           batch_size: int = 8_192,
                           device: int = 0,
                           use_float16: bool = True) -> "faiss.GpuIndexFlatIP":
    """Build a FAISS GPU flat inner-product index from a 2-D array of vectors.

    Rows of `memmap` are added in consecutive batches; every batch is first
    converted with `to_float32_c` before being passed to `index.add`.

    Args:
        memmap: 2-D array-like of shape (n_vectors, dim), typically an
            `np.memmap`, whose rows are the vectors to index.
        dim: Vector dimensionality; must equal `memmap.shape[1]`.
        batch_size: Number of rows added per `index.add` call.
        device: Value assigned to `GpuIndexFlatConfig.device`.
        use_float16: Value assigned to `GpuIndexFlatConfig.useFloat16`.

    Returns:
        The populated `faiss.GpuIndexFlatIP`.

    Raises:
        ValueError: If `batch_size` or `dim` is not positive, or if `memmap`
            is not 2-D with `dim` columns.
    """
    # The return annotation is a string on purpose: it must not be evaluated
    # when the function is defined, otherwise a FAISS build without GPU
    # classes makes the `def` itself fail.

    # Validate before allocating GPU resources so bad arguments fail fast
    # with a readable message (a negative batch_size would otherwise produce
    # an empty index without any error).
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if dim <= 0:
        raise ValueError(f"dim must be a positive integer, got {dim}")
    if memmap.ndim != 2 or memmap.shape[1] != dim:
        raise ValueError(
            f"expected a 2-D array with {dim} columns, got shape {tuple(memmap.shape)}"
        )

    res = faiss.StandardGpuResources()
    cfg = faiss.GpuIndexFlatConfig()
    cfg.useFloat16 = use_float16
    cfg.device = device

    index = faiss.GpuIndexFlatIP(res, dim, cfg)

    total = memmap.shape[0]
    # Each iteration handles one batch of up to `batch_size` rows, so the
    # progress unit is a batch, not a single vector.
    for start in tqdm(range(0, total, batch_size), desc="Adding vectors", unit="batch"):
        # Slicing clamps at the end of the array, so the last batch may be short.
        index.add(to_float32_c(memmap[start:start + batch_size]))

    return index

In [6]:
# Read the entity names, one per line. Blank lines are skipped; position i in
# ENTITY_LIST is later used as the label for row i of the embedding matrix.
with open(ENTITY_ORDER_PATH, encoding="utf-8") as f:
    ENTITY_LIST = [ln.rstrip("\n") for ln in f if ln.strip()]
N_ENTITY = len(ENTITY_LIST)
print(f"· Entity count: {N_ENTITY:,}")

# np.memmap only complains when the file is too small for the requested shape;
# a file that is too large is silently truncated to a prefix. Since blank lines
# were dropped while counting names, check the byte size explicitly so that a
# name/row mismatch cannot go unnoticed and mislabel search results.
entity_expected_bytes = N_ENTITY * EMBED_DIM * np.dtype("float16").itemsize
entity_actual_bytes = os.path.getsize(ENTITY_EMB_PATH)
if entity_actual_bytes != entity_expected_bytes:
    raise ValueError(
        f"{ENTITY_EMB_PATH} is {entity_actual_bytes:,} bytes, but {N_ENTITY:,} "
        f"float16 vectors of dim {EMBED_DIM} require {entity_expected_bytes:,} bytes"
    )

# Map the fp16 embedding matrix read-only and build the GPU index from it.
ENT_MEM = np.memmap(ENTITY_EMB_PATH, mode="r", dtype="float16", shape=(N_ENTITY, EMBED_DIM))
ENT_INDEX = build_gpu_flatip_index(ENT_MEM, EMBED_DIM)

# Optional: persist a CPU copy of the index next to the snapshot on Drive.
#faiss.write_index(faiss.index_gpu_to_cpu(ENT_INDEX), ENTITY_INDEX_PATH)
#print("✔️  Saved →", ENTITY_INDEX_PATH)

· Entity count: 557,825


Adding vectors: 100%|██████████| 69/69 [01:10<00:00,  1.02s/vec]


In [7]:
import random

# Sanity check: use one randomly chosen stored entity vector as the query and
# list its five highest-scoring neighbours from the entity index.
rand = random.randrange(N_ENTITY)
# Slicing a single row keeps the leading axis, giving the (1, dim) query shape.
q_vec = to_float32_c(ENT_MEM[rand:rand + 1])
D, I = ENT_INDEX.search(q_vec, 5)

print("Query:", ENTITY_LIST[rand])
for rank, (hit, sim) in enumerate(zip(I[0], D[0]), start=1):
    name = ENTITY_LIST[hit]
    print(f"{rank}) {name}  (score={sim:.4f})")

Query: misunderstandings
1) misunderstandings  (score=0.9974)
2) misunderstanding  (score=0.9519)
3) misunderstood  (score=0.9103)
4) misinterpretations  (score=0.8958)
5) pitfall of misunderstandings  (score=0.8748)


In [8]:
# Read the relation names, one per line. Blank lines are skipped; position i in
# REL_LIST is later used as the label for row i of the embedding matrix.
with open(REL_ORDER_PATH, encoding="utf-8") as f:
    REL_LIST = [ln.rstrip("\n") for ln in f if ln.strip()]
N_REL = len(REL_LIST)
print(f"· Relation count: {N_REL:,}")

# np.memmap only complains when the file is too small for the requested shape;
# a file that is too large is silently truncated to a prefix. Since blank lines
# were dropped while counting names, check the byte size explicitly so that a
# name/row mismatch cannot go unnoticed and mislabel search results.
rel_expected_bytes = N_REL * EMBED_DIM * np.dtype("float16").itemsize
rel_actual_bytes = os.path.getsize(REL_EMB_PATH)
if rel_actual_bytes != rel_expected_bytes:
    raise ValueError(
        f"{REL_EMB_PATH} is {rel_actual_bytes:,} bytes, but {N_REL:,} "
        f"float16 vectors of dim {EMBED_DIM} require {rel_expected_bytes:,} bytes"
    )

# Map the fp16 embedding matrix read-only and build the GPU index from it.
REL_MEM = np.memmap(REL_EMB_PATH, mode="r", dtype="float16", shape=(N_REL, EMBED_DIM))
REL_INDEX = build_gpu_flatip_index(REL_MEM, EMBED_DIM)

# Optional: persist a CPU copy of the index next to the snapshot on Drive.
#faiss.write_index(faiss.index_gpu_to_cpu(REL_INDEX), REL_INDEX_PATH)
#print("✔️  Saved →", REL_INDEX_PATH)

· Relation count: 139,253


Adding vectors: 100%|██████████| 17/17 [00:25<00:00,  1.48s/vec]


In [9]:
# Sanity check: use one randomly chosen stored relation vector as the query and
# list its five highest-scoring neighbours from the relation index.
# Underscores in relation names are shown as spaces for readability.
rand = random.randrange(N_REL)
# Slicing a single row keeps the leading axis, giving the (1, dim) query shape.
q_vec = to_float32_c(REL_MEM[rand:rand + 1])
D, I = REL_INDEX.search(q_vec, 5)

print("Query:", REL_LIST[rand].replace('_', ' '))
for rank, (hit, sim) in enumerate(zip(I[0], D[0]), start=1):
    label = REL_LIST[hit].replace('_', ' ')
    print(f"{rank}) {label}  (score={sim:.4f})")

Query: not mature
1) not mature  (score=0.9986)
2) matures  (score=0.8369)
3) not  (score=0.8243)
4) not made  (score=0.8194)
5) not ready for  (score=0.8189)


In [13]:
# Upgrade huggingface_hub (with its `cli` extra) to version 0.23.0 or newer;
# the login and upload cells below import from this package.
!pip install -q --upgrade "huggingface_hub[cli]>=0.23.0"

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/558.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
# Show the interactive Hugging Face login widget. `whoami` is imported here
# and used by the upload cell to look up the account name.
from huggingface_hub import notebook_login, whoami
notebook_login()  

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
import shutil

# Local staging area: one sub-folder per dataset, each receiving the
# serialised index plus a copy of its ordered name list.
LOCAL_OUT = "/content/faiss_indices"
ENT_OUT_DIR = os.path.join(LOCAL_OUT, "entity")
REL_OUT_DIR = os.path.join(LOCAL_OUT, "relation")
for staging_dir in (LOCAL_OUT, ENT_OUT_DIR, REL_OUT_DIR):
    os.makedirs(staging_dir, exist_ok=True)

ENT_LOCAL_INDEX = os.path.join(ENT_OUT_DIR, "faiss_entity_flatip_gpu.index")
REL_LOCAL_INDEX = os.path.join(REL_OUT_DIR, "faiss_relation_flatip_gpu.index")

# Each GPU index is converted with index_gpu_to_cpu and written straight away.
# The conversion stays nested inside write_index so no extra reference keeps
# the CPU copy alive after the file has been written.
for banner, gpu_index, target_path in (
    ("⏳ Writing FAISS entity index …", ENT_INDEX, ENT_LOCAL_INDEX),
    ("⏳ Writing FAISS relation index …", REL_INDEX, REL_LOCAL_INDEX),
):
    print(banner)
    faiss.write_index(faiss.index_gpu_to_cpu(gpu_index), target_path)

# Ship the name lists alongside the indices. The final copy is left as the
# cell's last expression, so its destination path is displayed as the output.
shutil.copy(ENTITY_ORDER_PATH, os.path.join(ENT_OUT_DIR, "entity_ordered.txt"))
shutil.copy(REL_ORDER_PATH, os.path.join(REL_OUT_DIR, "relation_ordered.txt"))

⏳ Writing FAISS entity index …
⏳ Writing FAISS relation index …


'/content/faiss_indices/relation/relation_ordered.txt'

In [16]:
from huggingface_hub import HfApi, upload_folder, create_repo

api = HfApi()
# Account name of the logged-in user; used as the namespace of both repos.
user = whoami()["name"]

ENTITY_DS = f"{user}/hotpotqa_entity_faiss_index"
REL_DS = f"{user}/hotpotqa_relation_faiss_index"

# Create both dataset repos up front; exist_ok=True is passed so an already
# existing repo is accepted.
for dataset_id in (ENTITY_DS, REL_DS):
    api.create_repo(repo_id=dataset_id, repo_type="dataset", exist_ok=True)

# (progress banner, target repo, local staging folder, commit message)
upload_jobs = (
    ("🚀 Uploading entity dataset …", ENTITY_DS, ENT_OUT_DIR,
     "add entity FAISS index + mapping"),
    ("🚀 Uploading relation dataset …", REL_DS, REL_OUT_DIR,
     "add relation FAISS index + mapping"),
)
for banner, dataset_id, staged_dir, message in upload_jobs:
    print(banner)
    # An empty path_in_repo places the folder contents at the repo root.
    upload_folder(
        repo_id=dataset_id,
        repo_type="dataset",
        folder_path=staged_dir,
        path_in_repo="",
        commit_message=message,
    )

print("✔️  All done!  →")
for dataset_id in (ENTITY_DS, REL_DS):
    print("  •", dataset_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


🚀 Uploading entity dataset …


faiss_entity_flatip_gpu.index:   0%|          | 0.00/9.14G [00:00<?, ?B/s]

entity_ordered.txt:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

🚀 Uploading relation dataset …


faiss_relation_flatip_gpu.index:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

✔️  All done!  →
  • mohammad-shirkhani/hotpotqa_entity_faiss_index
  • mohammad-shirkhani/hotpotqa_relation_faiss_index
