In [None]:
# ## 1) Mount Google Drive
# Later cells read and write files under /content/drive/MyDrive, so Drive
# has to be mounted before anything else runs.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# ## 2) Install / update libraries (Sentence‑Transformers)

!pip install -q -U "transformers>=4.51.0" "sentence-transformers>=2.7.0" accelerate bitsandbytes tqdm

In [None]:
# ## 3) Load entity list

import os, math, numpy as np
from tqdm import tqdm

# Newline-delimited entity list stored on the mounted Drive.
ENT_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_entities.txt"
assert os.path.exists(ENT_PATH), ENT_PATH

# One entity per non-blank line. Only the trailing newline is removed; any
# other leading/trailing whitespace is kept as part of the entity string.
with open(ENT_PATH, 'r', encoding='utf-8') as f:
    entities = [kept.rstrip('\n') for kept in filter(str.strip, f)]

# N is the number of rows every downstream array is sized to.
N = len(entities)
print(f"Entities loaded: {N:,}")

Entities loaded: 557,825


In [None]:
# ## 4) Load Qwen3‑Embedding‑8B (no flash‑attn)
from sentence_transformers import SentenceTransformer

MODEL_ID = "Qwen/Qwen3-Embedding-8B"
BATCH_SIZE = 32  # number of entities passed to each encode() call

print("Loading SentenceTransformer model …")
model = SentenceTransformer(
    MODEL_ID,
    # No additional model parameters beyond automatic device placement and dtype.
    model_kwargs={"device_map": "auto", "torch_dtype": "auto"},
    # Left padding keeps a real token in the final position, which is the
    # position this model pools from (last-token pooling).
    tokenizer_kwargs={"padding_side": "left"},
)

# Query the embedding width once and reuse it for the memmap shape later on.
EMBED_DIM = model.get_sentence_embedding_dimension()
print("Model default embed dim:", EMBED_DIM)  # should show 4096
model.eval()


Loading SentenceTransformer model …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model default embed dim: 4096


SentenceTransformer(
  (0): Transformer({'max_seq_length': 40960, 'do_lower_case': False, 'architecture': 'Qwen3Model'})
  (1): Pooling({'word_embedding_dimension': 4096, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': True, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# ## 5) Allocate float16 memmap (N × EMBED_DIM × 2 bytes ≈ 4.6 GB for 557,825 × 4096)

EMB_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_entities_emb_4096.fp16.memmap"

# One row per entity, one column per embedding dimension.
emb_shape = (N, EMBED_DIM)

# NOTE: mode="w+" creates the file or overwrites an existing one, so re-running
# this cell discards any embeddings previously stored at EMB_PATH.
emb_mem = np.memmap(EMB_PATH, mode="w+", dtype="float16", shape=emb_shape)
print(f"Memmap ↔ {EMB_PATH} | shape {emb_shape}")

Memmap ↔ /content/drive/MyDrive/HotpotQA_snapshot/unique_entities_emb_4096.fp16.memmap | shape (557825, 4096)


In [None]:
# ## 6) Encode all entities and stream the vectors into the memmap

import torch

total_batches = math.ceil(N / BATCH_SIZE)
for batch_no in tqdm(range(total_batches), total=total_batches, desc="Embedding"):
    # Row range covered by this batch; the last batch may be shorter.
    lo = batch_no * BATCH_SIZE
    hi = min(lo + BATCH_SIZE, N)
    texts = entities[lo:hi]

    vectors = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=False,
    )

    # Store as float16 so rows match the memmap dtype; row i <-> entities[i].
    emb_mem[lo:hi] = vectors.astype("float16")

    # Every 50th batch (including the first): push pending writes to disk and
    # release cached GPU memory.
    if batch_no % 50 == 0:
        emb_mem.flush()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# Final flush, then drop the writable handle so later cells reopen the file.
emb_mem.flush()
del emb_mem
print("✔️  All embeddings stored.")

Embedding: 100%|██████████| 17433/17433 [1:00:49<00:00,  4.78it/s]

✔️  All embeddings stored.





In [None]:
# ## 7) Save entity order for index mapping

ORDER_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_entities_ordered.txt"

# Line i of this file is the entity whose embedding was written to row i,
# because both follow the order of `entities`.
with open(ORDER_PATH, 'w', encoding='utf-8') as f:
    f.write(''.join(f"{name}\n" for name in entities))
print("Order saved →", ORDER_PATH)

Order saved → /content/drive/MyDrive/HotpotQA_snapshot/unique_entities_ordered.txt


In [None]:
# ## 8) Sanity‑check one stored row

# A fixed row keeps the check reproducible; clamp it so that entity lists with
# 1000 rows or fewer do not raise IndexError.
CHECK_IDX = min(1000, N - 1)

# Reopen read-only so the check cannot modify the stored embeddings.
emb_check = np.memmap(EMB_PATH, mode='r', dtype='float16', shape=(N, EMBED_DIM))

# Upcast before taking the norm so the result is reported in float32 rather
# than rounded to float16's coarse resolution near 1.0.
row_norm = float(np.linalg.norm(np.asarray(emb_check[CHECK_IDX], dtype=np.float32)))

print(f"Entity[{CHECK_IDX}]:", entities[CHECK_IDX])
print("Sample dims:", emb_check[CHECK_IDX, :5])
print(f"Norm ≈ {row_norm:.4f}")

# Rows were L2-normalised before the float16 cast, so the norm must stay close
# to 1; a row of zeros here means the embedding loop never wrote it.
assert abs(row_norm - 1.0) < 1e-2, f"row {CHECK_IDX} is not unit-norm: {row_norm}"

First entity: $253,510
Sample dims: [ 0.02026   0.002975 -0.010864 -0.01373   0.009766]
Norm ≈ 0.998
