In [1]:
# ## 1) Mount Google Drive
# The Drive contents become reachable under the mount point below.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Mounted at /content/drive


In [2]:
# ## 2) Install / update libraries (Sentence‑Transformers)
!pip install -q -U "transformers>=4.51.0" "sentence-transformers>=2.7.0" accelerate bitsandbytes tqdm


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m117.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m470.2/470.2 kB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
# ## 3) Load relation list
import math
import os

import numpy as np
from tqdm import tqdm

# Text file on the mounted Drive; it is read line by line below.
REL_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_relations.txt"
assert os.path.exists(REL_PATH), REL_PATH

# Whitespace-only lines are skipped; kept lines lose only their trailing newline.
with open(REL_PATH, 'r', encoding='utf-8') as f:
    relations_raw = [text.rstrip('\n') for text in filter(str.strip, f)]

# N is the number of relations kept.
N = len(relations_raw)
print(f"Relations loaded: {N:,}")


Relations loaded: 139,253


In [4]:
# ## 4) Prepare texts for embedding
# Every underscore is turned into a space; relations_raw itself is not modified.
relations_for_embed = list(map(lambda name: name.replace('_', ' '), relations_raw))
# One prepared text per raw relation, in the same order.
assert len(relations_for_embed) == N

In [5]:
# ## 5) Load Qwen3‑Embedding‑8B
from sentence_transformers import SentenceTransformer

MODEL_ID  = "Qwen/Qwen3-Embedding-8B"
BATCH_SIZE = 8   # batch size used by the encoding loop

print("Loading SentenceTransformer model …")
model = SentenceTransformer(
    MODEL_ID,
    # Device placement and dtype are both left on "auto".
    model_kwargs=dict(device_map="auto", torch_dtype="auto"),
    # The tokenizer is configured to pad on the left.
    tokenizer_kwargs=dict(padding_side="left"),
)
model.eval()
EMBED_DIM = model.get_sentence_embedding_dimension()   # should be 4096

print("Model embed dim:", EMBED_DIM)


Loading SentenceTransformer model …


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/729 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

Model embed dim: 4096


In [6]:
# ## 6) Allocate float16 memmap (≈ 1.1 GB)
EMB_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_relations_emb_4096.fp16.memmap"

# One row per relation, one column per embedding dimension.
emb_shape = (N, EMBED_DIM)
# NOTE: mode="w+" creates the file or overwrites an existing one, so re-running
# this cell discards any embeddings previously stored at EMB_PATH.
emb_mem = np.memmap(EMB_PATH, dtype="float16", mode="w+", shape=emb_shape)
print(f"Memmap ↔ {EMB_PATH} | shape {emb_shape}")


Memmap ↔ /content/drive/MyDrive/HotpotQA_snapshot/unique_relations_emb_4096.fp16.memmap | shape (139253, 4096)


In [7]:
# ## 7) Batch‑encode & store embeddings
import torch

# Walk the prepared texts in consecutive slices of BATCH_SIZE items.
n_batches = math.ceil(N / BATCH_SIZE)
for batch_no in tqdm(range(n_batches), total=n_batches, desc="Embedding"):
    start = batch_no * BATCH_SIZE
    batch_texts = relations_for_embed[start : start + BATCH_SIZE]

    # Ask for normalised NumPy vectors, then down-cast them to float16
    # (the dtype the memmap was allocated with).
    vectors = model.encode(
        batch_texts,
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        normalize_embeddings=True,
        convert_to_numpy=True,
    )
    emb = vectors.astype("float16")

    # The last batch may be shorter, so the row span follows the actual batch length.
    emb_mem[start : start + len(batch_texts)] = emb

    # Every 50th batch (starting with the first): write pending rows to disk
    # and ask PyTorch to release cached CUDA memory.
    if batch_no % 50 == 0:
        emb_mem.flush()
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

# Final flush for the batches after the last periodic one, then drop the writable map.
emb_mem.flush()
del emb_mem
print("✔️  All relation embeddings stored.")


Embedding: 100%|██████████| 17407/17407 [25:06<00:00, 11.55it/s]

✔️  All relation embeddings stored.





In [8]:
# ## 8) Save relation order for index mapping
ORDER_PATH = "/content/drive/MyDrive/HotpotQA_snapshot/unique_relations_ordered.txt"

# One relation per line, in relations_raw order, each terminated by a newline.
with open(ORDER_PATH, 'w', encoding='utf-8') as f:
    f.write(''.join(f"{rel}\n" for rel in relations_raw))

print("Order saved →", ORDER_PATH)


Order saved → /content/drive/MyDrive/HotpotQA_snapshot/unique_relations_ordered.txt


In [9]:
# ## 9) Sanity‑check a random row
# Re-open the stored embeddings read-only with the dtype and shape used when writing.
emb_check = np.memmap(EMB_PATH, dtype='float16', mode='r', shape=(N, EMBED_DIM))

idx = 1234   # or any other index of your choice
print("Relation:", relations_raw[idx])
print("For‑embed:", relations_for_embed[idx])

# Read the row once, then show its first five components and its L2 norm
# (embeddings were requested normalised, so the norm should be close to 1).
sample_row = emb_check[idx]
print("Sample dims:", sample_row[:5])
print("Norm ≈", np.linalg.norm(sample_row))


Relation: acquired_on
For‑embed: acquired on
Sample dims: [ 0.02087   0.0221   -0.0058   -0.03052   0.012695]
Norm ≈ 0.999
