In [1]:
!pip install -q --no-cache-dir faiss-gpu-cu12 \
                       networkx tqdm \
                       "huggingface_hub>=0.23.0"


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from pathlib import Path

BASE_DIR   = Path("/content/drive/MyDrive/HotpotQA_snapshot")

Q_EMB_NPY        = BASE_DIR / "questions_emb_fp16.npy"
E_EMB_NPY        = BASE_DIR / "entities_emb_fp16.npy"
Q_TXT            = BASE_DIR / "questions_ordered.txt"
E_TXT            = BASE_DIR / "entities_ordered.txt"
EVIDENCE_JSON    = BASE_DIR / "evidence_hotpotqa_entities.json"
KG_PATH          = BASE_DIR / "hotpotqa_kg_v2.gpickle"

EMB_DIM          = 4096         


In [4]:
from huggingface_hub import hf_hub_download

HF_CACHE = "/content/hf_indices"        

ENT_INDEX_FILE = hf_hub_download(
    repo_id   ="mohammad-shirkhani/hotpotqa_entity_faiss_index",
    filename  ="faiss_entity_flatip_gpu.index",
    repo_type ="dataset",
    cache_dir = HF_CACHE
)
ENT_MAP_FILE = hf_hub_download(
    repo_id   ="mohammad-shirkhani/hotpotqa_entity_faiss_index",
    filename  ="entity_ordered.txt",
    repo_type ="dataset",
    cache_dir = HF_CACHE
)

REL_INDEX_FILE = hf_hub_download(
    repo_id   ="mohammad-shirkhani/hotpotqa_relation_faiss_index",
    filename  ="faiss_relation_flatip_gpu.index",
    repo_type ="dataset",
    cache_dir = HF_CACHE
)
REL_MAP_FILE = hf_hub_download(
    repo_id   ="mohammad-shirkhani/hotpotqa_relation_faiss_index",
    filename  ="relation_ordered.txt",
    repo_type ="dataset",
    cache_dir = HF_CACHE
)

print("✓ FAISS indices downloaded to", HF_CACHE)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


faiss_entity_flatip_gpu.index:   0%|          | 0.00/9.14G [00:00<?, ?B/s]

entity_ordered.txt:   0%|          | 0.00/11.2M [00:00<?, ?B/s]

faiss_relation_flatip_gpu.index:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

relation_ordered.txt: 0.00B [00:00, ?B/s]

✓ FAISS indices downloaded to /content/hf_indices


In [5]:
import json, numpy as np, textwrap

q_emb = np.load(Q_EMB_NPY).astype("float32")   # (100, 4096)
e_emb = np.load(E_EMB_NPY).astype("float32")   # (~300, 4096)

with open(Q_TXT, encoding="utf-8") as f:
    questions = [ln.rstrip("\n") for ln in f if ln.strip()]

with open(E_TXT, encoding="utf-8") as f:
    entities = [ln.rstrip("\n") for ln in f if ln.strip()]

assert len(questions) == q_emb.shape[0], "عدم تطابق تعداد سؤال‌ها و embedding‌ها"
assert len(entities)  == e_emb.shape[0], "عدم تطابق تعداد موجودیت‌ها و embedding‌ها"

print(f"✓ Questions : {len(questions):,} | Embeddings → {q_emb.shape}")
print(f"✓ Entities  : {len(entities):,} | Embeddings → {e_emb.shape}")

print("\nنمونه سؤال:\n", textwrap.fill(questions[0], 80))


✓ Questions : 100 | Embeddings → (100, 4096)
✓ Entities  : 222 | Embeddings → (222, 4096)

نمونه سؤال:
 Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?


In [6]:
import gzip, pickle, time, networkx as nx

t0 = time.time()
with gzip.open(KG_PATH, "rb") as f:
    KG = pickle.load(f)

print(f"✓ KG بارگذاری شد:  nodes={KG.number_of_nodes():,} | "
      f"edges={KG.number_of_edges():,} | {(time.time()-t0):.1f}s")


✓ KG بارگذاری شد:  nodes=557,821 | edges=1,075,644 | 9.5s


In [7]:
import faiss

ent_index = faiss.read_index(ENT_INDEX_FILE)
rel_index = faiss.read_index(REL_INDEX_FILE)

with open(ENT_MAP_FILE, encoding="utf-8") as f:
    ent_labels = [ln.rstrip("\n") for ln in f if ln.strip()]
with open(REL_MAP_FILE, encoding="utf-8") as f:
    rel_labels = [ln.rstrip("\n") for ln in f if ln.strip()]

print("✓ Entity-index ntotal :", ent_index.ntotal, "| labels:", len(ent_labels))
print("✓ Relation-index ntotal:", rel_index.ntotal, "| labels:", len(rel_labels))


✓ Entity-index ntotal : 557825 | labels: 557825
✓ Relation-index ntotal: 139253 | labels: 139253


In [8]:
import random

qid = random.randrange(len(questions))
print("🔎  سؤال تصادفی:", questions[qid])

D, I = ent_index.search(q_emb[qid:qid+1], 5)
print("\nنزدیک‌ترین موجودیت‌ها:")
for rank, (idx, score) in enumerate(zip(I[0], D[0]), 1):
    print(f"{rank:>2}) {ent_labels[idx]}  (cos={score:.4f})")


🔎  سؤال تصادفی: Which band was formed first, Wavves or Social Code?

نزدیک‌ترین موجودیت‌ها:
 1) social code's  (cos=0.5515)
 2) social-code  (cos=0.5287)
 3) social code  (cos=0.5245)
 4) joywave  (cos=0.5201)
 5) members_of_fat_wreck_bands  (cos=0.5159)


In [9]:
import json, numpy as np
from pprint import pprint

qid = 0
with EVIDENCE_JSON.open(encoding="utf-8") as f:
    full_data = json.load(f)

q_text      = full_data[qid]["question"]
q_entities  = full_data[qid]["entities"] or []

print("🟢 سؤال:", q_text)
print("🟢 موجودیت‌ها:", q_entities)

D_rel, I_rel = rel_index.search(q_emb[qid:qid+1], 5)
print("\n🔻 ۳ رابطهٔ نزدیک به سؤال")
for rank, (idx, score) in enumerate(zip(I_rel[0], D_rel[0]), 1):
    print(f"{rank}) {rel_labels[idx]}   (cos={score:.4f})")

print("\n🔻 ۳ موجودیتِ نزدیک برای هر entity سؤال")
for ent in q_entities:
    try:
        local_idx = entities.index(ent)          
    except ValueError:
        print(f"⚠️ «{ent}» در لیست embedding‌های محلی نبود.")
        continue

    vec = e_emb[local_idx:local_idx+1]            
    D_ent, I_ent = ent_index.search(vec, 5)

    print(f"\n► {ent}:")
    for rank, (idx, score) in enumerate(zip(I_ent[0], D_ent[0]), 1):
        print(f"   {rank}) {ent_labels[idx]}   (cos={score:.4f})")


🟢 سؤال: Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?
🟢 موجودیت‌ها: ['Monkey Kingdom', 'Anaganaga O Dheerudu']

🔻 ۳ رابطهٔ نزدیک به سؤال
1) influenced_by_disney_acquiring   (cos=0.6179)
2) disney_film   (cos=0.5811)
3) reported_disney_considering_reviving   (cos=0.5726)
4) included_in_disney+   (cos=0.5701)
5) is_second_live-action_disney_renaissance   (cos=0.5677)

🔻 ۳ موجودیتِ نزدیک برای هر entity سؤال

► Monkey Kingdom:
   1) monkey kingdom   (cos=0.9221)
   2) monkey kingdom's   (cos=0.8724)
   3) jungle emperor   (cos=0.8607)
   4) simba's pride   (cos=0.8548)
   5) mufasa: the lion king   (cos=0.8503)

► Anaganaga O Dheerudu:
   1) baahubali: the beginning   (cos=0.8323)
   2) rangasthalam   (cos=0.8313)
   3) mani ratnam   (cos=0.8302)
   4) naayak   (cos=0.8178)
   5) baahubali 2: the conclusion   (cos=0.8138)


In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_NAME = "Qwen/Qwen3-14B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

GEN_DEFAULTS = dict(
    max_new_tokens = 4096,
    temperature    = 0.7,
    top_p          = 0.8,
    top_k          = 20,
    do_sample      = True,
    eos_token_id   = tokenizer.eos_token_id,
)

def qwen_generate(messages, **kw):
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False
    )
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    out = model.generate(**inputs, **{**GEN_DEFAULTS, **kw})[0]
    result_ids = out[len(inputs.input_ids[0]):]
    return tokenizer.decode(result_ids, skip_special_tokens=True).strip()

print("✓ Qwen3-14 B ready (Thinking OFF).")


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/3.84G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

✓ Qwen3-14 B ready (Thinking OFF).


In [11]:
demo = qwen_generate([{"role":"user", "content": "Give me a one-sentence bio of Ada Lovelace."}],
                     max_new_tokens=4096)
print(demo)

Ada Lovelace was the world's first computer programmer, renowned for her work on Charles Babbage's early mechanical general-purpose computer, the Analytical Engine.


In [12]:
print(model.device)


cuda:0


In [13]:
print("قبل از تبدیل →   directed =", KG.is_directed())
KG_u = KG.to_undirected(as_view=False)   
print("بعد از تبدیل  →   directed =", KG_u.is_directed())
print("گراف جدید: nodes =", KG_u.number_of_nodes(), "| edges =", KG_u.number_of_edges())


قبل از تبدیل →   directed = True
بعد از تبدیل  →   directed = False
گراف جدید: nodes = 557821 | edges = 1073632


In [14]:

import numpy as np

qid = 0                      
q_vec = q_emb[qid:qid+1]        

D_q_ent, I_q_ent = ent_index.search(q_vec, 3)
top_q_entities   = [ent_labels[i] for i in I_q_ent[0]]

extra_entities = []
for ent in (full_data[qid]["entities"] or []):
    try:
        local_idx = entities.index(ent)
    except ValueError:
        print(f"⚠️ «{ent}» در لیست کوچک e_emb نبود—رد می‌شود.")
        continue
    vec = e_emb[local_idx:local_idx+1]
    D_tmp, I_tmp = ent_index.search(vec, 3)
    extra_entities.extend(ent_labels[i] for i in I_tmp[0])

seed_entities = list(dict.fromkeys(top_q_entities + extra_entities))   # حفظ ترتیب

print("🟢 سه موجودیتِ نزدیک به سؤال:", top_q_entities)
print("🟢 موجودیت‌های استخراج‌شده از متن سؤال:", full_data[qid]["entities"])
print("🟢 ۳× موجودیت برای هرکدام:", extra_entities)
print("🟢 مجموع (حذف تکراری) →", len(seed_entities), "مورد:\n", seed_entities)


🟢 سه موجودیتِ نزدیک به سؤال: ["monkey kingdom's", 'disney-related', 'disney animated and live-action films']
🟢 موجودیت‌های استخراج‌شده از متن سؤال: ['Monkey Kingdom', 'Anaganaga O Dheerudu']
🟢 ۳× موجودیت برای هرکدام: ['monkey kingdom', "monkey kingdom's", 'jungle emperor', 'baahubali: the beginning', 'rangasthalam', 'mani ratnam']
🟢 مجموع (حذف تکراری) → 8 مورد:
 ["monkey kingdom's", 'disney-related', 'disney animated and live-action films', 'monkey kingdom', 'jungle emperor', 'baahubali: the beginning', 'rangasthalam', 'mani ratnam']


In [15]:
valid_nodes = []
for ent in seed_entities:
    try:
        nid = ent_labels.index(ent)   
    except ValueError:
        print(f"⚠️ «{ent}» در ent_labels پیدا نشد—رد می‌شود.")
        continue
    deg = KG_u.degree(nid)
    if deg == 0:
        print(f"⚠️ «{ent}» (id={nid}) degree=0—حذف از لیست.")
        continue
    valid_nodes.append((ent, nid, deg))

print("\n🟢 موجودیت‌های باقی‌مانده:", len(valid_nodes))
for ent, nid, deg in valid_nodes:
    print(f"  • {ent}   (node={nid}, degree={deg})")



🟢 موجودیت‌های باقی‌مانده: 8
  • monkey kingdom's   (node=322082, degree=1)
  • disney-related   (node=157735, degree=1)
  • disney animated and live-action films   (node=157523, degree=1)
  • monkey kingdom   (node=322081, degree=38)
  • jungle emperor   (node=268501, degree=4)
  • baahubali: the beginning   (node=84349, degree=2)
  • rangasthalam   (node=390499, degree=16)
  • mani ratnam   (node=301891, degree=1)


In [16]:
import numpy as np, os, gc
from pathlib import Path

REL_EMB_PATH = BASE_DIR / "unique_relations_emb_4096.fp16.memmap"
ENT_EMB_PATH = BASE_DIR / "unique_entities_emb_4096.fp16.memmap"
REL_TXT_PATH = BASE_DIR / "unique_relations_ordered.txt"
ENT_TXT_PATH = BASE_DIR / "unique_entities_ordered.txt"

assert REL_EMB_PATH.exists() and ENT_EMB_PATH.exists(), "فایل‌های memmap پیدا نشد."

print("⏳ Loading relation embeddings memmap …")
_rel_mm = np.memmap(REL_EMB_PATH, mode="r", dtype="float16").reshape(-1, EMB_DIM)
rel_emb = np.asarray(_rel_mm, dtype="float32")    # ~ (139 k, 4096) → ≈ 2.2 GB
del _rel_mm ; gc.collect()

print("⏳ Loading entity embeddings memmap …")
_ent_mm = np.memmap(ENT_EMB_PATH, mode="r", dtype="float16").reshape(-1, EMB_DIM)
ent_emb_full = np.asarray(_ent_mm, dtype="float32")   # ~ (557 k, 4096) → ≈ 9.1 GB
del _ent_mm ; gc.collect()

print("✓ Embeddings copied to RAM » rel_emb:", rel_emb.shape, "| ent_emb_full:", ent_emb_full.shape)

if 'rel_labels_full' not in globals():
    with REL_TXT_PATH.open(encoding="utf-8") as f:
        rel_labels_full = [ln.rstrip("\n") for ln in f if ln.strip()]
if 'ent_labels_full' not in globals():
    with ENT_TXT_PATH.open(encoding="utf-8") as f:
        ent_labels_full = [ln.rstrip("\n") for ln in f if ln.strip()]

print("✓ label lists loaded:", len(rel_labels_full), "relations  |", len(ent_labels_full), "entities")


⏳ Loading relation embeddings memmap …
⏳ Loading entity embeddings memmap …
✓ Embeddings copied to RAM » rel_emb: (139253, 4096) | ent_emb_full: (557825, 4096)
✓ label lists loaded: 139253 relations  | 557825 entities


In [17]:

q_vec32 = q_vec.astype("float32")       

entity_edges = {}      
for ent, nid, deg in valid_nodes:
    edges = list(KG_u.edges(nid, keys=True, data=True))
    if not edges:
        continue

    rel_idx_arr = np.fromiter(
        (data["emb_idx"] for (_, _, _, data) in edges),
        dtype=np.int32,
        count=len(edges)
    )
    rel_vecs = rel_emb[rel_idx_arr]              
    sims     = rel_vecs @ q_vec32.T               
    sims     = sims.ravel()

    if len(edges) > 30:
        top_idx = np.argpartition(-sims, 30)[:30]
    else:
        top_idx = np.arange(len(edges))

    sel_order = top_idx[np.argsort(-sims[top_idx])]

    path_list = []
    for ei in sel_order:
        h, t, k, data = edges[ei]
        src_lbl = ent_labels_full[h]
        dst_lbl = ent_labels_full[t]
        rel_lbl = rel_labels_full[k]
        if src_lbl != ent:
            src_lbl, dst_lbl = dst_lbl, src_lbl
        path_list.append(
            ( (h, t, k), float(sims[ei]), f"{src_lbl} -> {rel_lbl} -> {dst_lbl}" )
        )

    entity_edges[ent] = path_list

    print(f"\n🟢 {ent}: degree={deg} | kept={len(path_list)}")
    for rank, (_, sc, pstr) in enumerate(path_list[:], 1):
        print(f"  {rank:2}) cos={sc:.4f} | {pstr}")



🟢 monkey kingdom's: degree=1 | kept=1
   1) cos=0.3674 | monkey kingdom's -> is -> monkey kingdom

🟢 disney-related: degree=1 | kept=1
   1) cos=0.3123 | disney-related -> is_stub -> wikipedia

🟢 disney animated and live-action films: degree=1 | kept=1
   1) cos=0.3790 | disney animated and live-action films -> themed_after -> disney's cinema parade

🟢 monkey kingdom: degree=38 | kept=30
   1) cos=0.4082 | monkey kingdom -> directed_by -> mark linfield
   2) cos=0.4082 | monkey kingdom -> directed_by -> alastair fothergill
   3) cos=0.3959 | monkey kingdom -> produced -> mark linfield
   4) cos=0.3959 | monkey kingdom -> produced -> alastair fothergill
   5) cos=0.3951 | monkey kingdom -> narrated_by -> tina fey
   6) cos=0.3949 | monkey kingdom -> released_across -> 2,012 theaters
   7) cos=0.3924 | monkey kingdom -> released_in -> united states
   8) cos=0.3924 | monkey kingdom -> released_in -> 2015
   9) cos=0.3896 | monkey kingdom -> released_by -> disneynature
  10) cos=0.3820 |

In [18]:
import re, textwrap

def prompt_select_paths(question: str, entity: str, paths: list[str], k: int) -> str:
    """
    Build an English prompt asking the LLM to pick k best paths.
    Each path already looks like 'A -> R -> B'.
    """
    numbered_paths = "\n".join(f"{i+1}: {p}" for i, p in enumerate(paths))
    return textwrap.dedent(f"""
        **User Question**
        {question}

        **Task**
        You are navigating a heterogeneous knowledge-graph.
        Below are candidate one-hop *paths* that all start at the entity **{entity}**.
        Pick exactly **{k} paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,
          and wrap them inside `<answer>` … `</answer>`.
        • Do **not** output anything else.

        **Candidate paths**
        {numbered_paths}

        <answer>
    """).strip()

def parse_llm_indices(raw_response: str) -> list[int]:
    """
    Extract comma-separated integers between <answer> tags.
    """
    m = re.search(r"<answer>(.*?)</answer>", raw_response, flags=re.S|re.I)
    if not m:
        return []
    nums = re.findall(r"\d+", m.group(1))
    return [int(n) for n in nums]

chosen_paths = []          # paths chosen across all entities

for ent, lst in entity_edges.items():
    paths_only = [p for *_, p in lst]               # strip score / tuple
    if len(paths_only) <= 5:
        print(f"⏭️  {ent}: only {len(paths_only)} paths (≤ 5) → skipped.")
        chosen_paths.extend(paths_only)              # keep them unfiltered
        continue

    prompt = prompt_select_paths(q_text, ent, paths_only, k=5)
    print("\n" + "="*80)
    print(f"🔹 PROMPT sent for entity **{ent}**:\n")
    print(prompt[:2000] + ("\n[…truncated…]" if len(prompt) > 2000 else ""))  # avoid massive dump

    llm_resp = qwen_generate([{"role": "user", "content": prompt}], max_new_tokens=64)
    print("\n🔹 RAW LLM response:\n", llm_resp)

    idx_list = parse_llm_indices(llm_resp)
    print("🔹 Parsed indices:", idx_list)

    selected = [paths_only[i-1] for i in idx_list if 1 <= i <= len(paths_only)]
    if not selected:        # fallback: take top-5 by original ranking
        selected = paths_only[:5]

    print("🔹 Final kept paths:")
    for p in selected:
        print("   •", p)

    chosen_paths.extend(selected)

print("\n" + "="*80)
print(f"✅ Total paths after pass-1: {len(chosen_paths)}")


⏭️  monkey kingdom's: only 1 paths (≤ 5) → skipped.
⏭️  disney-related: only 1 paths (≤ 5) → skipped.
⏭️  disney animated and live-action films: only 1 paths (≤ 5) → skipped.

🔹 PROMPT sent for entity **monkey kingdom**:

**User Question**
        Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?

        **Task**
        You are navigating a heterogeneous knowledge-graph.  
        Below are candidate one-hop *paths* that all start at the entity **monkey kingdom**.  
        Pick exactly **5 paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,  
          and wrap them inside `<answer>` … `</answer>`.  
        • Do **not** output anything else.

        **Candidate paths**
        1: monkey kingdom -> directed_by -> mark linfield
2: monkey kingdom -> directed_by -> alastair fothergill
3: monkey kingdom -> p

In [19]:
if len(chosen_paths) <= 10:
    print(f"🔸 Only {len(chosen_paths)} paths ≤ 10 → no second filtering needed.")
    final_paths = chosen_paths
else:
    prompt_all = prompt_select_paths(
        question = q_text,
        entity   = "the *combined set*",    # generic label
        paths    = chosen_paths,
        k        = 10
    )
    print("\n" + "="*80)
    print("🔹 PROMPT for final 10-path selection:\n")
    print(prompt_all[:2000] + ("\n[…truncated…]" if len(prompt_all) > 2000 else ""))

    llm_resp2 = qwen_generate([{"role": "user", "content": prompt_all}], max_new_tokens=64)
    print("\n🔹 RAW LLM response:\n", llm_resp2)

    idx_final = parse_llm_indices(llm_resp2)
    print("🔹 Parsed indices:", idx_final)

    final_paths = [chosen_paths[i-1] for i in idx_final if 1 <= i <= len(chosen_paths)]
    if not final_paths:                     # safety fallback
        final_paths = chosen_paths[:10]

print("\n🎯 **Final 10 (or fewer) paths selected:**")
for p in final_paths:
    print(" •", p)



🔹 PROMPT for final 10-path selection:

**User Question**
        Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?

        **Task**
        You are navigating a heterogeneous knowledge-graph.  
        Below are candidate one-hop *paths* that all start at the entity **the *combined set***.  
        Pick exactly **10 paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,  
          and wrap them inside `<answer>` … `</answer>`.  
        • Do **not** output anything else.

        **Candidate paths**
        1: monkey kingdom's -> is -> monkey kingdom
2: disney-related -> is_stub -> wikipedia
3: disney animated and live-action films -> themed_after -> disney's cinema parade
4: monkey kingdom -> released_by -> disneynature
5: monkey kingdom -> released -> disneynature
6: monkey kingdom -> opened_simultaneousl

In [20]:
import re, textwrap, collections

path2edge: dict[str, tuple[int,int,int]] = {}
for ent, lst in entity_edges.items():
    for (h,t,k), _, pstr in lst:
        path2edge[pstr] = (h,t,k)

two_hop_candidates = []      
q_vec32 = q_vec.astype("float32")

for pstr in final_paths:                    
    h1, t1, k1 = path2edge[pstr]            
    e1_lbl, _, e2_lbl = [s.strip() for s in pstr.split("->")]
    e1_lbl = e1_lbl.strip(); e2_lbl = e2_lbl.strip()
    nid2    = ent_labels_full.index(e2_lbl)

    all_edges = [e for e in KG_u.edges(nid2, keys=True, data=True)]
    all_edges = [
        (h,t,k,d) for (h,t,k,d) in all_edges
        if not ({h,t}=={h1,t1} and k==k1)
    ]

    if not all_edges:                
        two_hop_candidates.append(pstr)
        continue

    rel_idx_arr = np.fromiter( (d["emb_idx"] for (_,_,_,d) in all_edges), dtype=np.int32 )
    sims = (rel_emb[rel_idx_arr] @ q_vec32.T).ravel()

    if len(all_edges) > 30:
        keep_idx = np.argpartition(-sims, 30)[:30]
    else:
        keep_idx = np.arange(len(all_edges))
    keep_idx = keep_idx[np.argsort(-sims[keep_idx])]   # sort desc.

    for ei in keep_idx:
        h2, t2, k2, data2 = all_edges[ei]
        src2_lbl = ent_labels_full[h2]
        dst2_lbl = ent_labels_full[t2]
        rel2_lbl = rel_labels_full[k2]
        if src2_lbl != e2_lbl:
            src2_lbl, dst2_lbl = dst2_lbl, src2_lbl
        two_hop_candidates.append(f"{e1_lbl} -> {rel_labels_full[k1]} -> {e2_lbl} "
                                  f"-> {rel2_lbl} -> {dst2_lbl}")

print(f"🟢 Total 2-hop candidate paths generated: {len(two_hop_candidates)}")


🟢 Total 2-hop candidate paths generated: 138


In [21]:
pprint(two_hop_candidates)

["monkey kingdom's -> is -> monkey kingdom -> directed_by -> mark linfield",
 "monkey kingdom's -> is -> monkey kingdom -> directed_by -> alastair "
 'fothergill',
 "monkey kingdom's -> is -> monkey kingdom -> produced -> mark linfield",
 "monkey kingdom's -> is -> monkey kingdom -> produced -> alastair fothergill",
 "monkey kingdom's -> is -> monkey kingdom -> narrated_by -> tina fey",
 "monkey kingdom's -> is -> monkey kingdom -> released_across -> 2,012 "
 'theaters',
 "monkey kingdom's -> is -> monkey kingdom -> released_in -> united states",
 "monkey kingdom's -> is -> monkey kingdom -> released_in -> 2015",
 "monkey kingdom's -> is -> monkey kingdom -> released_by -> disneynature",
 "monkey kingdom's -> is -> monkey kingdom -> directed -> alastair fothergill",
 "monkey kingdom's -> is -> monkey kingdom -> directed -> mark linfield",
 "monkey kingdom's -> is -> monkey kingdom -> released_on -> april 17, 2015",
 "monkey kingdom's -> is -> monkey kingdom -> released_on -> april 17",

In [22]:
entity2_groups = collections.defaultdict(list)   

for p in two_hop_candidates:
    parts = [s.strip() for s in p.split("->")]
    entity2 = parts[2]                           
    entity2_groups[entity2].append(p)

chosen2_paths = []

for ent2, plist in entity2_groups.items():
    if len(plist) <= 5:
        chosen2_paths.extend(plist)
        continue

    prompt2 = prompt_select_paths(
        question = q_text,
        entity   = ent2,
        paths    = plist,
        k        = 5
    )
    print("\n" + "="*90)
    print(f"🔸 PROMPT for entity2 **{ent2}**\n")
    print(prompt2[:2000] + ("\n[…truncated…]" if len(prompt2) > 2000 else ""))

    resp2 = qwen_generate([{"role":"user","content":prompt2}], max_new_tokens=64)
    print("\n🔸 RAW LLM response:\n", resp2)

    idx2 = parse_llm_indices(resp2)
    picked = [plist[i-1] for i in idx2 if 1 <= i <= len(plist)]
    if not picked: picked = plist[:5]

    print("🔸 Chosen:")
    for pth in picked: print("   •", pth)
    chosen2_paths.extend(picked)

if len(chosen2_paths) > 10:
    prompt_final2 = prompt_select_paths(
        question = q_text,
        entity   = "the combined 2-hop set",
        paths    = chosen2_paths,
        k        = 10
    )
    print("\n" + "="*90)
    print("🔸 PROMPT for final 10-path selection (Hop-2):\n")
    print(prompt_final2[:2000] + ("\n[…truncated…]" if len(prompt_final2) > 2000 else ""))

    resp_final2 = qwen_generate([{"role":"user","content":prompt_final2}], max_new_tokens=64)
    print("\n🔸 RAW LLM response:\n", resp_final2)

    idx_final2 = parse_llm_indices(resp_final2)
    final2_paths = [chosen2_paths[i-1] for i in idx_final2 if 1 <= i <= len(chosen2_paths)]
    if not final2_paths: final2_paths = chosen2_paths[:10]
else:
    final2_paths = chosen2_paths

print("\n" + "="*90)
print(f"🎯 **Final Hop-2 paths ({len(final2_paths)})**")
for p in final2_paths:
    print(" •", p)



🔸 PROMPT for entity2 **monkey kingdom**

**User Question**
        Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?

        **Task**
        You are navigating a heterogeneous knowledge-graph.  
        Below are candidate one-hop *paths* that all start at the entity **monkey kingdom**.  
        Pick exactly **5 paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,  
          and wrap them inside `<answer>` … `</answer>`.  
        • Do **not** output anything else.

        **Candidate paths**
        1: monkey kingdom's -> is -> monkey kingdom -> directed_by -> mark linfield
2: monkey kingdom's -> is -> monkey kingdom -> directed_by -> alastair fothergill
3: monkey kingdom's -> is -> monkey kingdom -> produced -> mark linfield
4: monkey kingdom's -> is -> monkey kingdom -> produced -> alastair fothergil

In [23]:
import itertools, numpy as np, textwrap, re, collections

def split_path(path: str) -> list[str]:
    """['entity1', 'relation1', 'entity2', …]"""
    return [x.strip() for x in path.split("->")]

def exclude_incoming_edges(edges, id_prev, rel_prev_idx):
    out = []
    for h,t,k,d in edges:
        if {h,t}=={id_prev} | {id_prev} and k==rel_prev_idx:
            if (h==id_prev or t==id_prev):
                continue
        out.append((h,t,k,d))
    return out

three_hop_candidates = []       # list[str]
q32 = q_vec.astype("float32")

for p2 in final2_paths:
    tokens = split_path(p2)
    if len(tokens) < 5:
        three_hop_candidates.append(p2)
        continue

    ent1_lbl, rel1_lbl, ent2_lbl, rel2_lbl, ent3_lbl = tokens[:5]
    id3   = ent_labels_full.index(ent3_lbl)
    id2   = ent_labels_full.index(ent2_lbl)
    rel2_idx = rel_labels_full.index(rel2_lbl)

    nbr_edges = list(KG_u.edges(id3, keys=True, data=True))
    pruned_edges = [
        (h,t,k,d) for (h,t,k,d) in nbr_edges
        if not ({h,t}=={id2,id3} and k==rel2_idx)
    ]

    if not pruned_edges:
        three_hop_candidates.append(p2)      
        continue

    rel_idx_arr = np.fromiter((d["emb_idx"] for *_,d in pruned_edges), dtype=np.int32)
    sims = (rel_emb[rel_idx_arr] @ q32.T).ravel()
    if len(pruned_edges) > 30:
        keep = np.argpartition(-sims, 30)[:30]
    else:
        keep = np.arange(len(pruned_edges))
    ordered = keep[np.argsort(-sims[keep])]

    for idx in ordered:
        h3,t3,k3,d3 = pruned_edges[idx]
        src3, dst3 = ent_labels_full[h3], ent_labels_full[t3]
        if src3 != ent3_lbl:
            src3, dst3 = dst3, src3
        rel3_lbl = rel_labels_full[k3]
        three_hop_candidates.append(
            f"{ent1_lbl} -> {rel1_lbl} -> {ent2_lbl} -> {rel2_lbl} -> "
            f"{ent3_lbl} -> {rel3_lbl} -> {dst3}"
        )

print(f"🟢 Generated 3-hop candidates: {len(three_hop_candidates)}")


🟢 Generated 3-hop candidates: 176


In [24]:
# grouping by entity3
group3 = collections.defaultdict(list)
for p in three_hop_candidates:
    entity3_lbl = split_path(p)[4] if len(split_path(p)) >= 5 else split_path(p)[-1]
    group3[entity3_lbl].append(p)

chosen3 = []

for ent3, plist in group3.items():
    if len(plist) <= 5:
        chosen3.extend(plist)
        continue
    prompt3 = prompt_select_paths(
        question = q_text,
        entity   = ent3,
        paths    = plist,
        k        = 5
    )
    print("\n" + "="*100)
    print(f"🔷 PROMPT for entity3 **{ent3}**\n")
    print(prompt3[:2000] + ("\n[…truncated…]" if len(prompt3) > 2000 else ""))

    resp3 = qwen_generate([{"role":"user","content":prompt3}], max_new_tokens=64)
    print("\n🔷 RAW LLM response:\n", resp3)

    idx3 = parse_llm_indices(resp3)
    picked3 = [plist[i-1] for i in idx3 if 1 <= i <= len(plist)]
    if not picked3:
        picked3 = plist[:5]

    print("🔷 Chosen paths for entity3:")
    for pth in picked3: print("   •", pth)
    chosen3.extend(picked3)

if len(chosen3) > 4:
    prompt_final3 = prompt_select_paths(
        question = q_text,
        entity   = "the combined 3-hop set",
        paths    = chosen3,
        k        = 4
    )
    print("\n" + "="*100)
    print("🔶 PROMPT for final 4-path selection (Hop-3):\n")
    print(prompt_final3[:2000] + ("\n[…truncated…]" if len(prompt_final3) > 2000 else ""))

    resp_final3 = qwen_generate([{"role":"user","content":prompt_final3}], max_new_tokens=64)
    print("\n🔶 RAW LLM response:\n", resp_final3)

    idx_final3 = parse_llm_indices(resp_final3)
    final3 = [chosen3[i-1] for i in idx_final3 if 1 <= i <= len(chosen3)]
    if not final3: final3 = chosen3[:4]
else:
    final3 = chosen3

print("\n" + "="*100)
print(f"🎯 **Final Hop-3 paths ({len(final3)})**")
for p in final3:
    print(" •", p)



🔷 PROMPT for entity3 **disneynature**

**User Question**
        Were both Monkey Kingdom and Anaganaga O Dheerudu connected with Disney?

        **Task**
        You are navigating a heterogeneous knowledge-graph.  
        Below are candidate one-hop *paths* that all start at the entity **disneynature**.  
        Pick exactly **5 paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,  
          and wrap them inside `<answer>` … `</answer>`.  
        • Do **not** output anything else.

        **Candidate paths**
        1: monkey kingdom's -> is -> monkey kingdom -> released_by -> disneynature -> released_in -> earth day
2: monkey kingdom's -> is -> monkey kingdom -> released_by -> disneynature -> released_in -> the united states
3: monkey kingdom's -> is -> monkey kingdom -> released_by -> disneynature -> released_on -> april

In [25]:
def split_triplets(path: str) -> list[tuple[str, str, str]]:
    """'E1 -> R1 -> E2 -> …' ⟶ [(E1,R1,E2), (E2,R2,E3), …]"""
    toks = [t.strip() for t in path.split("->")]
    return [(toks[i], toks[i+1], toks[i+2]) for i in range(0, len(toks)-2, 2)]

def edge_chunk_ids(src_lbl: str, rel_lbl: str, dst_lbl: str) -> list[int]:
    s_idx = ent_labels_full.index(src_lbl)
    d_idx = ent_labels_full.index(dst_lbl)
    r_idx = rel_labels_full.index(rel_lbl)

    data = KG_u.get_edge_data(s_idx, d_idx, key=r_idx) or \
           KG_u.get_edge_data(d_idx, s_idx, key=r_idx)
    return data.get("chunk_ids", []) if data else []

relation_chunk_ids: set[int] = set()

for path in final3:
    for s_lbl, r_lbl, d_lbl in split_triplets(path):
        relation_chunk_ids.update(edge_chunk_ids(s_lbl, r_lbl, d_lbl))

print(f"🟢 Unique relation-chunk_ids = {len(relation_chunk_ids)}")
print(sorted(relation_chunk_ids))


🟢 Unique relation-chunk_ids = 6
[25138, 60079, 61140, 61147, 61148, 61150]


In [26]:
file_path = '/content/drive/MyDrive/HotpotQA_snapshot/all_docs_chunks_entities_relations_all.json'

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

target_ids = [25138, 60079, 61140, 61147, 61148, 61150]

data_by_id = {item['id']: item for item in data}

for tid in target_ids:
    item = data_by_id.get(tid)
    if item:
        print(f"\nID: {tid}\n")
        print(item.get('doc_chunk', 'doc_chunk موجود نیست'))
    else:
        print(f"\nID: {tid}   —   در داده‌ها یافت نشد.")



ID: 25138

Splash is a 1984 American fantasy romantic comedy film directed by Ron Howard, from a screenplay by Lowell Ganz, Babaloo Mandel, and Bruce Jay Friedman, and a story by Friedman and producer Brian Grazer, and starring Tom Hanks, Daryl Hannah, John Candy, and Eugene Levy. It involves a young man who falls in love with a mysterious woman who is secretly a mermaid. It was nominated for an Academy Award for Best Original Screenplay.The film is notable for being the first film released by Touchstone Pictures, a film label created by Walt Disney Studios that same year in an effort to release films targeted at adult audiences, with mature content not appropriate for the studio's flagship Walt Disney Pictures banner. Splash received a PG-rating for some profanity and brief nudity. Splash was critically and commercially successful, earning over $69 million on an $11 million budget, and received praise for the

ID: 60079

Mark Thomas Ciardi (pronounced CHAR-dee; born August 19, 1961) 

In [27]:
req_vars = [
    'questions', 'q_emb',                
    'ent_index', 'rel_index',            
    'KG_u',                              
    'ent_labels_full', 'rel_labels_full', 
    'rel_emb',                            
    'qwen_generate'                     
]
for v in req_vars:
    assert v in globals(), f"متغیر «{v}» موجود نیست؛ سلول‌های مقدماتی را اجرا کنید."
print("✓ همهٔ پیش‌نیازها در حافظه موجود است.")


✓ همهٔ پیش‌نیازها در حافظه موجود است.


In [28]:
import numpy as np, re, textwrap, collections, json
from pathlib import Path

def prompt_select_paths(question: str, entity: str, paths: list[str], k: int) -> str:
    numbered = "\n".join(f"{i+1}: {p}" for i, p in enumerate(paths))
    return textwrap.dedent(f"""
        **User Question**
        {question}

        **Task**
        You are navigating a heterogeneous knowledge-graph.
        Below are candidate one-hop *paths* that all start at the entity **{entity}**.
        Pick exactly **{k} paths** whose relation nodes are *most likely* to contain evidence
        or lead to the answer.

        **How to answer**
        • Return **only the line numbers** of the chosen paths, separated by commas,
          and wrap them inside `<answer>` … `</answer>`.
        • Do **not** output anything else.

        **Candidate paths**
        {numbered}

        <answer>
    """).strip()

def parse_llm_indices(raw: str) -> list[int]:
    m = re.search(r"<answer>(.*?)</answer>", raw, flags=re.I|re.S)
    if not m:
        return []
    return [int(x) for x in re.findall(r"\d+", m.group(1))]

split_path      = lambda p: [t.strip() for t in p.split("->")]
split_triplets  = lambda p: [(t[i], t[i+1], t[i+2]) for t in [split_path(p)] for i in range(0, len(t)-2, 2)]

def edge_chunk_ids(src_lbl: str, rel_lbl: str, dst_lbl: str) -> list[int]:
    s_idx, d_idx   = ent_labels_full.index(src_lbl),  ent_labels_full.index(dst_lbl)
    r_idx          = rel_labels_full.index(rel_lbl)
    data           = (KG_u.get_edge_data(s_idx, d_idx, key=r_idx) or
                      KG_u.get_edge_data(d_idx, s_idx, key=r_idx))
    return data.get("chunk_ids", []) if data else []

with (BASE_DIR / "all_docs_chunks_entities_relations_all.json").open(encoding="utf-8") as f:
    _chunks_by_id = {row["id"]: row["doc_chunk"] for row in json.load(f)}
print("✓ doc_chunk mapping در RAM:", len(_chunks_by_id), "چانک")


✓ doc_chunk mapping در RAM: 66237 چانک


In [29]:
def process_question(qid: int,
                     top_ne         = 3,   
                     per_entity_ne  = 3,   
                     hop1_keep_e    = 30, 
                     hop1_keep_llm  = 5,   
                     hop1_final     = 10,  

                     hop2_keep_e    = 30,
                     hop2_keep_llm  = 5,
                     hop2_final     = 10,

                     hop3_keep_e    = 30,
                     hop3_keep_llm  = 5,
                     hop3_final     = 4) -> dict:

    q_vec  = q_emb[qid:qid+1].astype("float32")
    q_text = questions[qid]
    local_entities  = full_data[qid].get("entities", []) or []

    _, I_top = ent_index.search(q_vec, top_ne)
    seed = [ent_labels_full[i] for i in I_top[0]]
    seed.extend(local_entities)
    for ent in local_entities:
        if ent in ent_labels_full:
            v  = ent_emb_full[ent_labels_full.index(ent):][:1]
            _, I_sim = ent_index.search(v, per_entity_ne)
            seed.extend([ent_labels_full[i] for i in I_sim[0]])
    seed = list(dict.fromkeys(seed))     

    valid = [(e, ent_labels_full.index(e))
             for e in seed
             if e in ent_labels_full and KG_u.degree(ent_labels_full.index(e)) > 0]
    if not valid:
        return {"Final_paths": [], "relation_chunk_ids": [], "evidence": []}

    entity_edges = {}
    for e_lbl, nid in valid:
        Edges = list(KG_u.edges(nid, keys=True, data=True))
        if not Edges:
            continue
        rel_idx = np.fromiter((d["emb_idx"] for *_, d in Edges), dtype=np.int32)
        sims    = (rel_emb[rel_idx] @ q_vec.T).ravel()
        top_k   = np.argpartition(-sims, min(hop1_keep_e, len(sims))-1)[:hop1_keep_e]
        chosen  = top_k[np.argsort(-sims[top_k])]
        paths   = []
        for idx in chosen:
            h, t, k, d = Edges[idx]
            src, dst   = ent_labels_full[h], ent_labels_full[t]
            if src != e_lbl:
                src, dst = dst, src
            paths.append(((h, t, k),
                          float(sims[idx]),
                          f"{src} -> {rel_labels_full[k]} -> {dst}"))
        entity_edges[e_lbl] = paths

    hop1_all = []
    for e_lbl, lst in entity_edges.items():
        p_list = [p for *_, p in lst]
        if len(p_list) > hop1_keep_llm:
            prompt = prompt_select_paths(q_text, e_lbl, p_list, hop1_keep_llm)
            resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
            idx    = parse_llm_indices(resp)
            p_list = [p_list[i-1] for i in idx if 1<=i<=len(p_list)] or p_list[:hop1_keep_llm]
        hop1_all.extend(p_list)

    if len(hop1_all) > hop1_final:
        prompt = prompt_select_paths(q_text, "combined hop-1", hop1_all, hop1_final)
        resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
        idx    = parse_llm_indices(resp)
        hop1_all = [hop1_all[i-1] for i in idx if 1<=i<=len(hop1_all)] or hop1_all[:hop1_final]

    edge_lookup = {p:e for lst in entity_edges.values() for (e,_,p) in lst}
    hop2_raw = []
    for path1 in hop1_all:
        h1, t1, k1 = edge_lookup[path1]
        e1, _, e2  = split_path(path1)[:3]
        nid2       = ent_labels_full.index(e2)
        inner_edges = [(h,t,k,d) for h,t,k,d in KG_u.edges(nid2, keys=True, data=True)
                       if not ({h,t}=={h1,t1} and k==k1)]
        if not inner_edges:
            hop2_raw.append(path1); continue
        rel_idx = np.fromiter((d["emb_idx"] for *_,d in inner_edges), dtype=np.int32)
        sims    = (rel_emb[rel_idx] @ q_vec.T).ravel()
        top_k   = np.argpartition(-sims, min(hop2_keep_e, len(sims))-1)[:hop2_keep_e]
        chosen  = top_k[np.argsort(-sims[top_k])]
        for idx in chosen:
            h,t,k,d = inner_edges[idx]
            src,dst = ent_labels_full[h], ent_labels_full[t]
            if src != e2: src,dst = dst,src
            hop2_raw.append(f"{e1} -> {rel_labels_full[k1]} -> {e2} "
                            f"-> {rel_labels_full[k]} -> {dst}")

    grp2 = collections.defaultdict(list)
    for p in hop2_raw:
        grp2[split_path(p)[2]].append(p)

    hop2_all = []
    for mid, lst in grp2.items():
        if len(lst) > hop2_keep_llm:
            prompt = prompt_select_paths(q_text, mid, lst, hop2_keep_llm)
            resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
            idx    = parse_llm_indices(resp)
            lst    = [lst[i-1] for i in idx if 1<=i<=len(lst)] or lst[:hop2_keep_llm]
        hop2_all.extend(lst)

    if len(hop2_all) > hop2_final:
        prompt = prompt_select_paths(q_text, "combined hop-2", hop2_all, hop2_final)
        resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
        idx    = parse_llm_indices(resp)
        hop2_all = [hop2_all[i-1] for i in idx if 1<=i<=len(hop2_all)] or hop2_all[:hop2_final]

    hop3_raw = []
    for p2 in hop2_all:
        toks = split_path(p2)
        if len(toks) < 5:
            hop3_raw.append(p2); continue
        e1,r1,e2,r2,e3 = toks[:5]
        nid2,nid3      = ent_labels_full.index(e2), ent_labels_full.index(e3)
        r2_idx         = rel_labels_full.index(r2)
        inner3 = [(h,t,k,d) for h,t,k,d in KG_u.edges(nid3, keys=True, data=True)
                  if not ({h,t}=={nid2,nid3} and k==r2_idx)]
        if not inner3:
            hop3_raw.append(p2); continue
        rel_idx = np.fromiter((d["emb_idx"] for *_,d in inner3), dtype=np.int32)
        sims    = (rel_emb[rel_idx] @ q_vec.T).ravel()
        top_k   = np.argpartition(-sims, min(hop3_keep_e, len(sims))-1)[:hop3_keep_e]
        chosen  = top_k[np.argsort(-sims[top_k])]
        for idx in chosen:
            h,t,k,d = inner3[idx]
            src,dst = ent_labels_full[h], ent_labels_full[t]
            if src != e3: src,dst = dst,src
            hop3_raw.append(f"{e1} -> {r1} -> {e2} -> {r2} -> {e3} "
                            f"-> {rel_labels_full[k]} -> {dst}")

    grp3 = collections.defaultdict(list)
    for p in hop3_raw:
        grp3[split_path(p)[4] if len(split_path(p))>=5 else split_path(p)[-1]].append(p)

    hop3_all = []
    for e3,lst in grp3.items():
        if len(lst) > hop3_keep_llm:
            prompt = prompt_select_paths(q_text, e3, lst, hop3_keep_llm)
            resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
            idx    = parse_llm_indices(resp)
            lst    = [lst[i-1] for i in idx if 1<=i<=len(lst)] or lst[:hop3_keep_llm]
        hop3_all.extend(lst)

    if len(hop3_all) > hop3_final:
        prompt = prompt_select_paths(q_text, "combined hop-3", hop3_all, hop3_final)
        resp   = qwen_generate([{"role":"user","content":prompt}], max_new_tokens=64)
        idx    = parse_llm_indices(resp)
        hop3_all = [hop3_all[i-1] for i in idx if 1<=i<=len(hop3_all)] or hop3_all[:hop3_final]

    chunk_ids = set()
    for p in hop3_all:
        for s,r,d in split_triplets(p):
            chunk_ids.update(edge_chunk_ids(s,r,d))
    evidence_texts = [_chunks_by_id[cid] for cid in chunk_ids]

    return {
        "Final_paths": hop3_all,
        "relation_chunk_ids": sorted(chunk_ids),
        "evidence": evidence_texts
    }


In [30]:
import copy, tqdm, json

with EVIDENCE_JSON.open(encoding="utf-8") as f:
    full_data = json.load(f)
assert len(full_data) == 100 == len(questions)

augmented = copy.deepcopy(full_data)

for qid in tqdm.tqdm(range(100), desc="Processing 100 questions"):
    result = process_question(qid)
    augmented[qid]["Final_paths"]        = result["Final_paths"]
    augmented[qid]["relation_chunk_ids"] = result["relation_chunk_ids"]
    augmented[qid]["evidence"]           = result["evidence"]

out_file = BASE_DIR / "evidence_hotpotqa_entities_with_paths.json"
with out_file.open("w", encoding="utf-8") as f:
    json.dump(augmented, f, ensure_ascii=False, indent=2)

print("✓ خروجی ذخیره شد →", out_file)


Processing 100 questions: 100%|██████████| 100/100 [26:15<00:00, 15.75s/it]

✓ خروجی ذخیره شد → /content/drive/MyDrive/HotpotQA_snapshot/evidence_hotpotqa_entities_with_paths.json



