In [1]:

# List the GPU(s) attached to this runtime (informational only).
# NOTE(review): nothing in the visible cells uses the GPU — confirm it is needed.
!nvidia-smi -L

# Mount Google Drive at /content/drive; the data paths configured later in this
# notebook point into that mount.
from google.colab import drive
drive.mount('/content/drive')


GPU 0: NVIDIA L4 (UUID: GPU-af3496cf-f50f-c72f-add3-6d87096c9e91)
Mounted at /content/drive


In [2]:
#2
!pip install -q networkx tqdm


In [3]:
#3
from pathlib import Path

# Root folder on the mounted Drive; every file used by this notebook lives under it.
BASE_DIR = Path("/content/drive/MyDrive/HotpotQA_snapshot")

# Input: JSON file with the per-chunk entities and relation strings.
JSON_PATH = BASE_DIR.joinpath("all_docs_chunks_entities_relations_all.json")

# Inputs: one vocabulary item per line; the line order defines the integer ids.
ENT_ORDER_PATH = BASE_DIR.joinpath("unique_entities_ordered.txt")
REL_ORDER_PATH = BASE_DIR.joinpath("unique_relations_ordered.txt")

# Output: where the finished graph is written.
GRAPH_PKL_PATH = BASE_DIR.joinpath("hotpotqa_kg_v2.gpickle")


In [4]:
#4
def load_ordered(path: Path) -> list[str]:
    """Read a one-item-per-line vocabulary file, preserving line order.

    Args:
        path: Text file to read, decoded as UTF-8.

    Returns:
        The lines of the file in order, with the trailing newline removed.
        Lines that are empty or whitespace-only are skipped; other leading and
        trailing whitespace is kept as-is.
    """
    # The codec name is spelled with a plain ASCII hyphen so that it does not
    # depend on the codec registry's lenient normalisation of exotic dashes.
    with path.open(encoding="utf-8") as f:
        return [ln.rstrip("\n") for ln in f if ln.strip()]

# Ordered vocabularies: the position of an item in its list is its integer id.
ordered_entities, ordered_relations = (
    load_ordered(vocab_path) for vocab_path in (ENT_ORDER_PATH, REL_ORDER_PATH)
)

# Reverse lookups (text -> position). If a text occurs on several lines, the
# last position wins, so a lookup can be smaller than its source list.
entity2idx   = dict(zip(ordered_entities,  range(len(ordered_entities))))
relation2idx = dict(zip(ordered_relations, range(len(ordered_relations))))

print(f"entities : {len(entity2idx):,}")
print(f"relations: {len(relation2idx):,}")


entities : 557,825
relations: 139,253


In [5]:
#5
import re, unicodedata, itertools
_WS_RE       = re.compile(r"\s+", re.UNICODE)
QUOTE_CHARS  = '"\'“”‘’´‛❝❞❮❯⹂〝〞«»‚„”″′˝'
PUNCT_TAIL   = ',.;:!?☞\u2014\u2013-'

def _strip_outer_quotes(t: str) -> str:
    return t[1:-1] if len(t) >= 2 and t[0] in QUOTE_CHARS and t[-1] in QUOTE_CHARS else t

def normalize_ent(txt: str) -> str:
    s = txt.strip()
    while s and s[0] in QUOTE_CHARS: s = s[1:]
    while s and s[-1] in QUOTE_CHARS: s = s[:-1]
    s = _strip_outer_quotes(s).strip(PUNCT_TAIL + " ")
    s = unicodedata.normalize("NFKC", s).casefold()
    return _WS_RE.sub(" ", s).strip()

def clean_rel(middle: str) -> str:
    r = middle.strip()
    while r and r[0] in QUOTE_CHARS: r = r[1:]
    while r and r[-1] in QUOTE_CHARS: r = r[:-1]
    r = _strip_outer_quotes(r).strip(PUNCT_TAIL + " ")
    r = unicodedata.normalize("NFKC", r).casefold()
    r = _WS_RE.sub(" ", r).strip()
    return r.replace(" ", "_")


In [6]:
#6
import json, collections, tqdm

print("⏳ loading JSON …")
# The codec name uses a plain ASCII hyphen so that opening the file does not
# depend on the codec registry's lenient normalisation of exotic dashes.
with JSON_PATH.open(encoding="utf-8") as f:
    chunks = json.load(f)

# Accumulator for one node: the entity types seen for it and the ids of the
# chunks that mention it (both are sets that are mutated in place).
NodeInfo = collections.namedtuple("NodeInfo", "types chunk_ids")
# entity index -> NodeInfo
node_tmp  : dict[int, NodeInfo]          = {}
# (head index, relation index, tail index) -> ids of the chunks stating that triple
edge_tmp  : dict[tuple[int,int,int], set[int]] = collections.defaultdict(set)

# missing_ent counts BOTH entity mentions and relation strings whose head/tail is
# absent from entity2idx; missing_rel counts relation strings whose relation text
# is absent from relation2idx.
missing_ent, missing_rel = 0, 0
# Relation strings that do not have the shape "head -> relation -> tail".
malformed_rel = 0

for ch in tqdm.tqdm(chunks, desc="scan"):
    cid   = ch["id"]
    ents  = ch.get("entities", [])
    rels  = ch.get("relations", [])

    # Entity mentions: record the type and the chunk on the matching node.
    for ent in ents:
        e_norm = normalize_ent(ent["text"])
        idx    = entity2idx.get(e_norm)
        if idx is None:
            missing_ent += 1
            continue
        ni = node_tmp.setdefault(idx, NodeInfo(set(), set()))
        ni.types.add(ent["type"])
        ni.chunk_ids.add(cid)

    # Relation strings: "head -> relation words -> tail".
    for rel_raw in rels:
        parts = [p.strip() for p in rel_raw.split("->") if p.strip()]
        # At least three non-empty pieces are required. With fewer (e.g. "a->->b"
        # or "->->x") there is no relation text, and indexing parts[1] would
        # either reuse the tail as the relation or raise IndexError.
        if len(parts) < 3:
            malformed_rel += 1
            continue
        head_norm, tail_norm = normalize_ent(parts[0]), normalize_ent(parts[-1])
        # Everything between the first and last piece is the relation phrase.
        middle_raw           = "_".join(parts[1:-1])
        rel_norm             = clean_rel(middle_raw)

        h_idx = entity2idx.get(head_norm)
        t_idx = entity2idx.get(tail_norm)
        r_idx = relation2idx.get(rel_norm)

        if None in (h_idx, t_idx):
            missing_ent += 1
            continue
        if r_idx is None:
            missing_rel += 1
            continue

        # Endpoints seen only in relations still become nodes (without types).
        node_tmp.setdefault(h_idx, NodeInfo(set(), set())).chunk_ids.add(cid)
        node_tmp.setdefault(t_idx, NodeInfo(set(), set())).chunk_ids.add(cid)

        edge_tmp[(h_idx, r_idx, t_idx)].add(cid)

print(f"⭕ missing entities in lookup : {missing_ent}")
print(f"⭕ missing relations in lookup: {missing_rel}")
print(f"⭕ malformed relation strings : {malformed_rel}")
print(f"nodes collected : {len(node_tmp):,}")
print(f"edges collected : {len(edge_tmp):,}")


⏳ loading JSON …


scan: 100%|██████████| 66237/66237 [00:23<00:00, 2858.02it/s]

⭕ missing entities in lookup : 97263
⭕ missing relations in lookup: 4
nodes collected : 557,821
edges collected : 1,075,644





In [7]:
#7
import networkx as nx

# Directed multigraph: several relations may connect the same ordered node pair.
KG = nx.MultiDiGraph()

# Nodes are keyed by entity index; `types` is None when no type was recorded.
for idx, info in tqdm.tqdm(node_tmp.items(), desc="add nodes"):
    node_attrs = {
        "label": ordered_entities[idx],
        "emb_idx": idx,
        "types": sorted(info.types) if info.types else None,
        "chunk_ids": sorted(info.chunk_ids),
    }
    KG.add_node(idx, **node_attrs)

# Edges use the relation index as the multigraph key, so each
# (head, relation, tail) triple is stored exactly once.
for (h, r, t), cid_set in tqdm.tqdm(edge_tmp.items(), desc="add edges"):
    edge_attrs = {
        "relation": ordered_relations[r],
        "emb_idx": r,
        "chunk_ids": sorted(cid_set),
    }
    KG.add_edge(h, t, key=r, **edge_attrs)

print(f"✔️ graph: nodes={KG.number_of_nodes():,}  |  edges={KG.number_of_edges():,}")


add nodes: 100%|██████████| 557821/557821 [00:03<00:00, 165578.90it/s]
add edges: 100%|██████████| 1075644/1075644 [00:09<00:00, 112930.19it/s]


✔️ graph: nodes=557,821  |  edges=1,075,644


In [8]:
#9
# Nodes whose attribute dict is empty (e.g. created implicitly by add_edge).
missing_attr = [node for node, attrs in KG.nodes(data=True) if len(attrs) == 0]

# Give those nodes the same attribute set as regular nodes, with no types and
# no chunk references.
for n in missing_attr:
    KG.nodes[n].update({
        "label": ordered_entities[n],
        "emb_idx": n,
        "types": None,
        "chunk_ids": [],
    })

print("⚠️ nodes with empty dict fixed →", len(missing_attr))


⚠️ nodes with empty dict fixed → 0


In [9]:
#10
import gzip, pickle, time, os

t0 = time.time()

# The file is a gzip-compressed pickle even though its name ends in ".gpickle";
# read it back with gzip.open(...) + pickle.load(...).
with gzip.open(GRAPH_PKL_PATH, mode="wb") as f:
    pickle.dump(KG, f, protocol=pickle.HIGHEST_PROTOCOL)

elapsed_s = time.time() - t0
print(f"💾 saved → {GRAPH_PKL_PATH}   |   {elapsed_s:.1f}s")

size_mb = os.path.getsize(GRAPH_PKL_PATH) / 1e6
print(f"size ≈ {size_mb:.1f} MB")


💾 saved → /content/drive/MyDrive/HotpotQA_snapshot/hotpotqa_kg_v2.gpickle   |   33.9s
size ≈ 36.8 MB


In [10]:
#11
import random, pprint

def sample_node(min_edges: int = 5, max_trials: int = 1000):
    """Pick a random node of the global graph `KG` with enough incident edges.

    Args:
        min_edges: Minimum total degree (in-degree + out-degree) required.
        max_trials: Number of random draws before giving up.

    Returns:
        A `(node, degree)` tuple for the first drawn node that qualifies.

    Raises:
        RuntimeError: If no qualifying node was drawn within `max_trials` draws.
    """
    # Materialise the node list once; rebuilding it on every draw would copy
    # the whole node set up to `max_trials` times.
    candidates = list(KG.nodes)
    for _ in range(max_trials):
        n = random.choice(candidates)
        deg = KG.out_degree(n) + KG.in_degree(n)
        if deg >= min_edges:
            return n, deg
    raise RuntimeError("no node with required degree found")

# Draw one sufficiently connected node and show its attributes.
nid, deg = sample_node()
print(f"🎲 node id={nid} | total_degree={deg}")
pprint.pprint(KG.nodes[nid])

print("\n🔗 edges involving this node:")
# Outgoing edges first, then incoming; at most 10 edges are listed per direction.
edge_views = {
    "out": KG.out_edges(nid, keys=True, data=True),
    "in ": KG.in_edges(nid,  keys=True, data=True),
}
for direction, edges in edge_views.items():
    print(f"  {direction}-edges: {len(edges)}")
    for h, t, k, data in itertools.islice(edges, 10):
        print(f"    {ordered_entities[h]}  --{ordered_relations[k]}-->  {ordered_entities[t]}  | chunks={len(data['chunk_ids'])}")


🎲 node id=428304 | total_degree=8
{'chunk_ids': [12142,
               22669,
               22776,
               24862,
               24863,
               32979,
               37351,
               45197,
               57423,
               57424],
 'emb_idx': 428304,
 'label': 'silesia',
 'types': ['GPE', 'LOC']}

🔗 edges involving this node:
  out-edges: 2
    silesia  --located_in-->  oder–neisse line  | chunks=1
    silesia  --present_day-->  wrocław  | chunks=1
  in -edges: 6
    maria szraiber  --born_in-->  silesia  | chunks=1
    german  --displaced-->  silesia  | chunks=1
    unrest  --expanded_into-->  silesia  | chunks=1
    luther  --spoke_out_against-->  silesia  | chunks=1
    frank  --located_in-->  silesia  | chunks=1
    breslau  --located_in-->  silesia  | chunks=2


In [11]:
#12
def sample_edge(max_trials: int = 1000):
    """Pick a random edge of the global graph `KG` that carries attribute data.

    Args:
        max_trials: Number of random draws before giving up.

    Returns:
        A `(head, tail, key, data)` tuple, where `data` is the edge's
        attribute dict.

    Raises:
        RuntimeError: If no edge with a non-empty attribute dict was drawn
            within `max_trials` draws.
    """
    # Materialise the edge list once; rebuilding it on every draw would copy
    # every edge of the graph up to `max_trials` times.
    candidates = list(KG.edges(keys=True))
    for _ in range(max_trials):
        h, t, k = random.choice(candidates)
        data    = KG.get_edge_data(h, t, k)
        if data:
            return h, t, k, data
    raise RuntimeError("no edge with data found")

# Draw one edge and print it as "head --relation--> tail" plus its raw attributes.
h, t, k, edata = sample_edge()
print("🎲 random edge")
head_label, tail_label = ordered_entities[h], ordered_entities[t]
print(f"{head_label}  --{edata['relation']}-->  {tail_label}")
pprint.pprint(edata)


🎲 random edge
independencia  --border_with-->  the los olivos district
{'chunk_ids': [60838], 'emb_idx': 13974, 'relation': 'border_with'}


In [12]:
# Distinct normalised head/tail texts from relation strings that have no entry
# in entity2idx.
unique_missing = set()

for ch in tqdm.tqdm(chunks, desc="analyse missing"):
    # .get() mirrors the scan step, which tolerates chunks without "relations".
    for rel_raw in ch.get("relations", []):
        if rel_raw.count("->") < 2:
            continue
        parts = [p.strip() for p in rel_raw.split("->") if p.strip()]
        if not parts:
            # Only separators and whitespace: there is no head or tail to check.
            continue
        heads, tails = parts[0], parts[-1]

        for raw_side in (heads, tails):
            norm_side = normalize_ent(raw_side)
            if norm_side not in entity2idx:
                unique_missing.add(norm_side)

print("🔎  Unique missing entities:", len(unique_missing))   # original author's note: should be 4
# Sorted so the printed sample is the same on every run (set order is arbitrary).
for e in sorted(unique_missing)[:10]:
    print(" •", repr(e))


analyse missing: 100%|██████████| 66237/66237 [00:06<00:00, 10181.88it/s]

🔎  Unique missing entities: 1
نمونه‌ها:
 • ''





In [13]:
# Total number of raw relation strings over all chunks (.get() mirrors the scan
# step, which tolerates chunks without a "relations" key).
total_rel_strings = sum(len(ch.get("relations", [])) for ch in chunks)
print("کل رشته‌های relation :", total_rel_strings)

# NOTE: missing_ent is incremented by the scan step for unmatched entity
# mentions as well as for relation strings with an unmatched endpoint, so this
# ratio is an upper-bound estimate of the skipped relation strings.
skipped  = missing_ent
# Number of distinct (head, relation, tail) triples that became edges.
kept     = len(edge_tmp)
skipped_ratio = skipped / total_rel_strings if total_rel_strings else 0.0
print(f"{kept:,} یال ساخته شده | skipped ≈ {skipped_ratio:.1%} از کل strings")


کل رشته‌های relation : 1235425
یال ساخته شده | skipped ≈ 7.9% از کل strings


In [14]:
# Count nodes whose `types` attribute is missing, None, or an empty list.
empty_types_count = len(
    [node for node, attrs in KG.nodes(data=True) if not attrs.get('types')]
)
print(f"Nodes with empty or None types: {empty_types_count}")


Nodes with empty or None types: 271238
