# 4) KG Cleanup, Cross-Type Resolution & Multi-Country Schema

**Purpose:** take the Stage 3 v2 knowledge graph (1,748 nodes, 3,929 triples from entity-registry extraction) and produce a clean, country-tagged graph suitable for:
1. A queryable governance intelligence backend (RAG / structured queries)
2. Scalable integration of UK and Canadian corpora through the same schema
3. Presentation-ready outputs and statistics

**Pipeline:**
- §0 — Dependencies & configuration
- §1 — Load Stage 3 v2 outputs + integrity checks
- §2 — Generic-term pruning (remove uninformative hub nodes)
- §3 — Cross-type entity resolution (merge same concept across different types)
- §4 — Type correction (LLM-adjudicated)
- §5 — Country tagging + multi-country schema
- §6 — Final graph assembly + validation
- §7 — Export clean graph + backend-ready artefacts


## 0) Install dependencies
Run once per Colab runtime.

In [2]:
!pip -q install anthropic pandas numpy tqdm tenacity networkx matplotlib rapidfuzz sentence-transformers scikit-learn

import os, json, re, hashlib
from pathlib import Path
from collections import defaultdict, Counter
from itertools import combinations

import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm
from rapidfuzz import fuzz

print("✅ Dependencies installed.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/405.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.9/405.9 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m140.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m72.7 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Dependencies installed.


## 1) Configuration + API key
Reads `Anthropic_API_KEY` from **Colab Secrets** only. Model selection is centralised here.

In [3]:
# ─────────────────────────────────────────────
# 1) Configuration
# ─────────────────────────────────────────────
from google.colab import userdata

# API
ANTHROPIC_API_KEY = userdata.get("Anthropic_API_KEY")
MODEL = "claude-sonnet-4-5"  # All LLM calls in this notebook
LLM_TEMPERATURE = 0.0

# Paths — reads from Stage 3 v2 output
INPUT_DIR  = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2"
OUTPUT_DIR = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned"
CACHE_DIR  = str(Path(OUTPUT_DIR) / "cache")

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

# Country tag for this corpus
COUNTRY = "AU"
COUNTRY_FULL = "Australia"

# Schema (must match Stage 3)
ENTITY_TYPES = [
    "Instrument", "Authority", "Jurisdiction", "Requirement",
    "Practice", "MaterialAsset", "Stakeholder",
    "Barrier", "Enabler", "OutcomeMetric",
]

RELATION_TYPES = [
    "ISSUED_BY", "APPLIES_IN", "APPLIES_TO",
    "REQUIRES", "PROHIBITS", "REFERENCES",
    "INVOLVES", "PRODUCES", "BARRIERS", "ENABLES", "AFFECTS",
]

print(f"Config loaded. Country: {COUNTRY}")
print(f"Input:  {INPUT_DIR}")
print(f"Output: {OUTPUT_DIR}")


Config loaded. Country: AU
Input:  /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2
Output: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned


## 2) Load Stage 3 v2 outputs and run integrity checks

In [4]:
# ─────────────────────────────────────────────
# 2) Load nodes, edges, registry
# ─────────────────────────────────────────────
def require_columns(df, cols, name):
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"{name} missing columns: {missing}")

nodes_raw = pd.read_csv(os.path.join(INPUT_DIR, "nodes.csv"))
edges_raw = pd.read_csv(os.path.join(INPUT_DIR, "edges.csv"))
triples_raw = pd.read_csv(os.path.join(INPUT_DIR, "edge_triples.csv"))

require_columns(nodes_raw, ["node_id", "entity_type", "name", "name_norm"], "nodes")
require_columns(edges_raw, ["edge_id", "subject_id", "predicate", "object_id"], "edges")

# Load entity registry from Stage 3
with open(os.path.join(INPUT_DIR, "entity_registry.json")) as f:
    registry_entries = json.load(f)

# Load source chunks for evidence retrieval later
chunks_path = os.path.join(INPUT_DIR, "KG_records.jsonl")
kg_records = []
if os.path.exists(chunks_path):
    with open(chunks_path) as f:
        for line in f:
            if line.strip():
                kg_records.append(json.loads(line))

print(f"Stage 3 v2 loaded:")
print(f"  Nodes:    {len(nodes_raw)}")
print(f"  Edges:    {len(edges_raw)}")
print(f"  Triples:  {len(triples_raw)}")
print(f"  Registry: {len(registry_entries)} canonical entries")
print(f"  KG recs:  {len(kg_records)} chunk extractions")

print(f"\nNode type distribution:")
print(nodes_raw["entity_type"].value_counts().to_string())

# Build baseline graph
G_raw = nx.MultiDiGraph()
for _, row in nodes_raw.iterrows():
    G_raw.add_node(row["node_id"], **row.to_dict())
for _, row in edges_raw.iterrows():
    G_raw.add_edge(row["subject_id"], row["object_id"],
                   predicate=row["predicate"],
                   edge_id=row["edge_id"])

comps = list(nx.weakly_connected_components(G_raw))
largest = max(comps, key=len)
print(f"\nBaseline graph: {G_raw.number_of_nodes()} nodes, {G_raw.number_of_edges()} edges")
print(f"Components: {len(comps)}, largest: {len(largest)} ({100*len(largest)/G_raw.number_of_nodes():.1f}%)")


Stage 3 v2 loaded:
  Nodes:    1748
  Edges:    4529
  Triples:  3929
  Registry: 2637 canonical entries
  KG recs:  355 chunk extractions

Node type distribution:
entity_type
Practice         460
Instrument       261
MaterialAsset    254
Requirement      156
Stakeholder      143
Barrier          126
OutcomeMetric    120
Authority        100
Enabler           85
Jurisdiction      43

Baseline graph: 1748 nodes, 4529 edges
Components: 105, largest: 1576 (90.2%)


## 3) Generic-term pruning (remove uninformative hub nodes)
Entities like "Materials", "Waste", "Stakeholder", "Industry" are category-level abstractions,
not domain entities. They absorb edges that should connect to specific concepts and inflate
node degree without adding information. We flag and remove them.

**Strategy:** combine a curated exclusion list with degree-based outlier detection.
Entities pruned here are logged but permanently removed — they are noise, not signal.

In [5]:
# ─────────────────────────────────────────────
# 3a) Curated generic-term exclusion list
# ─────────────────────────────────────────────
# These are category-level or abstract terms that should not be graph entities.
# Criterion: the term names a CATEGORY of things rather than a specific thing.

GENERIC_TERMS = {
    # Abstract categories
    "materials", "waste", "stakeholders", "stakeholder", "industry",
    "infrastructure", "environment", "built environment", "construction",
    "government", "regulation", "regulations", "policy", "policies",
    "building", "buildings", "resources", "products", "services",
    "technology", "technologies", "innovation", "market", "markets",
    "community", "communities", "society", "sector", "economy",
    # Verb-fragment practice labels
    "reuse", "recycling", "reduce", "recovery", "repair",
    "redesign", "rethink", "reducing", "reusing", "recovering",
    # Over-broad composite terms
    "circular economy", "construction industry", "building sector",
    "built environment sector", "construction sector",
    "circular economy principles", "sustainability",
    "waste management", "resource management",
}

# Match against name_norm
nodes_work = nodes_raw.copy()
nodes_work["is_generic"] = nodes_work["name_norm"].isin(GENERIC_TERMS)

n_generic = nodes_work["is_generic"].sum()
print(f"Curated generic matches: {n_generic} nodes")
print(f"\nGeneric nodes to remove:")
generic_nodes = nodes_work[nodes_work["is_generic"]].sort_values("entity_type")
for _, row in generic_nodes.iterrows():
    deg = G_raw.degree(row["node_id"]) if row["node_id"] in G_raw else 0
    print(f"  [{row['entity_type']:15s}] {row['name']:45s} (degree {deg})")


Curated generic matches: 26 nodes

Generic nodes to remove:
  [Authority      ] Government                                    (degree 47)
  [Barrier        ] Waste                                         (degree 68)
  [Enabler        ] Circular Economy Principles                   (degree 104)
  [Enabler        ] Innovation                                    (degree 8)
  [Enabler        ] Markets                                       (degree 25)
  [Instrument     ] Circular Economy                              (degree 255)
  [Instrument     ] Regulations                                   (degree 26)
  [Jurisdiction   ] Built Environment                             (degree 14)
  [Jurisdiction   ] Construction Industry                         (degree 54)
  [MaterialAsset  ] Products                                      (degree 39)
  [MaterialAsset  ] Infrastructure                                (degree 47)
  [MaterialAsset  ] Materials                                     (degree 67)
  [

In [6]:
# ─────────────────────────────────────────────
# 3b) Degree-based outlier detection (catch remaining hubs)
# ─────────────────────────────────────────────
# Nodes with extremely high degree relative to their type are likely generic.
# We flag for review but only auto-remove those matching soft patterns.

degree_data = []
for _, row in nodes_work.iterrows():
    nid = row["node_id"]
    deg = G_raw.degree(nid) if nid in G_raw else 0
    degree_data.append({"node_id": nid, "name": row["name"], "type": row["entity_type"], "degree": deg})

deg_df = pd.DataFrame(degree_data)

# Per-type thresholds: flag if degree > mean + 3*std within type
flagged_hubs = []
for etype, group in deg_df.groupby("type"):
    mean_d = group["degree"].mean()
    std_d = group["degree"].std()
    threshold = mean_d + 3 * std_d
    if threshold < 10:
        threshold = 10  # minimum floor
    outliers = group[group["degree"] > threshold]
    for _, row in outliers.iterrows():
        if not nodes_work.loc[nodes_work["node_id"] == row["node_id"], "is_generic"].values[0]:
            flagged_hubs.append(row.to_dict())

if flagged_hubs:
    print(f"Degree outliers (not already in generic list) — REVIEW:")
    for h in sorted(flagged_hubs, key=lambda x: -x["degree"]):
        print(f"  [{h['type']:15s}] {h['name']:45s} (degree {h['degree']})")
else:
    print("No additional degree outliers found beyond generic list.")


Degree outliers (not already in generic list) — REVIEW:
  [OutcomeMetric  ] Material Reuse                                (degree 88)
  [MaterialAsset  ] Construction and Demolition Waste (C&D Waste) (degree 85)
  [Instrument     ] Design for Deconstruction (DfD)               (degree 79)
  [Enabler        ] Design for Disassembly (DfD)                  (degree 76)
  [Jurisdiction   ] Australia                                     (degree 76)
  [MaterialAsset  ] Material passports                            (degree 69)
  [OutcomeMetric  ] Material Recovery                             (degree 57)
  [Practice       ] Circular Design                               (degree 55)
  [MaterialAsset  ] Recycled Materials                            (degree 53)
  [Requirement    ] Disassembly                                   (degree 50)
  [Practice       ] Circular Economy Practices                    (degree 46)
  [Stakeholder    ] Contractors                                   (degree 40)
  [Barri

In [7]:
# ─────────────────────────────────────────────
# 3c) Apply generic pruning
# ─────────────────────────────────────────────
generic_ids = set(nodes_work.loc[nodes_work["is_generic"], "node_id"])

nodes_pruned = nodes_work[~nodes_work["is_generic"]].copy()
edges_pruned = edges_raw[
    ~edges_raw["subject_id"].isin(generic_ids) &
    ~edges_raw["object_id"].isin(generic_ids)
].copy()

# Rebuild triples
triples_pruned = triples_raw[
    ~triples_raw["subject_id"].isin(generic_ids) &
    ~triples_raw["object_id"].isin(generic_ids)
].copy()

print(f"After generic pruning:")
print(f"  Nodes: {len(nodes_raw)} → {len(nodes_pruned)} (removed {len(nodes_raw) - len(nodes_pruned)})")
print(f"  Edges: {len(edges_raw)} → {len(edges_pruned)} (removed {len(edges_raw) - len(edges_pruned)})")
print(f"  Triples: {len(triples_raw)} → {len(triples_pruned)} (removed {len(triples_raw) - len(triples_pruned)})")

# Save pruning log
pruning_log = generic_nodes[["node_id", "entity_type", "name"]].to_dict("records")
with open(os.path.join(OUTPUT_DIR, "generic_pruning_log.json"), "w") as f:
    json.dump(pruning_log, f, indent=2)
print(f"  Pruning log saved ({len(pruning_log)} entries)")


After generic pruning:
  Nodes: 1748 → 1722 (removed 26)
  Edges: 4529 → 3282 (removed 1247)
  Triples: 3929 → 2863 (removed 1066)
  Pruning log saved (26 entries)


## 4) Cross-type entity resolution
The original pipeline only merged entities within the same type. This step catches concepts
that were extracted as different types from different chunks — e.g.:
- "Design for Deconstruction" as Instrument AND "Design for Disassembly" as Enabler
- "Circular Economy" as Instrument AND Practice AND Enabler
- "Construction Industry" as Jurisdiction AND Stakeholder

**Strategy:** embed all remaining entities (ignoring type prefix), cluster by cosine similarity,
then send candidate cross-type clusters to Claude for adjudication.


In [8]:
# ─────────────────────────────────────────────
# 4a) Compute type-agnostic embeddings
# ─────────────────────────────────────────────
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

print("Loading embedding model...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed using ONLY the entity name (no type prefix) to catch cross-type matches
entity_names = nodes_pruned["name"].tolist()
entity_ids = nodes_pruned["node_id"].tolist()
entity_types = nodes_pruned["entity_type"].tolist()

print(f"Computing embeddings for {len(entity_names)} entities (type-agnostic)...")
embeddings = embed_model.encode(entity_names, show_progress_bar=True, batch_size=64)
print(f"Embeddings: {embeddings.shape} ✓")


Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing embeddings for 1722 entities (type-agnostic)...


Batches:   0%|          | 0/27 [00:00<?, ?it/s]

Embeddings: (1722, 384) ✓


In [9]:
# ─────────────────────────────────────────────
# 4b) Find cross-type candidate pairs (cosine ≥ 0.80)
# ─────────────────────────────────────────────
CROSS_TYPE_COSINE = 0.80  # Higher threshold since cross-type merges are riskier

# Also use token Jaccard as a secondary filter
def token_jaccard(a, b):
    ta = set(a.lower().split())
    tb = set(b.lower().split())
    if not ta or not tb:
        return 0.0
    return len(ta & tb) / len(ta | tb)

# Find cross-type pairs
cross_candidates = []
n = len(entity_names)

# Batch cosine computation per type-pair to manage memory
from itertools import combinations as type_combos

type_indices = defaultdict(list)
for i, t in enumerate(entity_types):
    type_indices[t].append(i)

for t1, t2 in type_combos(sorted(type_indices.keys()), 2):
    idx1 = type_indices[t1]
    idx2 = type_indices[t2]

    if not idx1 or not idx2:
        continue

    emb1 = embeddings[idx1]
    emb2 = embeddings[idx2]
    sim_matrix = cosine_similarity(emb1, emb2)

    for li, gi in enumerate(idx1):
        for lj, gj in enumerate(idx2):
            if sim_matrix[li, lj] >= CROSS_TYPE_COSINE:
                tj = token_jaccard(entity_names[gi], entity_names[gj])
                if tj >= 0.3:  # At least some lexical overlap
                    cross_candidates.append({
                        "idx_a": gi, "idx_b": gj,
                        "name_a": entity_names[gi], "name_b": entity_names[gj],
                        "type_a": entity_types[gi], "type_b": entity_types[gj],
                        "id_a": entity_ids[gi], "id_b": entity_ids[gj],
                        "cosine": float(sim_matrix[li, lj]),
                        "jaccard": tj,
                    })

# Also find within-type near-duplicates that Pass 2 missed
within_candidates = []
for etype, indices in type_indices.items():
    if len(indices) < 2:
        continue
    emb_sub = embeddings[indices]
    sim_sub = cosine_similarity(emb_sub)
    for li in range(len(indices)):
        for lj in range(li + 1, len(indices)):
            if sim_sub[li, lj] >= 0.85:  # Stricter for within-type
                gi, gj = indices[li], indices[lj]
                tj = token_jaccard(entity_names[gi], entity_names[gj])
                if tj >= 0.4:
                    within_candidates.append({
                        "idx_a": gi, "idx_b": gj,
                        "name_a": entity_names[gi], "name_b": entity_names[gj],
                        "type_a": entity_types[gi], "type_b": entity_types[gj],
                        "id_a": entity_ids[gi], "id_b": entity_ids[gj],
                        "cosine": float(sim_sub[li, lj]),
                        "jaccard": tj,
                    })

all_candidates = cross_candidates + within_candidates
print(f"Cross-type candidate pairs: {len(cross_candidates)}")
print(f"Within-type residual pairs: {len(within_candidates)}")
print(f"Total candidates for LLM adjudication: {len(all_candidates)}")

# Show top cross-type candidates
if cross_candidates:
    print(f"\nTop cross-type candidates:")
    for c in sorted(cross_candidates, key=lambda x: -x["cosine"])[:15]:
        print(f"  cos={c['cosine']:.3f} jac={c['jaccard']:.2f} | [{c['type_a']}] {c['name_a']}")
        print(f"  {'':36s}  ↔ [{c['type_b']}] {c['name_b']}")


Cross-type candidate pairs: 250
Within-type residual pairs: 49
Total candidates for LLM adjudication: 299

Top cross-type candidates:
  cos=1.000 jac=1.00 | [Instrument] As-Built Drawings
                                        ↔ [Practice] As-Built Drawings
  cos=1.000 jac=1.00 | [OutcomeMetric] Material Efficiency
                                        ↔ [Practice] Material Efficiency
  cos=1.000 jac=1.00 | [Practice] Life Cycle Assessment (LCA)
                                        ↔ [Requirement] Life Cycle Assessment (LCA)
  cos=1.000 jac=1.00 | [Enabler] Resource Efficiency
                                        ↔ [OutcomeMetric] Resource Efficiency
  cos=0.986 jac=0.33 | [Practice] Pre-Demolition Audits
                                        ↔ [Requirement] Pre-demolition audit
  cos=0.986 jac=0.50 | [Practice] Waste Management Plans
                                        ↔ [Requirement] Waste Management Plan
  cos=0.983 jac=0.75 | [OutcomeMetric] Transition to Circular Ec

In [10]:
# ─────────────────────────────────────────────
# 4c) Cluster candidates into groups (union-find)
# ─────────────────────────────────────────────

# Union-find to group transitive matches
parent = {}
def find(x):
    while parent.get(x, x) != x:
        parent[x] = parent.get(parent[x], parent[x])
        x = parent[x]
    return x

def union(a, b):
    ra, rb = find(a), find(b)
    if ra != rb:
        parent[ra] = rb

for c in all_candidates:
    union(c["idx_a"], c["idx_b"])

# Build clusters
clusters = defaultdict(set)
all_involved = set()
for c in all_candidates:
    all_involved.add(c["idx_a"])
    all_involved.add(c["idx_b"])

for idx in all_involved:
    clusters[find(idx)].add(idx)

# Filter to multi-member clusters
merge_clusters = {k: v for k, v in clusters.items() if len(v) >= 2}

print(f"Merge clusters: {len(merge_clusters)}")
print(f"Total entities in clusters: {sum(len(v) for v in merge_clusters.values())}")

# Show clusters
for i, (root, members) in enumerate(sorted(merge_clusters.items(), key=lambda x: -len(x[1]))):
    if i >= 20:
        print(f"  ... and {len(merge_clusters) - 20} more clusters")
        break
    member_info = [(entity_names[m], entity_types[m]) for m in members]
    print(f"  Cluster {i+1} ({len(members)} members):")
    for name, etype in sorted(member_info, key=lambda x: x[1]):
        print(f"    [{etype:15s}] {name}")


Merge clusters: 79
Total entities in clusters: 277
  Cluster 1 (34 members):
    [Barrier        ] Lack of Circular Economy Regulation
    [Barrier        ] Lack of regulation on circular economy
    [Barrier        ] Lack of Understanding of Circular Economy
    [Barrier        ] Barrier: Lack of Regulation on Circular Economy
    [Enabler        ] Circular Economy (CE)
    [Enabler        ] Circular Economy Business Models
    [Instrument     ] Designers for a Circular Economy
    [Instrument     ] Whole-of-Government Circular Economy Strategy
    [Instrument     ] 2024 National Circular Economy Framework
    [Instrument     ] ACT Circular Economy Strategy and Action Plan
    [Instrument     ] Basic level training in circular economy principles
    [Instrument     ] National Circular Economy Framework
    [Instrument     ] Circular Economy Act
    [Instrument     ] Circular Economy Action Plan
    [Instrument     ] Circular Economy How to Guide
    [Instrument     ] Circular Economy 

In [11]:
# ─────────────────────────────────────────────
# 4d) LLM adjudication of cross-type clusters
# ─────────────────────────────────────────────
import anthropic
from tenacity import retry, wait_exponential, stop_after_attempt

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

@retry(wait=wait_exponential(min=1, max=30), stop=stop_after_attempt(5))
def call_claude(system, user_msg, max_tokens=2048):
    resp = client.messages.create(
        model=MODEL, max_tokens=max_tokens, temperature=LLM_TEMPERATURE,
        system=system, messages=[{"role": "user", "content": user_msg}],
    )
    return resp.content[0].text

def parse_json_response(text):
    text = text.strip()
    if text.startswith("{") or text.startswith("["):
        try: return json.loads(text)
        except: pass
    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if m:
        try: return json.loads(m.group(1).strip())
        except: pass
    for ch in ["{", "["]:
        idx = text.find(ch)
        if idx >= 0:
            try: return json.loads(text[idx:])
            except: pass
    raise ValueError(f"Could not parse JSON from: {text[:200]}...")

# Cache
def load_cache(path):
    if not os.path.exists(path): return {}
    cache = {}
    with open(path) as f:
        for line in f:
            if line.strip():
                rec = json.loads(line)
                cache[rec["key"]] = rec["value"]
    return cache

def save_cache_entry(path, key, value):
    with open(path, "a") as f:
        f.write(json.dumps({"key": key, "value": value}, ensure_ascii=False) + "\n")

MERGE_CACHE = os.path.join(CACHE_DIR, "cross_type_merge.jsonl")
merge_cache = load_cache(MERGE_CACHE)

MERGE_SYSTEM = """You are an expert in building deconstruction governance, construction policy, and circular economy.

You will be given a cluster of entities that may refer to the same or related concepts,
potentially extracted under DIFFERENT entity types from different source documents.

YOUR TASK:
1. Identify which entities are genuinely the SAME concept (just extracted with different labels or types).
2. For each group of same-concept entities, pick the BEST canonical label and the CORRECT entity type.
3. Entities that are DISTINCT concepts should remain separate.

RULES:
- Be CONSERVATIVE: only merge entities that truly refer to the same real-world concept.
- When choosing the correct type, consider the entity's PRIMARY function in governance:
  * Instrument = a specific named policy, law, standard, guideline, framework, or code
  * Authority = a specific named organisation that creates/enforces instruments
  * Jurisdiction = a geographic/political area where instruments apply
  * Requirement = a specific obligation or mandate within an instrument
  * Practice = a specific action, method, or approach used in construction/deconstruction
  * MaterialAsset = a specific material, component, or physical resource
  * Stakeholder = a category of actor (person/org role) involved in the domain
  * Barrier = a specific obstacle or challenge
  * Enabler = a specific factor that facilitates or supports a practice
  * OutcomeMetric = a specific measurable result or KPI
- "Design for Deconstruction" is a Practice (it's something people DO), not an Instrument
- "Circular Economy" is NOT an entity in this domain — it's the overarching paradigm. If it appears, flag it for removal.
- Use title case. Include standard abbreviations in parentheses.
"""

MERGE_USER = """CLUSTER of {n} candidate entities:

{entity_list}

Return JSON:
{{
  "groups": [
    {{
      "canonical_label": "Best Name",
      "correct_type": "EntityType",
      "members": ["entity1 [OriginalType]", "entity2 [OriginalType]"],
      "reasoning": "why these are the same"
    }}
  ],
  "remove": ["entity names that are too generic and should be removed entirely"]
}}

Every input entity must appear in exactly one group OR in the remove list.
Return ONLY valid JSON.
"""

# Process clusters
merge_results = []
merge_errors = []

for cluster_idx, (root, members) in enumerate(tqdm(
    sorted(merge_clusters.items(), key=lambda x: -len(x[1])),
    desc="LLM merge adjudication"
)):
    cache_key = f"cluster_{cluster_idx}_{len(members)}"

    if cache_key in merge_cache:
        merge_results.append({"cluster_idx": cluster_idx, "members": list(members), **merge_cache[cache_key]})
        continue

    member_info = [(entity_names[m], entity_types[m], entity_ids[m]) for m in members]
    entity_list = "\n".join(
        f'- "{name}" [type: {etype}]' for name, etype, _ in member_info
    )

    try:
        raw = call_claude(
            system=MERGE_SYSTEM,
            user_msg=MERGE_USER.format(n=len(members), entity_list=entity_list),
        )
        result = parse_json_response(raw)
        save_cache_entry(MERGE_CACHE, cache_key, result)
        merge_cache[cache_key] = result
        merge_results.append({"cluster_idx": cluster_idx, "members": list(members), **result})
    except Exception as e:
        merge_errors.append({"cluster_idx": cluster_idx, "error": str(e)})
        print(f"  ✗ Cluster {cluster_idx}: {e}")

print(f"\nAdjudication complete: {len(merge_results)} clusters, {len(merge_errors)} errors")


LLM merge adjudication: 100%|██████████| 79/79 [07:05<00:00,  5.39s/it]


Adjudication complete: 79 clusters, 0 errors





In [12]:
# ─────────────────────────────────────────────
# 4e) Build merge map from LLM results
# ─────────────────────────────────────────────

# Build lookup: entity_name (lower) + entity_type -> node_id
name_type_to_id = {}
for i, row in nodes_pruned.iterrows():
    key = (row["name"].lower().strip(), row["entity_type"])
    name_type_to_id[key] = row["node_id"]
    # Also just by name for fuzzy matching
    name_type_to_id[(row["name"].lower().strip(),)] = row["node_id"]

# Parse merge results into: old_node_id -> (new_canonical_label, new_type)
merge_map = {}  # old_id -> canonical_id
canonical_updates = {}  # canonical_id -> {label, type}
remove_ids = set()

merge_log_entries = []

for result in merge_results:
    groups = result.get("groups", [])
    removes = result.get("remove", [])
    member_indices = result["members"]

    # Handle removals
    for rm_name in removes:
        rm_lower = rm_name.lower().strip()
        # Find matching node
        for idx in member_indices:
            if entity_names[idx].lower().strip() == rm_lower:
                remove_ids.add(entity_ids[idx])
                break

    # Handle groups
    for group in groups:
        canonical_label = group.get("canonical_label", "")
        correct_type = group.get("correct_type", "")
        group_members = group.get("members", [])

        if not canonical_label or correct_type not in ENTITY_TYPES:
            continue

        # Find all node_ids in this group
        group_node_ids = []
        for member_str in group_members:
            # Parse "entity name [OriginalType]" format
            m = re.match(r'^"?(.+?)"?\s*\[(?:type:\s*)?(.+?)\]$', member_str.strip())
            if m:
                m_name = m.group(1).strip()
                m_type = m.group(2).strip()
            else:
                m_name = member_str.strip().strip('"')
                m_type = None

            # Try to find matching node
            found = False
            for idx in member_indices:
                if entity_names[idx].lower().strip() == m_name.lower().strip():
                    group_node_ids.append(entity_ids[idx])
                    found = True
                    break

            if not found:
                # Fuzzy match
                for idx in member_indices:
                    if fuzz.ratio(entity_names[idx].lower(), m_name.lower()) > 85:
                        group_node_ids.append(entity_ids[idx])
                        found = True
                        break

        if len(group_node_ids) <= 1:
            # Single member or no matches — just update type if needed
            if group_node_ids:
                canonical_updates[group_node_ids[0]] = {
                    "canonical_label": canonical_label,
                    "correct_type": correct_type,
                }
            continue

        # Multi-member group: merge all into first node_id
        canonical_id = group_node_ids[0]
        canonical_updates[canonical_id] = {
            "canonical_label": canonical_label,
            "correct_type": correct_type,
        }

        for other_id in group_node_ids[1:]:
            if other_id != canonical_id:
                merge_map[other_id] = canonical_id
                merge_log_entries.append({
                    "merged_from": other_id,
                    "merged_into": canonical_id,
                    "canonical_label": canonical_label,
                    "correct_type": correct_type,
                    "reasoning": group.get("reasoning", ""),
                })

print(f"Merge map: {len(merge_map)} nodes will be merged")
print(f"Type/label updates: {len(canonical_updates)} nodes")
print(f"Flagged for removal: {len(remove_ids)} nodes")
print(f"\nMerge log entries:")
for entry in merge_log_entries[:20]:
    old_name = nodes_pruned.loc[nodes_pruned["node_id"] == entry["merged_from"], "name"].values
    new_name = entry["canonical_label"]
    old_name = old_name[0] if len(old_name) > 0 else "?"
    print(f"  {old_name:40s} → {new_name}")
if len(merge_log_entries) > 20:
    print(f"  ... and {len(merge_log_entries) - 20} more")


Merge map: 69 nodes will be merged
Type/label updates: 192 nodes
Flagged for removal: 11 nodes

Merge log entries:
  Circular Economy Practices               → Circular Economy Practices
  Circular Economy Transition              → Circular Economy Practices
  Basic level training in circular economy principles → Circular Economy Principles Training
  Lack of regulation on circular economy   → Lack of Circular Economy Regulation
  Barrier: Lack of Regulation on Circular Economy → Lack of Circular Economy Regulation
  2024 National Circular Economy Framework → National Circular Economy Framework
  National Circular Economy Framework      → National Circular Economy Framework
  Circular Economy Outcomes                → Transition to Circular Economy
  Circular Economy Implementation Plan     → Circular Economy Action Plan
  Circular Business Models                 → Circular Economy Business Models
  Circular Economy Strategies              → Circular Economy Policies
  Designers for a 

In [13]:
# ─────────────────────────────────────────────
# 4f) Apply merges + type corrections + removals
# ─────────────────────────────────────────────

# 1. Remove flagged generic nodes
nodes_clean = nodes_pruned[~nodes_pruned["node_id"].isin(remove_ids)].copy()
edges_clean = edges_pruned[
    ~edges_pruned["subject_id"].isin(remove_ids) &
    ~edges_pruned["object_id"].isin(remove_ids)
].copy()

# 2. Apply merges: remap node_ids in edge table
def remap_id(nid):
    return merge_map.get(nid, nid)

edges_clean["subject_id"] = edges_clean["subject_id"].map(remap_id)
edges_clean["object_id"] = edges_clean["object_id"].map(remap_id)

# Remove self-loops created by merges
before_loops = len(edges_clean)
edges_clean = edges_clean[edges_clean["subject_id"] != edges_clean["object_id"]]
print(f"Self-loops removed: {before_loops - len(edges_clean)}")

# Remove merged nodes from node table
merged_away = set(merge_map.keys())
nodes_clean = nodes_clean[~nodes_clean["node_id"].isin(merged_away)].copy()

# 3. Apply canonical label + type updates
for nid, updates in canonical_updates.items():
    mask = nodes_clean["node_id"] == nid
    if mask.any():
        nodes_clean.loc[mask, "name"] = updates["canonical_label"]
        nodes_clean.loc[mask, "entity_type"] = updates["correct_type"]
        nodes_clean.loc[mask, "name_norm"] = updates["canonical_label"].lower().strip()

# 4. Remove edges pointing to non-existent nodes
valid_ids = set(nodes_clean["node_id"])
edges_clean = edges_clean[
    edges_clean["subject_id"].isin(valid_ids) &
    edges_clean["object_id"].isin(valid_ids)
]

# Deduplicate edges
edges_clean = edges_clean.drop_duplicates(subset=["edge_id"])

print(f"\nAfter cross-type resolution:")
print(f"  Nodes: {len(nodes_pruned)} → {len(nodes_clean)} (removed {len(nodes_pruned) - len(nodes_clean)})")
print(f"  Edges: {len(edges_pruned)} → {len(edges_clean)} (removed {len(edges_pruned) - len(edges_clean)})")
print(f"\nNode type distribution:")
print(nodes_clean["entity_type"].value_counts().to_string())


Self-loops removed: 4

After cross-type resolution:
  Nodes: 1722 → 1642 (removed 80)
  Edges: 3282 → 3223 (removed 59)

Node type distribution:
entity_type
Practice         428
Instrument       245
MaterialAsset    245
Stakeholder      140
Requirement      136
Barrier          120
OutcomeMetric    115
Authority         98
Enabler           74
Jurisdiction      41


## 5) Country tagging + multi-country schema
Tag all entities and edges with `country: AU`. This prepares the graph for integration
with UK and Canadian corpora — same extraction pipeline, same schema, different country tag.
Cross-country queries become graph operations filtered by country.

In [14]:
# ─────────────────────────────────────────────
# 5a) Add country metadata to all records
# ─────────────────────────────────────────────
nodes_clean["country"] = COUNTRY
nodes_clean["country_full"] = COUNTRY_FULL
edges_clean["country"] = COUNTRY

# Also tag KG records
for rec in kg_records:
    rec["country"] = COUNTRY

print(f"Country tag '{COUNTRY}' applied to {len(nodes_clean)} nodes and {len(edges_clean)} edges")


Country tag 'AU' applied to 1642 nodes and 3223 edges


In [15]:
# ─────────────────────────────────────────────
# 5b) Define multi-country schema (for UK/CA integration)
# ─────────────────────────────────────────────
MULTI_COUNTRY_SCHEMA = {
    "version": "1.0",
    "description": "Circular Construction Governance Knowledge Graph — Multi-Country Schema",
    "countries": {
        "AU": {"full_name": "Australia", "status": "complete", "corpus_size": len(kg_records)},
        "UK": {"full_name": "United Kingdom", "status": "planned", "corpus_size": 0},
        "CA": {"full_name": "Canada", "status": "planned", "corpus_size": 0},
    },
    "entity_types": {t: {"description": ""} for t in ENTITY_TYPES},
    "relation_types": {t: {"description": ""} for t in RELATION_TYPES},
    "node_schema": {
        "required": ["node_id", "entity_type", "name", "name_norm", "country"],
        "optional": ["country_full", "is_registry", "chunk_count", "surface_forms",
                      "first_seen_chunk", "first_seen_source", "first_seen_page", "example_evidence"],
    },
    "edge_schema": {
        "required": ["edge_id", "subject_id", "predicate", "object_id", "country"],
        "optional": ["chunk_id", "source_file", "page_num", "judge_label",
                      "statement_confidence", "evidence_excerpt"],
    },
    "extraction_pipeline": {
        "method": "Three-pass LLM extraction with entity registry",
        "pass_1_model": "claude-haiku-4-5",
        "pass_2_model": "claude-sonnet-4-5",
        "pass_3_model": "claude-sonnet-4-5",
        "cleanup_model": "claude-sonnet-4-5",
    },
}

# Add entity type descriptions
TYPE_DESCRIPTIONS = {
    "Instrument": "Named policy, law, standard, guideline, framework, or code",
    "Authority": "Named organisation that creates, administers, or enforces instruments",
    "Jurisdiction": "Geographic or political area where instruments apply",
    "Requirement": "Specific obligation, mandate, or criterion within an instrument",
    "Practice": "Specific action, method, or approach in construction/deconstruction",
    "MaterialAsset": "Specific material, component, waste stream, or physical resource",
    "Stakeholder": "Category of actor (person/organisation role) in the domain",
    "Barrier": "Specific obstacle, challenge, or impediment",
    "Enabler": "Specific factor that facilitates or supports a practice",
    "OutcomeMetric": "Specific measurable result, target, or KPI",
}
for t, desc in TYPE_DESCRIPTIONS.items():
    MULTI_COUNTRY_SCHEMA["entity_types"][t]["description"] = desc

schema_path = os.path.join(OUTPUT_DIR, "multi_country_schema.json")
with open(schema_path, "w") as f:
    json.dump(MULTI_COUNTRY_SCHEMA, f, indent=2)
print(f"Multi-country schema saved: {schema_path}")


Multi-country schema saved: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned/multi_country_schema.json


## 6) Final graph assembly + validation
Build the definitive graph, compute summary statistics, and run quality checks.

In [16]:
# ─────────────────────────────────────────────
# 6a) Build final NetworkX graph
# ─────────────────────────────────────────────
G_final = nx.MultiDiGraph()

for _, row in nodes_clean.iterrows():
    G_final.add_node(row["node_id"], **{k: v for k, v in row.to_dict().items() if pd.notna(v)})

for _, row in edges_clean.iterrows():
    G_final.add_edge(
        row["subject_id"], row["object_id"],
        predicate=row["predicate"],
        edge_id=row["edge_id"],
        confidence=row.get("statement_confidence", 0.5),
    )

# Compute unique triples
triples_final = edges_clean.groupby(["subject_id", "predicate", "object_id"]).agg(
    support_count=("chunk_id", "nunique"),
    avg_confidence=("statement_confidence", "mean"),
    evidence_samples=("evidence_excerpt", lambda x: " | ".join(str(v) for v in x.head(3))),
).reset_index()

# Components
comps_final = list(nx.weakly_connected_components(G_final))
largest_final = max(comps_final, key=len)

# Isolates
isolates = [n for n in G_final.nodes() if G_final.degree(n) == 0]

print(f"{'='*60}")
print(f"FINAL GRAPH SUMMARY")
print(f"{'='*60}")
print(f"Nodes:             {G_final.number_of_nodes()}")
print(f"Edges (all):       {G_final.number_of_edges()}")
print(f"Unique triples:    {len(triples_final)}")
print(f"Components:        {len(comps_final)}")
print(f"Largest component: {len(largest_final)} ({100*len(largest_final)/G_final.number_of_nodes():.1f}%)")
print(f"Isolate nodes:     {len(isolates)}")
print(f"Country:           {COUNTRY}")


FINAL GRAPH SUMMARY
Nodes:             1642
Edges (all):       3223
Unique triples:    2788
Components:        219
Largest component: 1339 (81.5%)
Isolate nodes:     174
Country:           AU


In [17]:
# ─────────────────────────────────────────────
# 6b) Quality checks: remaining near-duplicates
# ─────────────────────────────────────────────
print("=== Residual Near-Duplicate Check ===")

suspect_count = 0
for etype in ENTITY_TYPES:
    type_nodes = nodes_clean[nodes_clean["entity_type"] == etype]["name"].tolist()
    if len(type_nodes) < 2:
        continue

    suspects = []
    for a, b in combinations(type_nodes, 2):
        tj = token_jaccard(a, b)
        if tj >= 0.65:
            suspects.append((a, b, tj))

    if suspects:
        suspect_count += len(suspects)
        print(f"\n  [{etype}] {len(suspects)} suspects:")
        for a, b, j in sorted(suspects, key=lambda x: -x[2])[:5]:
            print(f"    {j:.2f} | \"{a}\" ↔ \"{b}\"")
        if len(suspects) > 5:
            print(f"    ... +{len(suspects)-5} more")

if suspect_count == 0:
    print("  No residual near-duplicates detected ✓")
else:
    print(f"\n  Total suspects: {suspect_count} (acceptable if < 20)")


=== Residual Near-Duplicate Check ===

  [Instrument] 4 suspects:
    0.71 | "National End-of-Waste Decision for Recycled Aggregates" ↔ "Single-Case End-of-Waste Decision for Recycled Aggregates"
    0.67 | "Building Codes" ↔ "State Building Codes"
    0.67 | "Landfill Levy" ↔ "Landfill Levy Exemptions"
    0.67 | "Trends in Solutions 2024: Material" ↔ "Trends in Solutions 2024: Reuse"

  [Authority] 1 suspects:
    0.80 | "Australian State and Territory Governments" ↔ "State and Territory Governments"

  [Requirement] 4 suspects:
    0.67 | "Acceptance Criteria" ↔ "Material Acceptance Criteria"
    0.67 | "Acceptance Criteria" ↔ "Reuse Acceptance Criteria"
    0.67 | "Circular Design Principles" ↔ "Design Principles"
    0.67 | "Construction GPP criteria" ↔ "GPP Criteria"

  [Practice] 9 suspects:
    1.00 | "Design for Disassembly (DfD)" ↔ "Design for Disassembly (DfD)"
    0.67 | "Material Tracking" ↔ "Material Flow Tracking"
    0.67 | "Circular Economy Practices" ↔ "Circular Pract

In [18]:
# ─────────────────────────────────────────────
# 6c) Node type distribution + degree statistics
# ─────────────────────────────────────────────
print("=== Entity Type Distribution ===")
type_dist = nodes_clean["entity_type"].value_counts()
print(type_dist.to_string())

print(f"\n=== Relation Distribution ===")
rel_dist = edges_clean["predicate"].value_counts()
print(rel_dist.to_string())

print(f"\n=== Degree Statistics ===")
degrees = dict(G_final.degree())
deg_vals = list(degrees.values())
print(f"  Mean degree:   {np.mean(deg_vals):.1f}")
print(f"  Median degree: {np.median(deg_vals):.0f}")
print(f"  Max degree:    {max(deg_vals)}")

print(f"\n=== Top 20 Most Connected Nodes ===")
top_nodes = sorted(degrees.items(), key=lambda x: -x[1])[:20]
for nid, deg in top_nodes:
    nd = G_final.nodes[nid]
    print(f"  {deg:4d} | [{nd.get('entity_type',''):15s}] {nd.get('name','')}")

print(f"\n=== Edge Support Distribution ===")
print(triples_final["support_count"].describe().to_string())

multi_support = triples_final[triples_final["support_count"] >= 2]
print(f"\nTriples with support ≥ 2: {len(multi_support)} ({100*len(multi_support)/len(triples_final):.1f}%)")

print(f"\n=== Comparison ===")
print(f"  Stage 3 v1:       2,180 nodes, 2,026 edges, 471 components")
print(f"  Stage 3 v2 raw:   1,748 nodes, 3,929 triples, 105 components")
print(f"  Stage 4 cleaned:  {G_final.number_of_nodes()} nodes, {len(triples_final)} triples, {len(comps_final)} components")


=== Entity Type Distribution ===
entity_type
Practice         428
Instrument       245
MaterialAsset    245
Stakeholder      140
Requirement      136
Barrier          120
OutcomeMetric    115
Authority         98
Enabler           74
Jurisdiction      41

=== Relation Distribution ===
predicate
INVOLVES      750
ENABLES       569
REQUIRES      432
APPLIES_TO    322
PRODUCES      278
AFFECTS       257
BARRIERS      184
APPLIES_IN    174
ISSUED_BY     133
REFERENCES    116
PROHIBITS       8

=== Degree Statistics ===
  Mean degree:   3.9
  Median degree: 2
  Max degree:    131

=== Top 20 Most Connected Nodes ===
   131 | [Practice       ] Design for Disassembly (DfD)
    81 | [MaterialAsset  ] Construction and Demolition (C&D) Waste
    75 | [OutcomeMetric  ] Material Reuse Rate
    64 | [Practice       ] Circular Design
    63 | [Jurisdiction   ] Australia
    62 | [MaterialAsset  ] Material passports
    57 | [Practice       ] Building Disassembly
    51 | [OutcomeMetric  ] Material R

## 7) Export clean graph + backend-ready artefacts
Saves everything needed for the backend assembly (Notebook 5):
- Clean node/edge tables with country tags
- Aggregated triple table with support counts
- Multi-country schema
- Complete merge audit trail
- Source chunks with evidence for RAG retrieval

In [19]:
# ─────────────────────────────────────────────
# 7) Save all outputs
# ─────────────────────────────────────────────

# Core graph files
nodes_clean.to_csv(os.path.join(OUTPUT_DIR, "nodes_clean.csv"), index=False)
edges_clean.to_csv(os.path.join(OUTPUT_DIR, "edges_clean.csv"), index=False)
triples_final.to_csv(os.path.join(OUTPUT_DIR, "triples_clean.csv"), index=False)

# KG records (with country tag) for RAG evidence retrieval
kg_path = os.path.join(OUTPUT_DIR, "KG_records_tagged.jsonl")
with open(kg_path, "w") as f:
    for rec in kg_records:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")

# Merge audit trail
merge_audit = {
    "generic_pruned": len(generic_ids),
    "cross_type_merged": len(merge_map),
    "type_corrected": len(canonical_updates),
    "additionally_removed": len(remove_ids),
    "merge_entries": merge_log_entries,
}
with open(os.path.join(OUTPUT_DIR, "merge_audit.json"), "w") as f:
    json.dump(merge_audit, f, indent=2, ensure_ascii=False)

# Isolate nodes list (for potential removal in backend)
if isolates:
    isolate_info = [{"node_id": n, "name": G_final.nodes[n].get("name", ""),
                      "type": G_final.nodes[n].get("entity_type", "")} for n in isolates]
    with open(os.path.join(OUTPUT_DIR, "isolate_nodes.json"), "w") as f:
        json.dump(isolate_info, f, indent=2)

print(f"{'='*60}")
print(f"STAGE 4 — ALL OUTPUTS SAVED")
print(f"{'='*60}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"  nodes_clean.csv         ({len(nodes_clean)} rows)")
print(f"  edges_clean.csv         ({len(edges_clean)} rows)")
print(f"  triples_clean.csv       ({len(triples_final)} rows)")
print(f"  KG_records_tagged.jsonl  ({len(kg_records)} rows)")
print(f"  multi_country_schema.json")
print(f"  merge_audit.json")
print(f"  generic_pruning_log.json")
if isolates:
    print(f"  isolate_nodes.json       ({len(isolates)} isolates)")
print(f"\nThis graph is ready for backend assembly (Notebook 5).")


STAGE 4 — ALL OUTPUTS SAVED
Output directory: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned
  nodes_clean.csv         (1642 rows)
  edges_clean.csv         (3223 rows)
  triples_clean.csv       (2788 rows)
  KG_records_tagged.jsonl  (355 rows)
  multi_country_schema.json
  merge_audit.json
  generic_pruning_log.json
  isolate_nodes.json       (174 isolates)

This graph is ready for backend assembly (Notebook 5).
