# Stage 3 v2 — Schema-Guided KG Extraction with Entity Registry

**Three-pass extraction pipeline** for building deconstruction governance knowledge graph.

| Pass | Purpose | Model | Est. Cost |
|------|---------|-------|-----------|
| **1 — Discovery** | Fast entity/relation extraction from all chunks | Claude Haiku 4.5 | ~\$2 |
| **2 — Consolidation** | Cluster & canonicalise entities into master registry | Claude Sonnet 4.5 | ~\$5 |
| **3 — Grounded extraction** | Re-extract with registry in context | Claude Sonnet 4.5 | ~\$18 |

**Why this approach?** The original Stage 3 extracted independently per chunk (≤12 entities each), producing 2,180 nodes with massive redundancy. Post-hoc cleaning in Stage 4 only removed 7.2%. This pipeline prevents duplicates at source by maintaining a canonical entity registry that the LLM references during extraction.

**Target outcome:** A clean graph of ~400–800 nodes with consistent entity naming, suitable for producing readable paper figures without elaborate post-hoc filtering.


## 0 — Environment Setup

In [2]:
# Install dependencies (uncomment if needed)
# !pip install -q anthropic pandas tqdm tenacity sentence-transformers scikit-learn networkx
!pip install anthropic

import os, json, re, hashlib, time
from pathlib import Path
from collections import defaultdict, Counter

import pandas as pd
import numpy as np
from tqdm import tqdm
from tenacity import retry, wait_exponential, stop_after_attempt


Collecting anthropic
  Downloading anthropic-0.79.0-py3-none-any.whl.metadata (28 kB)
Downloading anthropic-0.79.0-py3-none-any.whl (405 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/405.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m399.4/405.9 kB[0m [31m20.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.9/405.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.79.0


## 1 — Configuration

In [4]:
# ─────────────────────────────────────────────
# INPUT / OUTPUT
# ─────────────────────────────────────────────
INPUT_JSONL = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/batch_enhanced_KG_outputs/outputs/kept_chunks/MASTER_kept_judged.jsonl"

OUTPUT_DIR  = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2"
CACHE_DIR   = str(Path(OUTPUT_DIR) / "cache")

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

# ─────────────────────────────────────────────
# Judge labels
# ─────────────────────────────────────────────
INCLUDE_JUDGE_LABELS = {"keep", "maybe"}
JUDGE_CONFIDENCE = {"keep": 1.0, "maybe": 0.6}

# ─────────────────────────────────────────────
# Claude API
# ─────────────────────────────────────────────
from google.colab import userdata
ANTHROPIC_API_KEY = userdata.get("Anthropic_API_KEY")

# Models
MODEL_DISCOVERY     = "claude-haiku-4-5"      # Pass 1: fast + cheap
MODEL_CONSOLIDATION = "claude-sonnet-4-5"     # Pass 2: smart clustering
MODEL_EXTRACTION    = "claude-sonnet-4-5"     # Pass 3: quality extraction

LLM_TEMPERATURE = 0.0  # Reproducibility
MAX_RETRIES = 5

# ─────────────────────────────────────────────
# Schema
# ─────────────────────────────────────────────
ENTITY_TYPES = [
    "Instrument", "Authority", "Jurisdiction", "Requirement",
    "Practice", "MaterialAsset", "Stakeholder",
    "Barrier", "Enabler", "OutcomeMetric",
]

RELATION_TYPES = [
    "ISSUED_BY", "APPLIES_IN", "APPLIES_TO",
    "REQUIRES", "PROHIBITS", "REFERENCES",
    "INVOLVES", "PRODUCES", "BARRIERS", "ENABLES", "AFFECTS",
]

print(f"Config loaded. Output → {OUTPUT_DIR}")


Config loaded. Output → /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2


## 2 — API Client & Helper Functions

In [5]:
import anthropic

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# Test connection
try:
    resp = client.messages.create(
        model=MODEL_DISCOVERY,
        max_tokens=20,
        messages=[{"role": "user", "content": "Say OK"}],
    )
    print(f"✓ Claude API connected. Test response: {resp.content[0].text}")
except Exception as e:
    print(f"✗ Connection failed: {e}")


✓ Claude API connected. Test response: OK


In [6]:
# ─────────────────────────────────────────────
# Helper functions
# ─────────────────────────────────────────────
def sha1_16(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()[:16]

def norm_text(s: str) -> str:
    s = (s or "").strip().lower()
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"[^\w\s\-\/\.\(\)]", "", s)
    return s.strip()

def make_entity_id(entity_type: str, name: str) -> str:
    return f"ent:{sha1_16(entity_type + '|' + norm_text(name))}"

def make_edge_id(sub_id: str, pred: str, obj_id: str, chunk_id: str) -> str:
    return f"rel:{sha1_16(sub_id + '|' + pred + '|' + obj_id + '|' + str(chunk_id))}"

def safe_get_judge_label(rec: dict):
    j = rec.get("judge")
    if isinstance(j, dict):
        return j.get("label")
    return None

def evidence_offsets(text: str, excerpt: str):
    if not text or not excerpt:
        return None, None
    idx = text.find(excerpt)
    if idx != -1:
        return idx, idx + len(excerpt)
    return None, None

def load_cache(path):
    """Load JSONL cache file."""
    if not os.path.exists(path):
        return {}
    cache = {}
    with open(path, "r") as f:
        for line in f:
            if line.strip():
                rec = json.loads(line)
                cache[rec["key"]] = rec["value"]
    return cache

def save_cache_entry(path, key, value):
    """Append one entry to JSONL cache."""
    with open(path, "a") as f:
        f.write(json.dumps({"key": key, "value": value}, ensure_ascii=False) + "\n")

@retry(wait=wait_exponential(min=1, max=30), stop=stop_after_attempt(MAX_RETRIES))
def call_claude(model, system, user_msg, max_tokens=4096):
    """Generic Claude API call with retry."""
    resp = client.messages.create(
        model=model,
        max_tokens=max_tokens,
        temperature=LLM_TEMPERATURE,
        system=system,
        messages=[{"role": "user", "content": user_msg}],
    )
    return resp.content[0].text

def parse_json_response(text):
    """Extract JSON from Claude response, handling markdown fences."""
    # Try direct parse first
    text = text.strip()
    if text.startswith("{") or text.startswith("["):
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            pass
    # Try extracting from markdown code fence
    m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", text, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1).strip())
        except json.JSONDecodeError:
            pass
    # Last resort: find first { or [ and parse from there
    for start_char in ["{", "["]:
        idx = text.find(start_char)
        if idx >= 0:
            try:
                return json.loads(text[idx:])
            except json.JSONDecodeError:
                pass
    raise ValueError(f"Could not parse JSON from: {text[:200]}...")

print("Helpers loaded ✓")


Helpers loaded ✓


## 3 — Load Judged Chunks

In [7]:
def load_judged_chunks(path: str):
    kept = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            rec = json.loads(line)
            label = safe_get_judge_label(rec)
            if label not in INCLUDE_JUDGE_LABELS:
                continue
            rec["_judge_label"] = label
            rec["_judge_confidence"] = float(JUDGE_CONFIDENCE.get(label, 0.5))
            kept.append(rec)
    return kept

chunks = load_judged_chunks(INPUT_JSONL)
print(f"Loaded {len(chunks)} chunks (labels: {sorted(INCLUDE_JUDGE_LABELS)})")

# Basic stats
labels = Counter(c["_judge_label"] for c in chunks)
sources = Counter(c["source_file"] for c in chunks)
lengths = [len(c["text"]) for c in chunks]
print(f"  Labels: {dict(labels)}")
print(f"  Source files: {len(sources)}")
print(f"  Text length: min={min(lengths)}, median={sorted(lengths)[len(lengths)//2]}, max={max(lengths)}")


Loaded 357 chunks (labels: ['keep', 'maybe'])
  Labels: {'maybe': 205, 'keep': 152}
  Source files: 36
  Text length: min=101, median=1324, max=1399


## 4 — Pass 1: Discovery Extraction

**Goal:** Quickly extract candidate entities and relations from all chunks using a fast model. No registry — just capture what's in the text.

Uses **Claude Haiku 4.5** for speed and cost-efficiency (~357 calls × ~\$0.005 = ~\$2).


In [8]:
# ─────────────────────────────────────────────
# Pass 1 system prompt — broad discovery
# ─────────────────────────────────────────────
P1_SYSTEM = f"""You extract a knowledge graph from governance/policy text about building deconstruction,
secondary materials reuse/recycling, and circular economy in the built environment.

ENTITY TYPES (use ONLY these): {ENTITY_TYPES}
RELATION TYPES (use ONLY these): {RELATION_TYPES}

RULES:
- Extract ALL distinct entities mentioned in the text (aim for completeness).
- Every entity needs: name, type, evidence_excerpt (10-35 word verbatim quote).
- Every relation needs: subject, predicate, object, evidence_excerpt (10-35 word verbatim quote).
- Both subject and object of a relation must appear in your entity list.
- Prefer SPECIFIC names over generic terms (e.g. "National Construction Code" not "the code").
- Use CONSISTENT naming: if an entity has a formal name, always use that form.
- If nothing can be reliably extracted, return empty lists.
""".strip()

P1_USER = """TEXT CHUNK:
{chunk_text}

Return JSON with:
1) "entities": list of {{"name": str, "type": str, "evidence_excerpt": str}}
2) "relations": list of {{"subject": str, "predicate": str, "object": str, "evidence_excerpt": str}}

Extract generously — we will consolidate duplicates later. Aim for completeness.
Return ONLY valid JSON.
"""

def validate_p1_output(data):
    """Validate and filter Pass 1 extraction output."""
    ents = data.get("entities", []) or []
    rels = data.get("relations", []) or []

    valid_ents = []
    for e in ents:
        if (isinstance(e, dict)
            and isinstance(e.get("name"), str) and e["name"].strip()
            and e.get("type") in ENTITY_TYPES
            and isinstance(e.get("evidence_excerpt"), str) and e["evidence_excerpt"].strip()):
            valid_ents.append(e)

    valid_rels = []
    ent_names = {e["name"].strip() for e in valid_ents}
    for r in rels:
        if (isinstance(r, dict)
            and isinstance(r.get("subject"), str) and r["subject"].strip() in ent_names
            and r.get("predicate") in RELATION_TYPES
            and isinstance(r.get("object"), str) and r["object"].strip() in ent_names
            and isinstance(r.get("evidence_excerpt"), str) and r["evidence_excerpt"].strip()):
            valid_rels.append(r)

    return {"entities": valid_ents, "relations": valid_rels}

print("Pass 1 prompts ready ✓")


Pass 1 prompts ready ✓


In [9]:
# ─────────────────────────────────────────────
# Run Pass 1 (with caching)
# ─────────────────────────────────────────────
P1_CACHE = os.path.join(CACHE_DIR, "pass1_discovery.jsonl")
p1_cache = load_cache(P1_CACHE)

p1_results = []  # list of {chunk_id, entities, relations}
p1_errors  = []

for rec in tqdm(chunks, desc="Pass 1 — Discovery"):
    chunk_id = rec["chunk_id"]
    chunk_text = rec["text"].strip()

    # Check cache
    if chunk_id in p1_cache:
        p1_results.append({"chunk_id": chunk_id, **p1_cache[chunk_id]})
        continue

    try:
        raw = call_claude(
            model=MODEL_DISCOVERY,
            system=P1_SYSTEM,
            user_msg=P1_USER.format(chunk_text=chunk_text),
            max_tokens=4096,
        )
        data = parse_json_response(raw)
        validated = validate_p1_output(data)

        # Cache
        save_cache_entry(P1_CACHE, chunk_id, validated)
        p1_cache[chunk_id] = validated
        p1_results.append({"chunk_id": chunk_id, **validated})

    except Exception as e:
        p1_errors.append({"chunk_id": chunk_id, "error": str(e)})
        print(f"  ✗ {chunk_id}: {e}")

print(f"\nPass 1 complete: {len(p1_results)} chunks processed, {len(p1_errors)} errors")


Pass 1 — Discovery:  10%|█         | 36/357 [05:04<56:12, 10.51s/it]

  ✗ 27.Engineering-for-Australias-Circular-Economy-White-Paper-Download::c25::0ade648bd4ba4314: Could not parse JSON from: ```json
{
  "entities": [
    {
      "name": "Circular Design & Construction",
      "type": "Pillar",
      "evidence_excerpt": "Circular Design & Construction Design for disassembly, modular & pref...


Pass 1 — Discovery: 100%|██████████| 357/357 [40:20<00:00,  6.78s/it]


Pass 1 complete: 356 chunks processed, 1 errors





In [10]:
# ─────────────────────────────────────────────
# Pass 1 statistics
# ─────────────────────────────────────────────
all_p1_entities = []
for r in p1_results:
    for e in r["entities"]:
        all_p1_entities.append({
            "name": e["name"].strip(),
            "type": e["type"],
            "name_norm": norm_text(e["name"]),
            "chunk_id": r["chunk_id"],
        })

all_p1_relations = []
for r in p1_results:
    for rel in r["relations"]:
        all_p1_relations.append({
            "subject": rel["subject"].strip(),
            "predicate": rel["predicate"],
            "object": rel["object"].strip(),
            "chunk_id": r["chunk_id"],
        })

p1_ent_df = pd.DataFrame(all_p1_entities)
p1_rel_df = pd.DataFrame(all_p1_relations)

print(f"Pass 1 raw extraction:")
print(f"  Total entity mentions: {len(p1_ent_df)}")
print(f"  Unique (name_norm, type) pairs: {p1_ent_df.groupby(['name_norm','type']).ngroups}")
print(f"  Total relation mentions: {len(p1_rel_df)}")
print(f"\nEntity type distribution:")
print(p1_ent_df["type"].value_counts().to_string())
print(f"\nRelation type distribution:")
print(p1_rel_df["predicate"].value_counts().to_string())

# Show most frequent entities (the ones needing consolidation)
freq = p1_ent_df.groupby(["name_norm", "type"]).size().reset_index(name="chunk_count")
freq = freq.sort_values("chunk_count", ascending=False)
print(f"\nTop 20 most frequent entities (appear in most chunks):")
print(freq.head(20).to_string(index=False))


Pass 1 raw extraction:
  Total entity mentions: 5703
  Unique (name_norm, type) pairs: 3614
  Total relation mentions: 3116

Entity type distribution:
type
Practice         2030
MaterialAsset     865
Instrument        542
Stakeholder       517
Requirement       420
OutcomeMetric     365
Barrier           320
Enabler           256
Authority         203
Jurisdiction      185

Relation type distribution:
predicate
ENABLES       700
REQUIRES      493
INVOLVES      464
BARRIERS      300
PRODUCES      299
APPLIES_TO    284
APPLIES_IN    190
AFFECTS       145
ISSUED_BY     129
REFERENCES     91
PROHIBITS      21

Top 20 most frequent entities (appear in most chunks):
                        name_norm          type  chunk_count
                 circular economy      Practice           46
                        australia  Jurisdiction           44
                   material reuse      Practice           42
                        recycling      Practice           35
                          

## 5 — Pass 2: Entity Consolidation

**Goal:** Cluster the raw Pass 1 entities into canonical concepts and build a master entity registry.

**Steps:**
1. Compute sentence embeddings for all unique entities
2. Block candidates by type + embedding similarity (cosine ≥ 0.75)
3. Send candidate clusters to Claude Sonnet for adjudication
4. Build master entity registry with canonical labels


In [11]:
# ─────────────────────────────────────────────
# Step 2a: Build unique entity pool from Pass 1
# ─────────────────────────────────────────────
# Group by (name_norm, type) and collect all surface forms + chunk evidence
entity_pool = defaultdict(lambda: {"surface_forms": set(), "chunk_ids": set(), "type": None})

for e in all_p1_entities:
    key = (e["name_norm"], e["type"])
    entity_pool[key]["surface_forms"].add(e["name"])
    entity_pool[key]["chunk_ids"].add(e["chunk_id"])
    entity_pool[key]["type"] = e["type"]

# Convert to list
unique_entities = []
for (name_norm, etype), info in entity_pool.items():
    unique_entities.append({
        "name_norm": name_norm,
        "type": etype,
        "surface_forms": sorted(info["surface_forms"]),
        "chunk_count": len(info["chunk_ids"]),
        "representative": sorted(info["surface_forms"])[0],  # alphabetical first
    })

print(f"Unique entity candidates: {len(unique_entities)}")
print(f"By type:")
type_counts = Counter(e["type"] for e in unique_entities)
for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
    print(f"  {t}: {c}")


Unique entity candidates: 3614
By type:
  Practice: 1173
  MaterialAsset: 491
  Instrument: 395
  Requirement: 344
  Barrier: 282
  Stakeholder: 272
  OutcomeMetric: 267
  Enabler: 202
  Authority: 126
  Jurisdiction: 62


In [12]:
# ─────────────────────────────────────────────
# Step 2b: Compute embeddings for clustering
# ─────────────────────────────────────────────
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load a lightweight model (runs on CPU, ~100MB)
print("Loading embedding model...")
embed_model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed all unique entities (type-prefixed for disambiguation)
entity_texts = [f"{e['type']}: {e['representative']}" for e in unique_entities]
print(f"Computing embeddings for {len(entity_texts)} entities...")
embeddings = embed_model.encode(entity_texts, show_progress_bar=True, batch_size=64)

print(f"Embeddings shape: {embeddings.shape} ✓")


Loading embedding model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Computing embeddings for 3614 entities...


Batches:   0%|          | 0/57 [00:00<?, ?it/s]

Embeddings shape: (3614, 384) ✓


In [13]:
# ─────────────────────────────────────────────
# Step 2c: Block candidates by type + cosine similarity
# ─────────────────────────────────────────────
COSINE_THRESHOLD = 0.72  # Tuned for catching synonyms without false positives

# Group by entity type
type_to_indices = defaultdict(list)
for i, e in enumerate(unique_entities):
    type_to_indices[e["type"]].append(i)

# Find candidate pairs within each type
candidate_clusters = []  # list of sets of indices

from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist

for etype, indices in type_to_indices.items():
    if len(indices) < 2:
        continue

    sub_embeddings = embeddings[indices]

    # Hierarchical clustering
    if len(indices) == 2:
        sim = cosine_similarity(sub_embeddings)[0, 1]
        if sim >= COSINE_THRESHOLD:
            candidate_clusters.append(set(indices))
        continue

    dists = pdist(sub_embeddings, metric="cosine")  # 1 - cosine_similarity
    Z = linkage(dists, method="average")
    labels = fcluster(Z, t=1.0 - COSINE_THRESHOLD, criterion="distance")

    # Group indices by cluster label
    cluster_map = defaultdict(set)
    for local_idx, cluster_label in enumerate(labels):
        cluster_map[cluster_label].add(indices[local_idx])

    # Only keep clusters with 2+ members (these need consolidation)
    for cluster_indices in cluster_map.values():
        if len(cluster_indices) >= 2:
            candidate_clusters.append(cluster_indices)

print(f"Found {len(candidate_clusters)} candidate clusters needing consolidation")
print(f"Total entities in clusters: {sum(len(c) for c in candidate_clusters)}")
print(f"Singletons (no merge needed): {len(unique_entities) - sum(len(c) for c in candidate_clusters)}")

# Show some example clusters
print(f"\nExample clusters:")
for i, cluster in enumerate(sorted(candidate_clusters, key=len, reverse=True)[:10]):
    members = [unique_entities[idx]["representative"] for idx in cluster]
    etype = unique_entities[list(cluster)[0]]["type"]
    print(f"  [{etype}] {members}")


Found 584 candidate clusters needing consolidation
Total entities in clusters: 2630
Singletons (no merge needed): 984

Example clusters:
  [Stakeholder] ['Businesses', 'Consultants', 'Large companies', 'Professional Bodies and Industry Leaders', 'Policymakers', 'local community', 'Startups', 'Stockists', 'External Stakeholders', 'Community', 'Key Actors', 'Policy Makers', 'Cost Consultant', 'industry stakeholders', 'Building users', 'Practitioners', 'Member states', 'Internal Stakeholders', 'Business', 'Private Sector', 'Stakeholders', 'End-users', 'Client', 'organizational decision-makers', 'Regulators', 'Procurers', 'Organizations', 'Industry and standards bodies', 'Client champion', 'Governments', 'Individuals', 'Receiving stakeholder', 'Providers', 'Stakeholder', 'Service Providers', 'Material receiving stakeholder', 'Clients', 'Regulator', 'Customers', 'Government Projects', 'Investors', 'Stakeholder receiving material', 'Users and managers', 'Government', 'Senior Management', 'In

In [14]:
# ─────────────────────────────────────────────
# Step 2d: LLM adjudication of candidate clusters
# ─────────────────────────────────────────────
P2_SYSTEM = """You are an expert in building deconstruction governance, construction policy, and circular economy.

Your task is to consolidate candidate entity clusters. For each cluster of similar entities, decide:
1. Which entities are truly the SAME concept (synonyms, abbreviations, grammatical variants)?
2. What is the single best CANONICAL LABEL for each distinct concept?

RULES:
- Be CONSERVATIVE: only merge entities that genuinely refer to the same thing.
- "Design for Deconstruction" and "Design for Disassembly" = SAME concept → canonical: "Design for Deconstruction (DfD)"
- "National Construction Code" and "NCC" = SAME concept → canonical: "National Construction Code (NCC)"
- "waste levy" and "landfill levy" = DIFFERENT concepts (may overlap but distinct instruments) → keep separate
- Preserve important distinctions between jurisdictions, instruments, practices.
- Use title case for canonical labels.
- Include common abbreviations in parentheses where standard.
"""

P2_USER = """Here is a cluster of {n} candidate {etype} entities that may refer to the same or overlapping concepts:

{entity_list}

For this cluster, return JSON with:
{{
  "groups": [
    {{
      "canonical_label": "Best Name for This Concept",
      "members": ["entity1", "entity2"],
      "reasoning": "Brief explanation"
    }}
  ]
}}

RULES:
- Every input entity must appear in exactly one group.
- If an entity is DISTINCT from all others, put it in its own group with just itself.
- Only group entities that truly refer to the SAME concept.
- Return ONLY valid JSON.
"""

P2_CACHE = os.path.join(CACHE_DIR, "pass2_consolidation.jsonl")
p2_cache = load_cache(P2_CACHE)

# Batch small clusters together for efficiency
BATCH_SIZE = 8  # clusters per API call for small clusters

registry_entries = []  # Will hold: {canonical_label, type, member_norms, chunk_count}
merge_log = []

# Process each cluster
for cluster_idx, cluster in enumerate(tqdm(candidate_clusters, desc="Pass 2 — Consolidation")):
    members = [unique_entities[idx] for idx in cluster]
    etype = members[0]["type"]

    cache_key = f"cluster_{cluster_idx}_{etype}_{len(members)}"

    if cache_key in p2_cache:
        result = p2_cache[cache_key]
    else:
        entity_list = "\n".join(
            f"- \"{m['representative']}\" (appears in {m['chunk_count']} chunks, variants: {m['surface_forms']})"
            for m in members
        )

        try:
            raw = call_claude(
                model=MODEL_CONSOLIDATION,
                system=P2_SYSTEM,
                user_msg=P2_USER.format(n=len(members), etype=etype, entity_list=entity_list),
                max_tokens=2048,
            )
            result = parse_json_response(raw)
            save_cache_entry(P2_CACHE, cache_key, result)
            p2_cache[cache_key] = result
        except Exception as e:
            print(f"  ✗ Cluster {cluster_idx}: {e}")
            # Fallback: keep each entity as its own canonical
            result = {"groups": [
                {"canonical_label": m["representative"], "members": [m["representative"]], "reasoning": "fallback"}
                for m in members
            ]}

    # Process groups
    for group in result.get("groups", []):
        canonical = group["canonical_label"]
        group_members = group.get("members", [])

        # Find matching unique_entities by name
        member_norms = set()
        total_chunks = 0
        for m_name in group_members:
            m_norm = norm_text(m_name)
            for m in members:
                if m["name_norm"] == m_norm or m["representative"].lower() == m_name.lower():
                    member_norms.add(m["name_norm"])
                    total_chunks += m["chunk_count"]

        if not member_norms:
            # Fuzzy fallback
            member_norms = {norm_text(m_name) for m_name in group_members}
            total_chunks = 1

        registry_entries.append({
            "canonical_label": canonical,
            "type": etype,
            "member_norms": sorted(member_norms),
            "all_surface_forms": group_members,
            "chunk_count": total_chunks,
        })

        if len(group_members) > 1:
            merge_log.append({
                "canonical": canonical,
                "type": etype,
                "merged": group_members,
                "reasoning": group.get("reasoning", ""),
            })

print(f"\nConsolidation complete:")
print(f"  Registry entries from clusters: {len(registry_entries)}")
print(f"  Merges performed: {len(merge_log)}")


Pass 2 — Consolidation: 100%|██████████| 584/584 [59:24<00:00,  6.10s/it]


Consolidation complete:
  Registry entries from clusters: 1653
  Merges performed: 596





In [15]:
# ─────────────────────────────────────────────
# Step 2e: Add singletons (entities not in any cluster)
# ─────────────────────────────────────────────
clustered_indices = set()
for cluster in candidate_clusters:
    clustered_indices.update(cluster)

for i, e in enumerate(unique_entities):
    if i not in clustered_indices:
        # This entity had no close neighbours — it's already canonical
        registry_entries.append({
            "canonical_label": e["representative"],
            "type": e["type"],
            "member_norms": [e["name_norm"]],
            "all_surface_forms": e["surface_forms"],
            "chunk_count": e["chunk_count"],
        })

print(f"Total registry entries (clusters + singletons): {len(registry_entries)}")
print(f"\nBy type:")
reg_type_counts = Counter(e["type"] for e in registry_entries)
for t, c in sorted(reg_type_counts.items(), key=lambda x: -x[1]):
    print(f"  {t}: {c}")

# Verify: every unique_entity name_norm should map to exactly one registry entry
norm_to_canonical = {}
for entry in registry_entries:
    for mn in entry["member_norms"]:
        if mn in norm_to_canonical:
            print(f"  ⚠ Duplicate mapping: '{mn}' → '{norm_to_canonical[mn]}' AND '{entry['canonical_label']}'")
        norm_to_canonical[mn] = entry["canonical_label"]

coverage = sum(1 for e in unique_entities if e["name_norm"] in norm_to_canonical)
print(f"\nRegistry coverage: {coverage}/{len(unique_entities)} unique entities mapped ({100*coverage/len(unique_entities):.1f}%)")


Total registry entries (clusters + singletons): 2637

By type:
  Practice: 807
  MaterialAsset: 340
  Instrument: 320
  Requirement: 274
  Barrier: 208
  OutcomeMetric: 188
  Stakeholder: 185
  Enabler: 160
  Authority: 105
  Jurisdiction: 50
  ⚠ Duplicate mapping: 'supply chain' → 'Supply Chain' AND 'Supply Chain'
  ⚠ Duplicate mapping: 'public projects' → 'Major Public Works' AND 'Public Projects'
  ⚠ Duplicate mapping: 'material banks' → 'Buildings as Material Banks (BAMB)' AND 'Material Banks'
  ⚠ Duplicate mapping: 'construction and demolition sector' → 'Construction and Demolition Sector' AND 'Construction Industry'
  ⚠ Duplicate mapping: 'construction industry' → 'Construction Industry' AND 'Construction Industry'
  ⚠ Duplicate mapping: 'construction sector' → 'Construction Industry' AND 'Construction Industry'
  ⚠ Duplicate mapping: 'government projects' → 'Government Projects' AND 'Government Projects'
  ⚠ Duplicate mapping: 'built environment sector' → 'Built Environment Sect

In [16]:
# ─────────────────────────────────────────────
# Step 2f: Build master entity registry
# ─────────────────────────────────────────────
# Create lookup: norm_text → (canonical_label, type)
ENTITY_REGISTRY = {}
for entry in registry_entries:
    canonical = entry["canonical_label"]
    etype = entry["type"]
    for mn in entry["member_norms"]:
        ENTITY_REGISTRY[mn] = {"canonical": canonical, "type": etype}
    # Also map the canonical label's norm to itself
    ENTITY_REGISTRY[norm_text(canonical)] = {"canonical": canonical, "type": etype}
    # Map all surface forms
    for sf in entry.get("all_surface_forms", []):
        ENTITY_REGISTRY[norm_text(sf)] = {"canonical": canonical, "type": etype}

# Save registry
registry_path = os.path.join(OUTPUT_DIR, "entity_registry.json")
with open(registry_path, "w") as f:
    json.dump(registry_entries, f, indent=2, ensure_ascii=False)

# Save merge log
merge_log_path = os.path.join(OUTPUT_DIR, "merge_log.json")
with open(merge_log_path, "w") as f:
    json.dump(merge_log, f, indent=2, ensure_ascii=False)

print(f"Entity registry: {len(registry_entries)} canonical entities")
print(f"Lookup table: {len(ENTITY_REGISTRY)} surface forms → canonical")
print(f"Saved to: {registry_path}")
print(f"Merge log: {merge_log_path} ({len(merge_log)} merges)")


Entity registry: 2637 canonical entities
Lookup table: 3624 surface forms → canonical
Saved to: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2/entity_registry.json
Merge log: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2/merge_log.json (596 merges)


## 6 — Pass 3: Grounded Re-Extraction

**Goal:** Re-process each chunk with the entity registry in context. The LLM should:
1. Map mentions to existing canonical entities from the registry
2. Only create NEW entities if they're genuinely not in the registry
3. Extract relations using canonical entity names

This eliminates duplicate entities at source.


In [17]:
# ─────────────────────────────────────────────
# Build registry context for injection into prompts
# ─────────────────────────────────────────────
# Format registry by type for prompt injection
def format_registry_for_prompt(registry_entries, max_per_type=80):
    """Format registry as a compact reference for the extraction prompt."""
    by_type = defaultdict(list)
    for entry in registry_entries:
        by_type[entry["type"]].append(entry["canonical_label"])

    lines = []
    for etype in ENTITY_TYPES:
        entities = sorted(by_type.get(etype, []))[:max_per_type]
        if entities:
            lines.append(f"  {etype}: {', '.join(entities)}")

    return "\n".join(lines)

# For chunks, we'll inject a RELEVANT subset of the registry
# (entities whose surface forms appear in or near the chunk text)
def get_relevant_registry(chunk_text, registry_entries, max_entries=60):
    """Find registry entries relevant to this chunk."""
    chunk_lower = chunk_text.lower()
    chunk_tokens = set(re.findall(r"[a-z]{3,}", chunk_lower))

    scored = []
    for entry in registry_entries:
        # Check if any surface form or canonical label tokens overlap with chunk
        entry_tokens = set()
        for sf in entry.get("all_surface_forms", []) + [entry["canonical_label"]]:
            entry_tokens.update(re.findall(r"[a-z]{3,}", sf.lower()))

        overlap = len(entry_tokens & chunk_tokens)
        if overlap > 0:
            # Bonus if exact substring match
            bonus = 0
            canonical_lower = entry["canonical_label"].lower()
            if canonical_lower in chunk_lower:
                bonus = 10
            for sf in entry.get("all_surface_forms", []):
                if sf.lower() in chunk_lower:
                    bonus = max(bonus, 5)

            scored.append((entry, overlap + bonus))

    # Sort by relevance, take top N
    scored.sort(key=lambda x: -x[1])
    return [e for e, s in scored[:max_entries]]

# Test
sample_chunk = chunks[0]["text"]
relevant = get_relevant_registry(sample_chunk, registry_entries)
print(f"Sample chunk: {len(sample_chunk)} chars")
print(f"Relevant registry entries: {len(relevant)}")
print(f"Sample relevant entries: {[e['canonical_label'] for e in relevant[:5]]}")


Sample chunk: 1298 chars
Relevant registry entries: 60
Sample relevant entries: ['Donation via Charitable and Social Enterprises', 'Commercial and Industrial Repair and Refurbishment', 'Charitable and Social Enterprises', 'Reusable alternatives to single-use items', 'Corporate Donation of Unsold Goods']


In [18]:
# ─────────────────────────────────────────────
# Pass 3 prompts
# ─────────────────────────────────────────────
P3_SYSTEM = f"""You extract a HIGH-PRECISION knowledge graph from governance/policy text about building deconstruction,
secondary materials, reuse/recycling, and circular economy in the built environment.

ENTITY TYPES: {ENTITY_TYPES}
RELATION TYPES: {RELATION_TYPES}

CRITICAL INSTRUCTION — ENTITY REGISTRY:
You will be given a registry of KNOWN canonical entities. When you encounter a concept in the text:
1. FIRST check if it matches any entity in the registry (same concept, even if worded differently).
2. If YES → use the EXACT canonical label and type from the registry.
3. If NO → create a new entity, but ONLY if it's clearly a distinct concept not in the registry.

RULES:
- Use registry canonical labels EXACTLY as given (same spelling, same case).
- Every entity needs: name, type, is_new (true if not in registry), evidence_excerpt (10-35 word verbatim quote).
- Every relation needs: subject, predicate, object, evidence_excerpt.
- Both subject and object must appear in your entity list.
- Keep extractions precise and well-supported by the text.
- If nothing reliable can be extracted, return empty lists.
"""

P3_USER = """ENTITY REGISTRY (use these canonical labels when the concept appears in the text):
{registry_text}

---

TEXT CHUNK:
{chunk_text}

---

TASK: Extract entities and relations. Map to registry entities where possible.

Return JSON:
{{
  "entities": [
    {{"name": "Canonical Label From Registry", "type": "Type", "is_new": false, "evidence_excerpt": "verbatim 10-35 words"}},
    {{"name": "Genuinely New Entity", "type": "Type", "is_new": true, "evidence_excerpt": "verbatim 10-35 words"}}
  ],
  "relations": [
    {{"subject": "Entity Name", "predicate": "RELATION_TYPE", "object": "Entity Name", "evidence_excerpt": "verbatim 10-35 words"}}
  ]
}}

CONSTRAINTS:
- Max 15 entities, max 25 relations.
- Prefer governance: instruments, authorities, jurisdictions, requirements.
- Also capture practices, materials, stakeholders, barriers, enablers, outcomes.
- Return ONLY valid JSON.
"""

def validate_p3_output(data):
    """Validate Pass 3 output."""
    ents = data.get("entities", []) or []
    rels = data.get("relations", []) or []

    valid_ents = []
    for e in ents:
        if (isinstance(e, dict)
            and isinstance(e.get("name"), str) and e["name"].strip()
            and e.get("type") in ENTITY_TYPES
            and isinstance(e.get("evidence_excerpt"), str)):
            valid_ents.append(e)

    valid_rels = []
    ent_names = {e["name"].strip() for e in valid_ents}
    for r in rels:
        if (isinstance(r, dict)
            and isinstance(r.get("subject"), str) and r["subject"].strip() in ent_names
            and r.get("predicate") in RELATION_TYPES
            and isinstance(r.get("object"), str) and r["object"].strip() in ent_names):
            valid_rels.append(r)

    return {"entities": valid_ents[:15], "relations": valid_rels[:25]}

print("Pass 3 prompts ready ✓")


Pass 3 prompts ready ✓


In [19]:
# ─────────────────────────────────────────────
# Run Pass 3 (with caching)
# ─────────────────────────────────────────────
P3_CACHE = os.path.join(CACHE_DIR, "pass3_grounded.jsonl")
p3_cache = load_cache(P3_CACHE)

p3_results = []
p3_errors  = []

for rec in tqdm(chunks, desc="Pass 3 — Grounded Extraction"):
    chunk_id = rec["chunk_id"]
    chunk_text = rec["text"].strip()

    # Check cache
    if chunk_id in p3_cache:
        p3_results.append({"chunk_id": chunk_id, **p3_cache[chunk_id]})
        continue

    # Get relevant registry entries for this chunk
    relevant = get_relevant_registry(chunk_text, registry_entries)

    # Format registry text
    registry_text = "\n".join(
        f"  [{e['type']}] {e['canonical_label']}"
        for e in relevant
    )
    if not registry_text:
        registry_text = "(No closely matching registry entries for this chunk)"

    try:
        raw = call_claude(
            model=MODEL_EXTRACTION,
            system=P3_SYSTEM,
            user_msg=P3_USER.format(registry_text=registry_text, chunk_text=chunk_text),
            max_tokens=4096,
        )
        data = parse_json_response(raw)
        validated = validate_p3_output(data)

        # Cache
        save_cache_entry(P3_CACHE, chunk_id, validated)
        p3_cache[chunk_id] = validated
        p3_results.append({"chunk_id": chunk_id, **validated})

    except Exception as e:
        p3_errors.append({"chunk_id": chunk_id, "error": str(e)})
        print(f"  ✗ {chunk_id}: {e}")

print(f"\nPass 3 complete: {len(p3_results)} chunks processed, {len(p3_errors)} errors")


Pass 3 — Grounded Extraction:   7%|▋         | 24/357 [07:23<1:55:03, 20.73s/it]

  ✗ 27.Engineering-for-Australias-Circular-Economy-White-Paper-Download::c80::b2ecead212ec052e: Could not parse JSON from: ```json
{
  "entities": [
    {
      "name": "EPR schemes",
      "type": "Instrument",
      "is_new": false,
      "evidence_excerpt": "Ensure EPR schemes incentivise circular design – perhaps by m...


Pass 3 — Grounded Extraction:  76%|███████▌  | 272/357 [1:24:23<29:35, 20.88s/it]

  ✗ 71.The_Reuse_Playbook_-_Circular_Buildings_Toolkit::c181::19035d1020d2fdbf: Could not parse JSON from: ```json
{
  "entities": [
    {
      "name": "Steel Construction Institute P440 Reuse of Pre-1970 Steelwork",
      "type": "Instrument",
      "is_new": false,
      "evidence_excerpt": "SCI P440 – ...


Pass 3 — Grounded Extraction: 100%|██████████| 357/357 [1:49:55<00:00, 18.47s/it]


Pass 3 complete: 355 chunks processed, 2 errors





## 7 — Graph Assembly

Build the final knowledge graph from Pass 3 results. Entities are already canonical, so minimal post-processing needed.


In [20]:
# ─────────────────────────────────────────────
# Build node and edge tables
# ─────────────────────────────────────────────
node_store = {}  # node_id -> dict
edge_rows  = []
kg_rows    = []

# Track provenance
node_provenance = defaultdict(lambda: {"chunk_ids": set(), "surface_forms": set()})

for res in p3_results:
    chunk_id = res["chunk_id"]

    # Find original chunk record
    chunk_rec = next((c for c in chunks if c["chunk_id"] == chunk_id), None)
    if not chunk_rec:
        continue

    chunk_text = chunk_rec["text"]
    source_file = chunk_rec.get("source_file")
    page_num = chunk_rec.get("page_num")
    judge_label = chunk_rec.get("_judge_label")
    judge_conf = chunk_rec.get("_judge_confidence")

    local_name_to_id = {}

    for e in res["entities"]:
        name = e["name"].strip()
        etype = e["type"]
        is_new = e.get("is_new", True)

        # Try to resolve to registry canonical form
        name_norm_val = norm_text(name)
        if name_norm_val in ENTITY_REGISTRY:
            reg = ENTITY_REGISTRY[name_norm_val]
            canonical_name = reg["canonical"]
            canonical_type = reg["type"]
        else:
            canonical_name = name
            canonical_type = etype

        nid = make_entity_id(canonical_type, canonical_name)
        local_name_to_id[name] = nid

        # Track provenance
        node_provenance[nid]["chunk_ids"].add(chunk_id)
        node_provenance[nid]["surface_forms"].add(name)

        if nid not in node_store:
            node_store[nid] = {
                "node_id": nid,
                "entity_type": canonical_type,
                "name": canonical_name,
                "name_norm": norm_text(canonical_name),
                "is_registry": not is_new,
                "first_seen_chunk": chunk_id,
                "first_seen_source": source_file,
                "first_seen_page": page_num,
                "example_evidence": (e.get("evidence_excerpt") or "")[:500],
            }

    for r in res["relations"]:
        subj = r["subject"].strip()
        obj = r["object"].strip()
        pred = r["predicate"]
        evid = (r.get("evidence_excerpt") or "").strip()

        if subj not in local_name_to_id or obj not in local_name_to_id:
            continue

        sid = local_name_to_id[subj]
        oid = local_name_to_id[obj]

        if sid == oid:  # Skip self-loops
            continue

        s_char, e_char = evidence_offsets(chunk_text, evid)

        edge_rows.append({
            "edge_id": make_edge_id(sid, pred, oid, str(chunk_id)),
            "subject_id": sid,
            "predicate": pred,
            "object_id": oid,
            "chunk_id": chunk_id,
            "source_file": source_file,
            "page_num": page_num,
            "judge_label": judge_label,
            "statement_confidence": judge_conf,
            "evidence_excerpt": evid[:800],
            "evidence_start_char": s_char,
            "evidence_end_char": e_char,
        })

    kg_rows.append({
        "chunk_id": chunk_id,
        "source_file": source_file,
        "page_num": page_num,
        "entities": res["entities"],
        "relations": res["relations"],
    })

# Add provenance to nodes
for nid, prov in node_provenance.items():
    if nid in node_store:
        node_store[nid]["chunk_count"] = len(prov["chunk_ids"])
        node_store[nid]["surface_forms"] = "; ".join(sorted(prov["surface_forms"]))

print(f"Graph assembled:")
print(f"  Nodes: {len(node_store)}")
print(f"  Edges: {len(edge_rows)}")


Graph assembled:
  Nodes: 1748
  Edges: 4533


In [21]:
# ─────────────────────────────────────────────
# Build DataFrames and remove duplicate edges
# ─────────────────────────────────────────────
nodes_df = pd.DataFrame(list(node_store.values())).sort_values(["entity_type", "name_norm"])
edges_df = pd.DataFrame(edge_rows).drop_duplicates(subset=["edge_id"])

# Remove edges referencing non-existent nodes
valid_nids = set(nodes_df["node_id"])
before = len(edges_df)
edges_df = edges_df[edges_df["subject_id"].isin(valid_nids) & edges_df["object_id"].isin(valid_nids)]
print(f"Edge cleanup: {before} → {len(edges_df)} (removed {before - len(edges_df)} orphan edges)")

# Compute edge weight (number of chunks supporting each unique triple)
edge_triples = edges_df.groupby(["subject_id", "predicate", "object_id"]).agg(
    support_count=("chunk_id", "nunique"),
    avg_confidence=("statement_confidence", "mean"),
    evidence_samples=("evidence_excerpt", lambda x: " | ".join(x.head(3))),
).reset_index()

print(f"\nUnique triples: {len(edge_triples)}")
print(f"Support count distribution:")
print(edge_triples["support_count"].describe().to_string())

print(f"\n--- Final Graph Statistics ---")
print(f"Nodes: {len(nodes_df)}")
print(f"Unique edges (triples): {len(edge_triples)}")
print(f"\nNode type distribution:")
print(nodes_df["entity_type"].value_counts().to_string())
print(f"\nRelation distribution:")
print(edges_df["predicate"].value_counts().to_string())

# Registry vs new entities
if "is_registry" in nodes_df.columns:
    reg_counts = nodes_df["is_registry"].value_counts()
    print(f"\nRegistry entities: {reg_counts.get(True, 0)}")
    print(f"New entities (not in registry): {reg_counts.get(False, 0)}")


Edge cleanup: 4529 → 4529 (removed 0 orphan edges)

Unique triples: 3929
Support count distribution:
count    3929.000000
mean        1.152711
std         0.444007
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         6.000000

--- Final Graph Statistics ---
Nodes: 1748
Unique edges (triples): 3929

Node type distribution:
entity_type
Practice         460
Instrument       261
MaterialAsset    254
Requirement      156
Stakeholder      143
Barrier          126
OutcomeMetric    120
Authority        100
Enabler           85
Jurisdiction      43

Relation distribution:
predicate
INVOLVES      1121
ENABLES        810
REQUIRES       604
APPLIES_TO     487
PRODUCES       360
AFFECTS        356
BARRIERS       273
APPLIES_IN     213
ISSUED_BY      150
REFERENCES     141
PROHIBITS       14

Registry entities: 1529
New entities (not in registry): 219


## 8 — Save Outputs

In [22]:
# ─────────────────────────────────────────────
# Save all outputs
# ─────────────────────────────────────────────
import networkx as nx

# CSVs
nodes_csv_path = os.path.join(OUTPUT_DIR, "nodes.csv")
edges_csv_path = os.path.join(OUTPUT_DIR, "edges.csv")
triples_csv_path = os.path.join(OUTPUT_DIR, "edge_triples.csv")

nodes_df.to_csv(nodes_csv_path, index=False)
edges_df.to_csv(edges_csv_path, index=False)
edge_triples.to_csv(triples_csv_path, index=False)

# KG JSONL
kg_jsonl_path = os.path.join(OUTPUT_DIR, "KG_records.jsonl")
with open(kg_jsonl_path, "w") as f:
    for row in kg_rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")

# Build and save NetworkX graph for quick analysis
G = nx.MultiDiGraph()
for _, row in nodes_df.iterrows():
    G.add_node(row["node_id"], **row.to_dict())
for _, row in edges_df.iterrows():
    G.add_edge(row["subject_id"], row["object_id"],
               predicate=row["predicate"],
               chunk_id=row["chunk_id"],
               confidence=row["statement_confidence"])

# Graph stats
components = list(nx.weakly_connected_components(G))
largest_cc = max(components, key=len) if components else set()

print(f"\n{'='*60}")
print(f"STAGE 3 v2 — FINAL OUTPUT SUMMARY")
print(f"{'='*60}")
print(f"Nodes:              {len(nodes_df)}")
print(f"Edges (all):        {len(edges_df)}")
print(f"Unique triples:     {len(edge_triples)}")
print(f"Components:         {len(components)}")
print(f"Largest component:  {len(largest_cc)} nodes ({100*len(largest_cc)/len(nodes_df):.1f}%)")
print(f"\nOutput directory: {OUTPUT_DIR}")
print(f"  nodes.csv          ({len(nodes_df)} rows)")
print(f"  edges.csv          ({len(edges_df)} rows)")
print(f"  edge_triples.csv   ({len(edge_triples)} rows)")
print(f"  KG_records.jsonl   ({len(kg_rows)} rows)")
print(f"  entity_registry.json")
print(f"  merge_log.json")



STAGE 3 v2 — FINAL OUTPUT SUMMARY
Nodes:              1748
Edges (all):        4529
Unique triples:     3929
Components:         105
Largest component:  1576 nodes (90.2%)

Output directory: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/3_graph_built_v2
  nodes.csv          (1748 rows)
  edges.csv          (4529 rows)
  edge_triples.csv   (3929 rows)
  KG_records.jsonl   (355 rows)
  entity_registry.json
  merge_log.json


## 9 — Validation & Quality Checks

Quick checks to verify the graph quality before moving to Stage 4.


In [23]:
# ─────────────────────────────────────────────
# Quality checks
# ─────────────────────────────────────────────

# 1. Check for remaining near-duplicates
print("=== Near-Duplicate Check ===")
from itertools import combinations

for etype in ENTITY_TYPES:
    type_nodes = nodes_df[nodes_df["entity_type"] == etype]["name"].tolist()
    if len(type_nodes) < 2:
        continue

    # Quick token overlap check
    suspects = []
    for a, b in combinations(type_nodes, 2):
        tokens_a = set(norm_text(a).split())
        tokens_b = set(norm_text(b).split())
        if len(tokens_a) > 0 and len(tokens_b) > 0:
            jaccard = len(tokens_a & tokens_b) / len(tokens_a | tokens_b)
            if jaccard >= 0.6:
                suspects.append((a, b, jaccard))

    if suspects:
        print(f"\n  [{etype}] {len(suspects)} potential duplicates:")
        for a, b, j in sorted(suspects, key=lambda x: -x[2])[:5]:
            print(f"    {j:.2f} | \"{a}\" ↔ \"{b}\"")

# 2. Check node degree distribution
print("\n=== Degree Distribution ===")
G_simple = nx.Graph()
for _, row in edge_triples.iterrows():
    G_simple.add_edge(row["subject_id"], row["object_id"])

degrees = dict(G_simple.degree())
if degrees:
    deg_values = list(degrees.values())
    isolates = sum(1 for d in deg_values if d == 0)
    print(f"  Isolate nodes (degree 0): {len(nodes_df) - len(degrees) + isolates}")
    print(f"  Mean degree: {np.mean(deg_values):.1f}")
    print(f"  Max degree: {max(deg_values)}")

    # Top connected nodes
    top_nodes = sorted(degrees.items(), key=lambda x: -x[1])[:15]
    print(f"\n  Top 15 most connected:")
    for nid, deg in top_nodes:
        node = node_store.get(nid, {})
        print(f"    {deg:3d} edges | [{node.get('entity_type','')}] {node.get('name','')}")

# 3. Summary comparison with v1
print(f"\n=== Comparison with Stage 3 v1 ===")
print(f"  v1: 2,180 nodes, 2,026 edges, 471 components")
print(f"  v2: {len(nodes_df)} nodes, {len(edge_triples)} unique triples, {len(components)} components")
print(f"  Reduction: {100*(1-len(nodes_df)/2180):.1f}% fewer nodes")


=== Near-Duplicate Check ===

  [Instrument] 22 potential duplicates:
    0.80 | "2024 National Circular Economy Framework" ↔ "National Circular Economy Framework"
    0.75 | "National Waste Report" ↔ "National Waste Report 2020"
    0.75 | "National Waste Report" ↔ "National Waste Report 2022"
    0.71 | "National End-of-Waste Decision for Recycled Aggregates" ↔ "Single-Case End-of-Waste Decision for Recycled Aggregates"
    0.67 | "A Circular Economy for Victoria" ↔ "Designers for a Circular Economy"

  [Authority] 1 potential duplicates:
    0.80 | "Australian State and Territory Governments" ↔ "State and Territory Governments"

  [Requirement] 6 potential duplicates:
    0.80 | "Embodied Energy and Carbon" ↔ "Requirement: Embodied Energy and Carbon"
    0.67 | "Acceptance Criteria" ↔ "Material Acceptance Criteria"
    0.67 | "Acceptance Criteria" ↔ "Reuse Acceptance Criteria"
    0.67 | "Circular Design Principles" ↔ "Design Principles"
    0.67 | "Construction GPP criteria" ↔ "GPP