# 5) Governance Intelligence Backend Assembly

**Purpose:** build a queryable backend over the cleaned knowledge graph (Stage 4) that combines
graph traversal with evidence retrieval (RAG) to answer structured governance questions.

**Three components:**
1. **Graph Store** — NetworkX property graph with traversal primitives
2. **Vector Store** — ChromaDB index over source chunks for evidence retrieval
3. **Query Engine** — structured query functions + Claude for grounded natural-language answers

**Supported query types:**
- `trace_governance_pathway` — What instruments/requirements govern a practice in a jurisdiction?
- `compare_jurisdictions` — How do two jurisdictions differ in coverage?
- `identify_gaps` — Which practices have barriers but no enabling instruments?
- `explain_entity` — What do we know about a specific entity?
- `free_query` — Open-ended question answered with graph context + evidence

**Pipeline:**
- §0 — Dependencies & configuration
- §1 — Load cleaned graph + post-load dedup fix
- §2 — Build graph store with traversal primitives
- §3 — Build vector store (embed & index source chunks)
- §4 — Query engine: structured query functions
- §5 — RAG integration: Claude-grounded answers
- §6 — Demo queries + validation
- §7 — Export backend artefacts


## 0) Install dependencies
Run once per Colab runtime.

In [9]:
!pip -q install anthropic pandas numpy networkx sentence-transformers chromadb tqdm tenacity
!pip -q install rapidfuzz

import os, json, re, warnings
from pathlib import Path
from collections import defaultdict, Counter
from typing import List, Dict, Optional, Tuple, Any

import pandas as pd
import numpy as np
import networkx as nx
from tqdm import tqdm

warnings.filterwarnings("ignore", category=FutureWarning)

print("✅ Dependencies installed.")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/3.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m2.9/3.2 MB[0m [31m43.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25h✅ Dependencies installed.


## 1) Configuration + load cleaned graph
Reads Stage 4 outputs and applies a post-load dedup fix for any nodes that ended up
with identical (name_norm, entity_type) after cross-type resolution changed their types.

In [3]:
# ─────────────────────────────────────────────
# 1a) Configuration
# ─────────────────────────────────────────────
from google.colab import userdata

ANTHROPIC_API_KEY = userdata.get("Anthropic_API_KEY")
MODEL = "claude-sonnet-4-5"
LLM_TEMPERATURE = 0.0

# Paths
INPUT_DIR  = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned"
CHUNKS_DIR = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/batch_enhanced_KG_outputs/outputs/kept_chunks"
OUTPUT_DIR = "/content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/5_backend"

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

COUNTRY = "AU"

print(f"Config loaded.")
print(f"Input:  {INPUT_DIR}")
print(f"Output: {OUTPUT_DIR}")


Config loaded.
Input:  /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/4_kg_cleaned
Output: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/5_backend


In [4]:
# ─────────────────────────────────────────────
# 1b) Load Stage 4 outputs
# ─────────────────────────────────────────────
nodes_df = pd.read_csv(os.path.join(INPUT_DIR, "nodes_clean.csv"))
edges_df = pd.read_csv(os.path.join(INPUT_DIR, "edges_clean.csv"))
triples_df = pd.read_csv(os.path.join(INPUT_DIR, "triples_clean.csv"))

# Load KG records (chunk-level extractions with evidence)
kg_records = []
kg_path = os.path.join(INPUT_DIR, "KG_records_tagged.jsonl")
with open(kg_path) as f:
    for line in f:
        if line.strip():
            kg_records.append(json.loads(line))

# Load source chunks for RAG (the original kept chunks with full text)
source_chunks = []
master_path = os.path.join(CHUNKS_DIR, "MASTER_kept_judged.jsonl")
if os.path.exists(master_path):
    with open(master_path) as f:
        for line in f:
            if line.strip():
                source_chunks.append(json.loads(line))

# Load schema
with open(os.path.join(INPUT_DIR, "multi_country_schema.json")) as f:
    schema = json.load(f)

print(f"Loaded:")
print(f"  Nodes:    {len(nodes_df)}")
print(f"  Edges:    {len(edges_df)}")
print(f"  Triples:  {len(triples_df)}")
print(f"  KG recs:  {len(kg_records)}")
print(f"  Source chunks: {len(source_chunks)}")


Loaded:
  Nodes:    1642
  Edges:    3223
  Triples:  2788
  KG recs:  355
  Source chunks: 364


In [5]:
# ─────────────────────────────────────────────
# 1c) Post-load dedup fix
# ─────────────────────────────────────────────
# After Stage 4's cross-type resolution changed entity types, some nodes may have
# ended up with identical (name_norm, entity_type) pairs. Merge these.

nodes_df["name_norm"] = nodes_df["name_norm"].fillna(nodes_df["name"].str.lower().str.strip())

dupes = nodes_df.groupby(["name_norm", "entity_type"])["node_id"].apply(list)
dupes = dupes[dupes.apply(len) > 1]

dedup_map = {}
dedup_count = 0
for (name_norm, etype), ids in dupes.items():
    canonical = ids[0]
    for other in ids[1:]:
        dedup_map[other] = canonical
        dedup_count += 1

if dedup_map:
    # Remap edges
    edges_df["subject_id"] = edges_df["subject_id"].map(lambda x: dedup_map.get(x, x))
    edges_df["object_id"] = edges_df["object_id"].map(lambda x: dedup_map.get(x, x))
    # Remove self-loops
    edges_df = edges_df[edges_df["subject_id"] != edges_df["object_id"]]
    # Remove merged nodes
    nodes_df = nodes_df[~nodes_df["node_id"].isin(dedup_map.keys())]
    print(f"Post-load dedup: merged {dedup_count} duplicate nodes")
    for (name_norm, etype), ids in dupes.items():
        print(f"  [{etype}] \"{name_norm}\" — merged {len(ids)} copies into 1")
else:
    print("Post-load dedup: no duplicates found ✓")

print(f"\nWorking set: {len(nodes_df)} nodes, {len(edges_df)} edges")


Post-load dedup: merged 1 duplicate nodes
  [Practice] "design for disassembly (dfd)" — merged 2 copies into 1

Working set: 1641 nodes, 3223 edges


## 2) Build graph store with traversal primitives
The graph store wraps NetworkX with domain-specific traversal functions.
These are the building blocks that query functions in §4 compose.

In [6]:
# ─────────────────────────────────────────────
# 2a) Build NetworkX property graph
# ─────────────────────────────────────────────

G = nx.MultiDiGraph()

for _, row in nodes_df.iterrows():
    attrs = {k: v for k, v in row.to_dict().items() if pd.notna(v)}
    G.add_node(row["node_id"], **attrs)

for _, row in edges_df.iterrows():
    attrs = {k: v for k, v in row.to_dict().items()
             if pd.notna(v) and k not in ("subject_id", "object_id")}
    G.add_edge(row["subject_id"], row["object_id"], **attrs)

# Build lookup indices
name_to_ids = defaultdict(list)  # name_norm -> [node_ids]
type_to_ids = defaultdict(list)  # entity_type -> [node_ids]

for nid, data in G.nodes(data=True):
    name_norm = data.get("name_norm", data.get("name", "").lower().strip())
    etype = data.get("entity_type", "")
    name_to_ids[name_norm].append(nid)
    type_to_ids[etype].append(nid)

print(f"Graph store built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
print(f"Name index: {len(name_to_ids)} unique names")
print(f"Type index: {len(type_to_ids)} types")


Graph store built: 1641 nodes, 3223 edges
Name index: 1637 unique names
Type index: 10 types


In [7]:
# ─────────────────────────────────────────────
# 2b) Graph traversal primitives
# ─────────────────────────────────────────────

class GraphStore:
    """Domain-aware graph store over the governance knowledge graph."""

    def __init__(self, G, name_to_ids, type_to_ids):
        self.G = G
        self.name_to_ids = name_to_ids
        self.type_to_ids = type_to_ids

    # ── Lookup ──────────────────────────────────────────────

    def find_entity(self, query: str, entity_type: str = None) -> List[Dict]:
        """Fuzzy-find entities by name. Optionally filter by type."""
        from rapidfuzz import fuzz
        query_lower = query.lower().strip()

        # Exact match first
        if query_lower in self.name_to_ids:
            ids = self.name_to_ids[query_lower]
            if entity_type:
                ids = [i for i in ids if self.G.nodes[i].get("entity_type") == entity_type]
            return [self._node_summary(i) for i in ids]

        # Fuzzy match
        candidates = []
        for name_norm, ids in self.name_to_ids.items():
            score = fuzz.token_sort_ratio(query_lower, name_norm)
            if score >= 70:
                for nid in ids:
                    if entity_type and self.G.nodes[nid].get("entity_type") != entity_type:
                        continue
                    candidates.append((score, nid))

        candidates.sort(key=lambda x: -x[0])
        return [self._node_summary(nid) for _, nid in candidates[:10]]

    def get_node(self, node_id: str) -> Dict:
        """Get full node data."""
        if node_id not in self.G:
            return {}
        return dict(self.G.nodes[node_id])

    def _node_summary(self, node_id: str) -> Dict:
        d = self.G.nodes[node_id]
        return {
            "node_id": node_id,
            "name": d.get("name", ""),
            "entity_type": d.get("entity_type", ""),
            "degree": self.G.degree(node_id),
        }

    # ── Neighbours ──────────────────────────────────────────

    def get_neighbours(self, node_id: str, direction: str = "both",
                       predicate: str = None, neighbour_type: str = None) -> List[Dict]:
        """Get neighbouring nodes with optional filters."""
        results = []

        if direction in ("out", "both"):
            for _, target, data in self.G.out_edges(node_id, data=True):
                if predicate and data.get("predicate") != predicate:
                    continue
                if neighbour_type and self.G.nodes[target].get("entity_type") != neighbour_type:
                    continue
                results.append({
                    **self._node_summary(target),
                    "predicate": data.get("predicate", ""),
                    "direction": "outgoing",
                    "evidence": data.get("evidence_excerpt", ""),
                })

        if direction in ("in", "both"):
            for source, _, data in self.G.in_edges(node_id, data=True):
                if predicate and data.get("predicate") != predicate:
                    continue
                if neighbour_type and self.G.nodes[source].get("entity_type") != neighbour_type:
                    continue
                results.append({
                    **self._node_summary(source),
                    "predicate": data.get("predicate", ""),
                    "direction": "incoming",
                    "evidence": data.get("evidence_excerpt", ""),
                })

        return results

    # ── Path finding ────────────────────────────────────────

    def find_paths(self, source_id: str, target_id: str, max_length: int = 4) -> List[List[str]]:
        """Find all simple paths between two nodes up to max_length."""
        G_undir = self.G.to_undirected()
        try:
            paths = list(nx.all_simple_paths(G_undir, source_id, target_id, cutoff=max_length))
            return paths[:20]  # Cap at 20 paths
        except (nx.NetworkXError, nx.NodeNotFound):
            return []

    # ── Subgraph extraction ─────────────────────────────────

    def ego_subgraph(self, node_id: str, radius: int = 2) -> nx.MultiDiGraph:
        """Extract ego network around a node."""
        G_undir = self.G.to_undirected()
        ego_nodes = nx.ego_graph(G_undir, node_id, radius=radius).nodes()
        return self.G.subgraph(ego_nodes).copy()

    def type_subgraph(self, entity_types: List[str]) -> nx.MultiDiGraph:
        """Extract subgraph containing only specified entity types."""
        nodes = [n for n, d in self.G.nodes(data=True) if d.get("entity_type") in entity_types]
        return self.G.subgraph(nodes).copy()

    # ── Jurisdiction queries ────────────────────────────────

    def get_jurisdiction_entities(self, jurisdiction_name: str,
                                  entity_type: str = None) -> List[Dict]:
        """Get all entities linked to a jurisdiction via APPLIES_IN."""
        j_nodes = self.find_entity(jurisdiction_name, entity_type="Jurisdiction")
        if not j_nodes:
            return []

        results = []
        for j in j_nodes:
            neighbours = self.get_neighbours(j["node_id"], direction="in", predicate="APPLIES_IN")
            if entity_type:
                neighbours = [n for n in neighbours if n["entity_type"] == entity_type]
            results.extend(neighbours)
        return results

    def get_entities_by_type(self, entity_type: str) -> List[Dict]:
        """Get all entities of a given type."""
        ids = self.type_to_ids.get(entity_type, [])
        return [self._node_summary(i) for i in ids]

    # ── Statistics ──────────────────────────────────────────

    def summary_stats(self) -> Dict:
        comps = list(nx.weakly_connected_components(self.G))
        largest = max(comps, key=len) if comps else set()
        return {
            "nodes": self.G.number_of_nodes(),
            "edges": self.G.number_of_edges(),
            "components": len(comps),
            "largest_component": len(largest),
            "isolates": sum(1 for n in self.G if self.G.degree(n) == 0),
            "types": dict(Counter(d.get("entity_type","") for _,d in self.G.nodes(data=True))),
        }


# Instantiate
gs = GraphStore(G, name_to_ids, type_to_ids)

stats = gs.summary_stats()
print(f"GraphStore ready:")
print(f"  Nodes: {stats['nodes']}, Edges: {stats['edges']}")
print(f"  Components: {stats['components']}, Largest: {stats['largest_component']}")
print(f"  Isolates: {stats['isolates']}")


GraphStore ready:
  Nodes: 1641, Edges: 3223
  Components: 218, Largest: 1340
  Isolates: 174


In [10]:
# ─────────────────────────────────────────────
# 2c) Quick smoke test of graph traversal
# ─────────────────────────────────────────────

# Test entity lookup
print("=== Entity Lookup: 'design for disassembly' ===")
results = gs.find_entity("design for disassembly")
for r in results[:3]:
    print(f"  [{r['entity_type']}] {r['name']} (degree {r['degree']})")

# Test neighbours
if results:
    top = results[0]
    print(f"\n=== Neighbours of '{top['name']}' (Instruments only) ===")
    neighbours = gs.get_neighbours(top["node_id"], neighbour_type="Instrument")
    for n in neighbours[:8]:
        print(f"  {n['direction']:8s} --{n['predicate']:12s}--> [{n['entity_type']}] {n['name']}")

# Test jurisdiction query
print(f"\n=== Instruments applying in 'Australia' ===")
au_instruments = gs.get_jurisdiction_entities("australia", entity_type="Instrument")
for inst in au_instruments[:8]:
    print(f"  [{inst['entity_type']}] {inst['name']}")
print(f"  ... total: {len(au_instruments)}")


=== Entity Lookup: 'design for disassembly' ===
  [Practice] Design for Disassembly (DfD) (degree 133)
  [Practice] Building Disassembly (degree 57)

=== Neighbours of 'Design for Disassembly (DfD)' (Instruments only) ===
  outgoing --ENABLES     --> [Instrument] National Waste Policy
  outgoing --REFERENCES  --> [Instrument] ISO 20887:2020
  outgoing --REFERENCES  --> [Instrument] EU Level(s) Indicator 2.4 Design for Deconstruction
  outgoing --PRODUCES    --> [Instrument] Deconstruction Plan
  outgoing --PRODUCES    --> [Instrument] Sustainability Report
  incoming --REQUIRES    --> [Instrument] Engineering for Australia's Circular Economy: A National Strategy
  incoming --REQUIRES    --> [Instrument] National Circular Design Standards
  incoming --REQUIRES    --> [Instrument] Circular Economy Action Plan

=== Instruments applying in 'Australia' ===
  [Instrument] National Waste Policy
  [Instrument] National Waste Policy
  [Instrument] National Circular Design Standards
  [Instrumen

## 3) Build vector store (embed & index source chunks)
Embeds the 355+ source chunks into ChromaDB for evidence retrieval.
Each chunk is tagged with the entities and relations extracted from it,
enabling hybrid retrieval: graph traversal finds relevant entities →
vector search retrieves supporting evidence passages.

In [11]:
# ─────────────────────────────────────────────
# 3a) Prepare chunks for embedding
# ─────────────────────────────────────────────

# Build chunk_id -> entities/relations mapping from KG records
chunk_entity_map = defaultdict(set)
chunk_relation_map = defaultdict(set)

for rec in kg_records:
    cid = rec.get("chunk_id", "")
    entities = rec.get("entities", rec.get("nodes", []))
    relations = rec.get("relations", rec.get("edges", []))

    if isinstance(entities, list):
        for e in entities:
            if isinstance(e, dict):
                chunk_entity_map[cid].add(e.get("name", e.get("canonical_name", "")))
            elif isinstance(e, str):
                chunk_entity_map[cid].add(e)

    if isinstance(relations, list):
        for r in relations:
            if isinstance(r, dict):
                subj = r.get("subject", r.get("source", ""))
                pred = r.get("predicate", r.get("relation", ""))
                obj = r.get("object", r.get("target", ""))
                chunk_relation_map[cid].add(f"{subj} --{pred}--> {obj}")

# Prepare documents for ChromaDB
documents = []
metadatas = []
ids = []

for chunk in source_chunks:
    cid = chunk.get("chunk_id", chunk.get("id", ""))
    text = chunk.get("text", chunk.get("content", chunk.get("chunk_text", "")))

    if not text or not cid:
        continue

    # Metadata
    meta = {
        "chunk_id": str(cid),
        "source_file": str(chunk.get("source_file", chunk.get("filename", ""))),
        "page_num": str(chunk.get("page_num", chunk.get("page", ""))),
        "judge_label": str(chunk.get("judge_label", chunk.get("label", ""))),
        "country": COUNTRY,
    }

    # Add entity tags as searchable metadata
    entities_in_chunk = list(chunk_entity_map.get(cid, set()))
    if entities_in_chunk:
        meta["entities"] = " | ".join(sorted(entities_in_chunk)[:30])  # Cap for metadata size

    documents.append(text)
    metadatas.append(meta)
    ids.append(str(cid))

print(f"Prepared {len(documents)} chunks for embedding")
print(f"Chunks with entity tags: {sum(1 for m in metadatas if 'entities' in m)}")


Prepared 364 chunks for embedding
Chunks with entity tags: 355


In [12]:
# ─────────────────────────────────────────────
# 3b) Build ChromaDB collection
# ─────────────────────────────────────────────
import chromadb
from chromadb.utils import embedding_functions

# Use sentence-transformers (same model as KG extraction for consistency)
ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-MiniLM-L6-v2"
)

# Create persistent collection
chroma_client = chromadb.Client()  # In-memory for Colab; swap to PersistentClient for production

collection = chroma_client.get_or_create_collection(
    name="governance_chunks_au",
    embedding_function=ef,
    metadata={"hnsw:space": "cosine"},
)

# Add in batches (ChromaDB limit)
BATCH_SIZE = 100
for i in tqdm(range(0, len(documents), BATCH_SIZE), desc="Indexing chunks"):
    batch_end = min(i + BATCH_SIZE, len(documents))
    collection.add(
        documents=documents[i:batch_end],
        metadatas=metadatas[i:batch_end],
        ids=ids[i:batch_end],
    )

print(f"\nChromaDB collection '{collection.name}' built:")
print(f"  Documents: {collection.count()}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Indexing chunks: 100%|██████████| 4/4 [00:43<00:00, 10.92s/it]


ChromaDB collection 'governance_chunks_au' built:
  Documents: 364





In [13]:
# ─────────────────────────────────────────────
# 3c) Vector store wrapper
# ─────────────────────────────────────────────

class VectorStore:
    """Evidence retrieval over source chunks."""

    def __init__(self, collection, source_chunks):
        self.collection = collection
        # Build chunk lookup for full metadata
        self.chunk_lookup = {}
        for chunk in source_chunks:
            cid = str(chunk.get("chunk_id", chunk.get("id", "")))
            self.chunk_lookup[cid] = chunk

    def search(self, query: str, n_results: int = 5,
               entity_filter: str = None) -> List[Dict]:
        """Semantic search over chunks. Optionally filter by entity presence."""
        where_filter = None
        if entity_filter:
            where_filter = {"entities": {"$contains": entity_filter}}

        try:
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
                where=where_filter,
            )
        except Exception:
            # Fall back without filter if it fails
            results = self.collection.query(
                query_texts=[query],
                n_results=n_results,
            )

        output = []
        for i, doc_id in enumerate(results["ids"][0]):
            chunk_data = self.chunk_lookup.get(doc_id, {})
            meta = results["metadatas"][0][i] if results["metadatas"] else {}
            output.append({
                "chunk_id": doc_id,
                "text": results["documents"][0][i],
                "distance": results["distances"][0][i] if results["distances"] else None,
                "source_file": meta.get("source_file", ""),
                "page_num": meta.get("page_num", ""),
                "entities": meta.get("entities", ""),
            })

        return output

    def get_evidence_for_entity(self, entity_name: str, n_results: int = 5) -> List[Dict]:
        """Retrieve chunks most relevant to a specific entity."""
        return self.search(entity_name, n_results=n_results, entity_filter=entity_name)

    def get_evidence_for_triple(self, subject: str, predicate: str, obj: str,
                                 n_results: int = 3) -> List[Dict]:
        """Retrieve evidence for a specific relationship."""
        query = f"{subject} {predicate.lower().replace('_',' ')} {obj}"
        return self.search(query, n_results=n_results)


# Instantiate
vs = VectorStore(collection, source_chunks)

# Smoke test
print("=== Evidence search: 'design for disassembly requirements' ===")
evidence = vs.search("design for disassembly requirements", n_results=3)
for e in evidence:
    print(f"  [{e['source_file']}, p.{e['page_num']}] dist={e['distance']:.3f}")
    print(f"    {e['text'][:150]}...")
    print()


=== Evidence search: 'design for disassembly requirements' ===
  [70.The Reuse Playbook - Circular Buildings design.pdf, p.26] dist=0.328
    - When relevant, integrate "adaptability"-related information within the
building's Material Passport document

Project Reference


[PAGE 26]
5. Desi...

  [47.gbca-a-practical-guide-to-circular-procurement.pdf, p.25] dist=0.392
    [PAGE 25]
Design -
• Collaborate with stakeholders to reduce material use and specify lighter, space-efficient materials.
Procurement
Dematerialisatio...

  [47.gbca-a-practical-guide-to-circular-procurement.pdf, p.33] dist=0.395
    [PAGE 33]
End of use - • Ensure any final repair and maintenance work is carried out in line with operations and maintenance manuals to ensure the
bes...



## 4) Query engine: structured governance queries
These functions compose graph traversals into domain-meaningful queries.
Each returns a structured result dict that can be rendered directly or
passed to Claude for natural-language synthesis.

In [14]:
# ─────────────────────────────────────────────
# 4a) Query engine class
# ─────────────────────────────────────────────

class GovernanceQueryEngine:
    """Structured queries over the governance knowledge graph + evidence store."""

    def __init__(self, graph_store: GraphStore, vector_store: VectorStore):
        self.gs = graph_store
        self.vs = vector_store

    # ── Query 1: Governance Pathway ─────────────────────────

    def trace_governance_pathway(self, practice_area: str,
                                  jurisdiction: str = None) -> Dict:
        """
        Trace the governance pathway for a practice area:
        Authority --ISSUED_BY--> Instrument --REQUIRES/ENABLES--> Practice --PRODUCES--> Outcome
        Optionally scoped to a jurisdiction.
        """
        # Find the practice
        practices = self.gs.find_entity(practice_area, entity_type="Practice")
        if not practices:
            practices = self.gs.find_entity(practice_area)  # Try without type filter
        if not practices:
            return {"error": f"No entity found for '{practice_area}'", "results": []}

        target = practices[0]
        nid = target["node_id"]

        # Trace incoming: what instruments/requirements govern this practice?
        instruments = self.gs.get_neighbours(nid, direction="in", neighbour_type="Instrument")
        requirements = self.gs.get_neighbours(nid, direction="in", neighbour_type="Requirement")
        enablers = self.gs.get_neighbours(nid, direction="in", neighbour_type="Enabler")
        barriers = self.gs.get_neighbours(nid, direction="both", neighbour_type="Barrier")

        # Trace outgoing: what outcomes does this practice produce?
        outcomes = self.gs.get_neighbours(nid, direction="out", neighbour_type="OutcomeMetric")

        # For each instrument, find issuing authority
        instrument_details = []
        for inst in instruments:
            authorities = self.gs.get_neighbours(
                inst["node_id"], direction="out", neighbour_type="Authority"
            )
            # Check jurisdiction applicability
            jurisdictions = self.gs.get_neighbours(
                inst["node_id"], direction="out", predicate="APPLIES_IN"
            )
            jur_names = [j["name"] for j in jurisdictions]

            if jurisdiction and not any(
                jurisdiction.lower() in j.lower() for j in jur_names
            ):
                continue  # Skip instruments not applying in target jurisdiction

            instrument_details.append({
                "instrument": inst["name"],
                "predicate": inst["predicate"],
                "authorities": [a["name"] for a in authorities],
                "jurisdictions": jur_names,
            })

        # Get evidence
        evidence = self.vs.search(
            f"{practice_area} governance requirements instruments",
            n_results=5,
        )

        return {
            "query_type": "governance_pathway",
            "practice": target["name"],
            "practice_type": target["entity_type"],
            "jurisdiction_filter": jurisdiction,
            "instruments": instrument_details,
            "requirements": [{"name": r["name"], "predicate": r["predicate"]} for r in requirements],
            "enablers": [{"name": e["name"]} for e in enablers],
            "barriers": [{"name": b["name"]} for b in barriers],
            "outcomes": [{"name": o["name"]} for o in outcomes],
            "evidence": [{
                "text": e["text"][:500],
                "source": e["source_file"],
                "page": e["page_num"],
            } for e in evidence[:5]],
            "summary_counts": {
                "instruments": len(instrument_details),
                "requirements": len(requirements),
                "enablers": len(enablers),
                "barriers": len(barriers),
                "outcomes": len(outcomes),
            },
        }

    # ── Query 2: Jurisdictional Comparison ──────────────────

    def compare_jurisdictions(self, jurisdictions: List[str],
                               dimension: str = "Instrument") -> Dict:
        """
        Compare what instruments/practices/requirements apply in each jurisdiction.
        """
        comparison = {}

        for jur in jurisdictions:
            entities = self.gs.get_jurisdiction_entities(jur, entity_type=dimension)
            entity_names = sorted(set(e["name"] for e in entities))
            comparison[jur] = entity_names

        # Find shared vs unique
        all_entities = set()
        for names in comparison.values():
            all_entities.update(names)

        coverage_matrix = {}
        for entity in sorted(all_entities):
            coverage_matrix[entity] = {
                jur: entity in comparison.get(jur, [])
                for jur in jurisdictions
            }

        shared = [e for e, cov in coverage_matrix.items() if all(cov.values())]
        unique_per_jur = {
            jur: [e for e, cov in coverage_matrix.items()
                  if cov[jur] and sum(cov.values()) == 1]
            for jur in jurisdictions
        }

        # Evidence
        evidence = self.vs.search(
            f"comparison {' '.join(jurisdictions)} {dimension.lower()}",
            n_results=5,
        )

        return {
            "query_type": "jurisdictional_comparison",
            "jurisdictions": jurisdictions,
            "dimension": dimension,
            "entity_counts": {jur: len(names) for jur, names in comparison.items()},
            "coverage_matrix": coverage_matrix,
            "shared_entities": shared,
            "unique_entities": unique_per_jur,
            "evidence": [{
                "text": e["text"][:500],
                "source": e["source_file"],
                "page": e["page_num"],
            } for e in evidence[:5]],
        }

    # ── Query 3: Gap Analysis ───────────────────────────────

    def identify_gaps(self, practice_area: str = None) -> Dict:
        """
        Find practices that have barriers but lack enabling instruments.
        These represent governance gaps — areas where problems are known
        but policy solutions are missing.
        """
        practices = self.gs.get_entities_by_type("Practice")
        if practice_area:
            practices = [p for p in practices
                         if practice_area.lower() in p["name"].lower()]

        gaps = []
        well_governed = []

        for practice in practices:
            pid = practice["node_id"]

            barriers = self.gs.get_neighbours(pid, direction="both",
                                               neighbour_type="Barrier")
            instruments = self.gs.get_neighbours(pid, direction="in",
                                                  neighbour_type="Instrument")
            enablers = self.gs.get_neighbours(pid, direction="in",
                                               neighbour_type="Enabler")

            if barriers and not instruments and not enablers:
                gaps.append({
                    "practice": practice["name"],
                    "barriers": [b["name"] for b in barriers],
                    "barrier_count": len(barriers),
                    "has_instruments": False,
                    "has_enablers": False,
                    "gap_severity": "high",
                })
            elif barriers and (instruments or enablers):
                if len(barriers) > len(instruments) + len(enablers):
                    gaps.append({
                        "practice": practice["name"],
                        "barriers": [b["name"] for b in barriers],
                        "barrier_count": len(barriers),
                        "instrument_count": len(instruments),
                        "enabler_count": len(enablers),
                        "has_instruments": bool(instruments),
                        "has_enablers": bool(enablers),
                        "gap_severity": "medium",
                    })
                else:
                    well_governed.append({
                        "practice": practice["name"],
                        "barrier_count": len(barriers),
                        "instrument_count": len(instruments),
                        "enabler_count": len(enablers),
                    })

        # Sort gaps by severity then barrier count
        gaps.sort(key=lambda x: (-{"high": 2, "medium": 1}.get(x["gap_severity"], 0),
                                  -x["barrier_count"]))

        return {
            "query_type": "gap_analysis",
            "practice_filter": practice_area,
            "total_practices_analysed": len(practices),
            "gaps": gaps,
            "gap_count": len(gaps),
            "well_governed_count": len(well_governed),
            "high_severity_gaps": [g for g in gaps if g["gap_severity"] == "high"],
            "top_well_governed": sorted(well_governed,
                                         key=lambda x: -x["instrument_count"])[:10],
        }

    # ── Query 4: Entity Explainer ───────────────────────────

    def explain_entity(self, entity_name: str) -> Dict:
        """
        Comprehensive profile of a single entity: what it is, what it connects to,
        and supporting evidence from the corpus.
        """
        matches = self.gs.find_entity(entity_name)
        if not matches:
            return {"error": f"No entity found for '{entity_name}'"}

        entity = matches[0]
        nid = entity["node_id"]
        node_data = self.gs.get_node(nid)

        # All connections grouped by type and predicate
        all_neighbours = self.gs.get_neighbours(nid, direction="both")

        connections_by_type = defaultdict(list)
        connections_by_predicate = defaultdict(list)
        for n in all_neighbours:
            connections_by_type[n["entity_type"]].append(n["name"])
            connections_by_predicate[n["predicate"]].append(n["name"])

        # Evidence
        evidence = self.vs.get_evidence_for_entity(entity_name, n_results=5)

        return {
            "query_type": "entity_explanation",
            "name": entity["name"],
            "entity_type": entity["entity_type"],
            "degree": entity["degree"],
            "node_data": {k: v for k, v in node_data.items()
                          if k not in ("node_id",) and pd.notna(v) if not isinstance(v, float) or not np.isnan(v)},
            "connections_by_type": {k: sorted(set(v)) for k, v in connections_by_type.items()},
            "connections_by_predicate": {k: sorted(set(v)) for k, v in connections_by_predicate.items()},
            "total_connections": len(all_neighbours),
            "evidence": [{
                "text": e["text"][:500],
                "source": e["source_file"],
                "page": e["page_num"],
                "distance": e["distance"],
            } for e in evidence],
        }


# Instantiate
qe = GovernanceQueryEngine(gs, vs)
print("QueryEngine ready ✓")


QueryEngine ready ✓


In [15]:
# ─────────────────────────────────────────────
# 4b) Smoke test: structured queries
# ─────────────────────────────────────────────

# Test 1: Governance pathway
print("=" * 60)
print("QUERY 1: Governance pathway for 'Design for Disassembly'")
print("=" * 60)
result = qe.trace_governance_pathway("Design for Disassembly")
print(f"Practice: {result['practice']}")
print(f"Instruments: {result['summary_counts']['instruments']}")
for inst in result["instruments"][:5]:
    auth = ", ".join(inst["authorities"]) if inst["authorities"] else "—"
    jur = ", ".join(inst["jurisdictions"]) if inst["jurisdictions"] else "—"
    print(f"  • {inst['instrument']} (auth: {auth}, jur: {jur})")
print(f"Requirements: {result['summary_counts']['requirements']}")
for req in result["requirements"][:5]:
    print(f"  • {req['name']}")
print(f"Barriers: {result['summary_counts']['barriers']}")
print(f"Outcomes: {result['summary_counts']['outcomes']}")

# Test 2: Gap analysis
print(f"\n{'='*60}")
print("QUERY 2: Gap analysis (all practices)")
print("=" * 60)
gaps = qe.identify_gaps()
print(f"Practices analysed: {gaps['total_practices_analysed']}")
print(f"Gaps found: {gaps['gap_count']} ({len(gaps['high_severity_gaps'])} high severity)")
print(f"\nHigh-severity gaps (barriers, no instruments or enablers):")
for g in gaps["high_severity_gaps"][:8]:
    print(f"  ⚠ {g['practice']} — {g['barrier_count']} barriers: {', '.join(g['barriers'][:3])}")

# Test 3: Entity explainer
print(f"\n{'='*60}")
print("QUERY 3: Explain 'Material passports'")
print("=" * 60)
profile = qe.explain_entity("Material passports")
print(f"Name: {profile['name']} [{profile['entity_type']}]")
print(f"Connections: {profile['total_connections']}")
for etype, names in sorted(profile["connections_by_type"].items()):
    print(f"  {etype}: {', '.join(names[:5])}" + (f" (+{len(names)-5} more)" if len(names) > 5 else ""))


QUERY 1: Governance pathway for 'Design for Disassembly'
Practice: Design for Disassembly (DfD)
Instruments: 16
  • Engineering for Australia's Circular Economy: A National Strategy (auth: —, jur: —)
  • National Circular Design Standards (auth: —, jur: Australia)
  • Circular Economy Action Plan (auth: European Commission, jur: European Commission)
  • Design for Deconstruction Guide for Building Services (auth: Arup, jur: —)
  • Design for Deconstruction Guide for Building Services (auth: Arup, jur: —)
Requirements: 1
  • System accessibility for end-of-life removal
Barriers: 2
Outcomes: 23

QUERY 2: Gap analysis (all practices)
Practices analysed: 427
Gaps found: 39 (31 high severity)

High-severity gaps (barriers, no instruments or enablers):
  ⚠ Selective Deconstruction — 6 barriers: Time Constraints, Time Constraints, Cost Barriers
  ⚠ Demolition — 3 barriers: Downcycling, Downcycling, Time and Cost Pressures
  ⚠ Steel Extraction — 3 barriers: Locked-in stresses, Strip-out coinci

## 5) RAG integration: Claude-grounded answers
Takes structured query results from §4, combines with evidence from the vector store,
and passes everything to Claude for a grounded natural-language response.
Every claim in the response traces back to the knowledge graph and source documents.

In [16]:
# ─────────────────────────────────────────────
# 5a) RAG answer generator
# ─────────────────────────────────────────────
import anthropic
from tenacity import retry, wait_exponential, stop_after_attempt

client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

@retry(wait=wait_exponential(min=1, max=30), stop=stop_after_attempt(3))
def call_claude(system: str, user_msg: str, max_tokens: int = 2048) -> str:
    resp = client.messages.create(
        model=MODEL, max_tokens=max_tokens, temperature=LLM_TEMPERATURE,
        system=system, messages=[{"role": "user", "content": user_msg}],
    )
    return resp.content[0].text

RAG_SYSTEM = """You are an expert analyst for circular construction governance policy.
You answer questions using ONLY the structured knowledge graph data and evidence excerpts provided.
You must:
1. Ground every claim in the provided data — cite specific instruments, requirements, or evidence passages.
2. Use the format [Source: filename, p.X] for evidence citations.
3. If the data is insufficient to answer fully, say so explicitly.
4. Be precise about what applies in which jurisdiction.
5. Distinguish between instruments (formal policies), requirements (specific obligations),
   practices (what people do), and outcomes (measurable results).
6. Present findings in clear, structured prose suitable for a policy audience.
"""

def rag_answer(question: str, query_result: Dict,
               additional_evidence: List[Dict] = None) -> str:
    """Generate a grounded answer from structured query results + evidence."""

    # Format the structured data as context
    context_parts = [f"## Structured Query Result\n```json\n{json.dumps(query_result, indent=2, default=str)[:6000]}\n```"]

    if additional_evidence:
        context_parts.append("\n## Additional Evidence Passages")
        for i, ev in enumerate(additional_evidence[:5]):
            context_parts.append(
                f"\n### Evidence {i+1} [Source: {ev.get('source','')}, p.{ev.get('page','')}]\n{ev.get('text','')[:500]}"
            )

    context = "\n".join(context_parts)

    user_msg = f"""QUESTION: {question}

CONTEXT (from knowledge graph + source documents):
{context}

Please provide a grounded, evidence-based answer. Cite sources for specific claims."""

    return call_claude(RAG_SYSTEM, user_msg, max_tokens=2048)


def free_query(question: str, n_evidence: int = 8) -> str:
    """Open-ended question: search graph + vectors, then synthesise."""
    # Step 1: search vector store for relevant evidence
    evidence = vs.search(question, n_results=n_evidence)

    # Step 2: extract entity names from question and look them up in graph
    # Simple keyword extraction: find any graph entities mentioned
    question_lower = question.lower()
    mentioned_entities = []
    for name_norm, ids in name_to_ids.items():
        if len(name_norm) > 4 and name_norm in question_lower:
            for nid in ids:
                mentioned_entities.append(gs._node_summary(nid))

    # Step 3: for mentioned entities, get their graph neighbourhood
    graph_context = {}
    for ent in mentioned_entities[:5]:
        neighbours = gs.get_neighbours(ent["node_id"], direction="both")
        graph_context[ent["name"]] = {
            "type": ent["entity_type"],
            "connections": [{"name": n["name"], "type": n["entity_type"],
                            "predicate": n["predicate"]} for n in neighbours[:15]]
        }

    # Step 4: synthesise
    combined_result = {
        "query_type": "free_query",
        "mentioned_entities": [e["name"] for e in mentioned_entities],
        "graph_context": graph_context,
    }

    return rag_answer(
        question, combined_result,
        additional_evidence=[{
            "text": e["text"][:500],
            "source": e["source_file"],
            "page": e["page_num"],
        } for e in evidence]
    )

print("RAG answer generator ready ✓")


RAG answer generator ready ✓


## 6) Demo queries + validation
Run the four core query types with Claude-synthesised answers.
These outputs demonstrate the backend's capabilities for the presentation.

In [17]:
# ─────────────────────────────────────────────
# 6a) Demo 1: Governance pathway for DfD in Australia
# ─────────────────────────────────────────────
print("=" * 70)
print("DEMO 1: What instruments govern Design for Disassembly in Australia?")
print("=" * 70)

pathway = qe.trace_governance_pathway("Design for Disassembly", jurisdiction="Australia")
answer1 = rag_answer(
    "What instruments and requirements govern Design for Disassembly (DfD) practices in Australia?",
    pathway,
    pathway.get("evidence", []),
)
print(answer1)


DEMO 1: What instruments govern Design for Disassembly in Australia?
# Design for Disassembly (DfD) Governance in Australia

Based on the structured knowledge graph data and evidence provided, the governance of Design for Disassembly practices in Australia operates through the following instruments and requirements:

## Governing Instruments

Two primary policy instruments govern DfD practices in Australia:

1. **National Circular Design Standards** - This instrument formally REQUIRES Design for Disassembly practices [Source: Knowledge graph data, Australia jurisdiction]

2. **Government Policies** - These also REQUIRE DfD implementation [Source: Knowledge graph data, Australia jurisdiction]

**Important limitation**: The provided data does not specify which particular government policies or the issuing authorities for these instruments. The knowledge graph indicates empty authority fields for both instruments, meaning the specific government departments or agencies responsible for the

In [18]:
# ─────────────────────────────────────────────
# 6b) Demo 2: Jurisdictional comparison
# ─────────────────────────────────────────────
print("=" * 70)
print("DEMO 2: How do Victoria, NSW, and Queensland compare?")
print("=" * 70)

comparison = qe.compare_jurisdictions(
    ["Victoria", "New South Wales", "Queensland"],
    dimension="Instrument",
)
answer2 = rag_answer(
    "Compare the circular economy instruments that apply in Victoria, New South Wales, and Queensland. What are the key differences?",
    comparison,
)
print(answer2)
print(f"\n--- Raw comparison ---")
for jur, count in comparison["entity_counts"].items():
    print(f"  {jur}: {count} instruments")
print(f"  Shared across all three: {len(comparison['shared_entities'])}")


DEMO 2: How do Victoria, NSW, and Queensland compare?
# Comparison of Circular Economy Instruments: Victoria, New South Wales, and Queensland

## Overview

Based on the structured knowledge graph data, there are **significant differences** in the documented circular economy instruments across these three jurisdictions. Victoria and New South Wales each have three identified instruments, while **Queensland has zero documented circular economy instruments** in the provided data.

## Victoria's Circular Economy Instruments

Victoria has established three distinct policy instruments:

1. **Recycling Victoria: A New Economy (2020)** - A comprehensive circular economy strategy
2. **Recycled First Policy** - Focused on procurement and material use requirements
3. **Circular Economy Market Report** - An annual reporting mechanism that "provides Victorian specific construction" data [Source: 48.What-Works-3-For-circular-design-in-the-built-environment.pdf, p.10]

Victoria also demonstrates inst

In [19]:
# ─────────────────────────────────────────────
# 6c) Demo 3: Gap analysis
# ─────────────────────────────────────────────
print("=" * 70)
print("DEMO 3: What practices face barriers without enabling instruments?")
print("=" * 70)

gaps = qe.identify_gaps()
answer3 = rag_answer(
    "Which circular construction practices in Australia face significant barriers but lack enabling instruments or policy support? Identify the most critical governance gaps.",
    gaps,
)
print(answer3)


DEMO 3: What practices face barriers without enabling instruments?
# Critical Governance Gaps in Australian Circular Construction: Practices Facing Barriers Without Policy Support

Based on the structured knowledge graph analysis of 427 circular construction practices in Australia, several critical practices face significant barriers yet lack enabling instruments or policy support. These represent the most severe governance gaps requiring urgent attention.

## 1. Selective Deconstruction: The Most Critical Gap

**Selective Deconstruction** emerges as the practice with the highest barrier burden (6 distinct barriers) and complete absence of policy support:

**Barriers identified:**
- Time Constraints (cited twice, indicating severity)
- Cost Barriers
- Skill Gaps
- Economic Balance challenges
- Insurability Issues

This practice is fundamental to circular construction, as it enables material recovery at end-of-life. The absence of any enabling instruments despite multiple, well-document

In [20]:
# ─────────────────────────────────────────────
# 6d) Demo 4: Free-form query
# ─────────────────────────────────────────────
print("=" * 70)
print("DEMO 4: Free query — material passports and building deconstruction")
print("=" * 70)

answer4 = free_query(
    "How do material passports support building deconstruction practices, "
    "and what policy instruments encourage their adoption in Australia?"
)
print(answer4)


DEMO 4: Free query — material passports and building deconstruction
# Material Passports and Building Deconstruction in Australia: Policy Analysis

## How Material Passports Support Building Deconstruction Practices

Material passports serve as critical enablers of building deconstruction through several interconnected mechanisms:

### 1. **Enabling Selective Demolition and Material Recovery**

Material passports directly enable selective demolition practices by providing detailed documentation of building components and materials [Source: Knowledge Graph - Material passports ENABLES Selective Demolition]. This documentation supports material recovery outcomes by creating transparency about what materials exist within buildings and their specifications [Source: Knowledge Graph - Material passports ENABLES Material Recovery].

### 2. **Facilitating Material Tracking and Buildings as Material Banks**

The passports enable material tracking practices throughout the building lifecycle [Sou

## 7) Export backend artefacts
Saves the complete backend state for production deployment and presentation generation.

In [21]:
# ─────────────────────────────────────────────
# 7a) Save backend artefacts
# ─────────────────────────────────────────────

# Save the demo answers for presentation use
demos = {
    "demo_1_dfd_pathway": {
        "question": "What instruments govern Design for Disassembly in Australia?",
        "structured_result": pathway,
        "answer": answer1,
    },
    "demo_2_jurisdiction_comparison": {
        "question": "How do Victoria, NSW, and Queensland compare in circular economy instruments?",
        "structured_result": comparison,
        "answer": answer2,
    },
    "demo_3_gap_analysis": {
        "question": "Which practices face barriers without enabling instruments?",
        "structured_result": gaps,
        "answer": answer3,
    },
    "demo_4_free_query": {
        "question": "How do material passports support building deconstruction?",
        "answer": answer4,
    },
}

with open(os.path.join(OUTPUT_DIR, "demo_answers.json"), "w") as f:
    json.dump(demos, f, indent=2, default=str, ensure_ascii=False)

# Save graph store state (for reload without re-embedding)
import pickle

backend_state = {
    "nodes_df": nodes_df,
    "edges_df": edges_df,
    "triples_df": triples_df,
    "source_chunks": source_chunks,
    "kg_records": kg_records,
    "schema": schema,
}
with open(os.path.join(OUTPUT_DIR, "backend_state.pkl"), "wb") as f:
    pickle.dump(backend_state, f)

# Save summary statistics
backend_summary = {
    "graph_stats": gs.summary_stats(),
    "vector_store_count": collection.count(),
    "query_types": ["trace_governance_pathway", "compare_jurisdictions",
                     "identify_gaps", "explain_entity", "free_query"],
    "country": COUNTRY,
    "demo_count": len(demos),
}
with open(os.path.join(OUTPUT_DIR, "backend_summary.json"), "w") as f:
    json.dump(backend_summary, f, indent=2, default=str)

print(f"{'='*60}")
print(f"STAGE 5 — BACKEND ARTEFACTS SAVED")
print(f"{'='*60}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"  demo_answers.json       ({len(demos)} demo Q&A pairs)")
print(f"  backend_state.pkl       (full reload state)")
print(f"  backend_summary.json    (summary statistics)")
print(f"\nBackend is ready for presentation generation (Notebook 6).")


STAGE 5 — BACKEND ARTEFACTS SAVED
Output directory: /content/drive/MyDrive/ACTIVE/AU_deconstruction_domain/data_analysis/5_backend
  demo_answers.json       (4 demo Q&A pairs)
  backend_state.pkl       (full reload state)
  backend_summary.json    (summary statistics)

Backend is ready for presentation generation (Notebook 6).
