# Stage 05 — Agent 2 cluster linking + relationship verification (glass-box)

This notebook reads **Stage 04** semantic clusters and produces **proposed relationships** between clusters (Agent 2), then runs a **verification layer** that computes explicit evidence signals.

Outputs (parquet-first):
- `exports/stage_05_agent2_linking/cluster_links.parquet`
- `exports/stage_05_agent2_linking/agent2_memory.parquet`
- `exports/stage_05_agent2_linking/link_evidence.parquet`
- `exports/stage_05_agent2_linking/link_flags.parquet`
- `exports/stage_05_agent2_linking/cluster_links_verified.parquet`

**Why verification?**
Agent-proposed relationships are useful, but each one must be grounded in explicit, inspectable evidence signals:
- centroid similarity
- dataset overlap
- label overlap
- sampled cross-item similarity
- montage pairs (visual evidence)

This stage is the “0.5 testing notebook” for the knowledge graph (KG) build in Stage 06.


In [None]:
# --- Colab-first setup ---
import os, sys, time
from pathlib import Path

# Run-level switches for this notebook.
# FORCE_REBUILD: when True, the Agent 2 cell skips the cached links/memory
# parquet files and passes force_rebuild=True to the linking helper.
FORCE_REBUILD = False
# FAST_MODE: selects the smaller cross-similarity sample size (30 instead of 60)
# in the verification cell.
FAST_MODE = True
EDA_LEVEL = "core"  # "core" | "standard" | "deep"

# NOTE(review): SHOW_PLOTS / SAVE_PLOTS are not read by any cell visible in this
# notebook — presumably consumed elsewhere; confirm or remove.
SHOW_PLOTS = True
SAVE_PLOTS = True

# Folder searched recursively for pipeline_config.yaml when resolving the project root.
DRIVE_SEARCH_BASE = "/content/drive/MyDrive"

def _is_colab() -> bool:
    """Return True when the ``google.colab`` module is already loaded."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

# Mount Google Drive only when running inside Colab; DRIVE_SEARCH_BASE points
# below this mount point, so the project-root search depends on it.
if _is_colab():
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")

def _resolve_project_root() -> Path:
    """Locate the project root directory.

    Lookup order:
      1. The ``HISTO_PROJECT_ROOT`` environment variable, if set and the path exists.
      2. Any folder under ``DRIVE_SEARCH_BASE`` that holds both
         ``pipeline_config.yaml`` and ``label_taxonomy.yaml``; when several
         match, the most recently modified folder is returned.
      3. The current working directory and its ancestors (ten levels in total),
         returning the first one that contains ``pipeline_config.yaml``.

    Raises:
        FileNotFoundError: if none of the lookups finds a project root.
    """
    env_root = os.environ.get("HISTO_PROJECT_ROOT")
    if env_root and Path(env_root).exists():
        return Path(env_root)

    drive_base = Path(DRIVE_SEARCH_BASE)
    if drive_base.exists():
        drive_hits = [
            cfg_file.parent
            for cfg_file in drive_base.glob("**/pipeline_config.yaml")
            if (cfg_file.parent / "label_taxonomy.yaml").exists()
        ]
        if drive_hits:
            # Newest folder wins; on ties the first one found is kept.
            return max(drive_hits, key=lambda folder: folder.stat().st_mtime)

    here = Path.cwd()
    # cwd plus up to nine ancestors.
    for folder in (here, *here.parents)[:10]:
        if (folder / "pipeline_config.yaml").exists():
            return folder
    raise FileNotFoundError("Could not resolve PROJECT_ROOT. Set HISTO_PROJECT_ROOT env var.")

# Resolve the project root and put it first on sys.path so that the
# project-local imports in later cells resolve against this checkout.
PROJECT_ROOT = _resolve_project_root()
sys.path.insert(0, str(PROJECT_ROOT))
print("PROJECT_ROOT:", PROJECT_ROOT)

# Install the project's pinned dependencies into the running interpreter
# (quiet mode). check_call raises CalledProcessError if pip exits non-zero.
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", "-r", str(PROJECT_ROOT / "requirements.txt")])

# yaml is imported only after the install step above has run.
import yaml
cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text())

# Exports directory: config key paths.exports_dir, defaulting to "exports",
# resolved relative to the project root and created if missing.
EXPORTS_DIR = PROJECT_ROOT / str(cfg.get("paths", {}).get("exports_dir", "exports"))
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

# Project-level settings with defaults when the keys are absent.
SAFE_MODE = bool(cfg.get("project", {}).get("safe_mode", True))
SEED = int(cfg.get("project", {}).get("seed", 1337))

print("SAFE_MODE:", SAFE_MODE, "| EDA_LEVEL:", EDA_LEVEL)


In [None]:
# --- Secrets (do NOT print tokens) ---
import os
# Fail fast with an explicit exception rather than `assert`: assertions are
# removed when Python runs with -O, and a whitespace-only value would otherwise
# pass the check and fail later at the first API call.
# The key value itself is never printed.
if not os.environ.get("OPENAI_API_KEY", "").strip():
    raise RuntimeError("Missing OPENAI_API_KEY. Set it in Colab Secrets or env vars.")
print("✅ OPENAI_API_KEY is set (value not printed).")


In [None]:
# --- Stage paths + registries ---
from pathlib import Path
import pandas as pd

from histo_cartography.viz import ensure_dir, save_and_display, register_plot, display_image
from histo_cartography.artifact_registry import register_artifact, append_stage_manifest
from histo_cartography.critic import run_critic, write_critic_report, critic_result_table, critic_issues_table

# Upstream artifacts this stage reads: Stage 04 semantic clusters and items,
# plus the Stage 03 cluster centroids.
stage_in_clusters = EXPORTS_DIR / "stage_04_agent1_cleanup" / "clusters_semantic.parquet"
stage_in_items = EXPORTS_DIR / "stage_04_agent1_cleanup" / "items_after_agent1.parquet"
stage_in_centroids = EXPORTS_DIR / "stage_03_clustering" / "cluster_centroids.parquet"

# Fail fast if an upstream artifact is missing. FileNotFoundError (the same
# exception the project-root lookup raises) is used instead of `assert`, which
# is removed when Python runs with -O.
for required_name, required_path in (
    ("clusters_semantic", stage_in_clusters),
    ("items_after_agent1", stage_in_items),
    ("cluster_centroids", stage_in_centroids),
):
    if not required_path.exists():
        raise FileNotFoundError(f"missing {required_name}: {required_path}")

# Output layout for this stage. ensure_dir is a project helper — presumably it
# creates the directory if needed and returns its path; confirm.
stage_dir = EXPORTS_DIR / "stage_05_agent2_linking"
plots_dir = ensure_dir(stage_dir / "plots")
qa_dir = ensure_dir(stage_dir / "qa")
eda_dir = ensure_dir(stage_dir / "eda")

# Raw Agent 2 outputs (proposed links and the agent's memory).
out_links_path = stage_dir / "cluster_links.parquet"
out_memory_path = stage_dir / "agent2_memory.parquet"

# Verification outputs
out_evidence_path = stage_dir / "link_evidence.parquet"
out_flags_path = stage_dir / "link_flags.parquet"
out_verified_links_path = stage_dir / "cluster_links_verified.parquet"

# Accumulates one record per registered plot; written out as the viz index
# in the final cell.
viz_records = []

print("stage_dir:", stage_dir)


In [None]:
# --- Load upstream data (Stage 04 + Stage 03) ---
import pandas as pd
from IPython.display import display

# Read the three upstream tables into memory.
clusters_semantic = pd.read_parquet(stage_in_clusters)
items_after_agent1 = pd.read_parquet(stage_in_items)
centroids = pd.read_parquet(stage_in_centroids)

# Preview the first five clusters, then report every table's shape.
display(clusters_semantic.iloc[:5])
for table_name, table in (
    ("clusters_semantic", clusters_semantic),
    ("items_after_agent1", items_after_agent1),
    ("centroids", centroids),
):
    print(f"{table_name}:", table.shape)


## PEEP — Preflight health + EDA

In [None]:
# PEEP (1/4) — clusters_semantic overview table
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Show at most the first 40 rows of the overview table built from
# clusters_semantic (max_cols=40 is forwarded to the helper).
display(df_overview_table(clusters_semantic, max_cols=40).head(40))


In [None]:
# PEEP (2/4) — Critic gates (clusters_semantic)
from IPython.display import display

# Columns the semantic-cluster table must expose; the first two must also be non-null.
peep_required_cols = ["cluster_id", "cluster_name", "cluster_description", "cluster_keywords"]
peep_report_path = qa_dir / "critic_peep_clusters_semantic.json"

critic_clusters = run_critic(
    df=clusters_semantic,
    stage="stage_05_agent2_linking",
    gate="peep_clusters_semantic",
    required_cols=peep_required_cols,
    id_col="cluster_id",
    min_rows=2,
    key_nonnull_cols=peep_required_cols[:2],
)

# Persist the critic report, then show the summary and up to 50 issues.
write_critic_report(critic_clusters, peep_report_path)
display(critic_result_table(critic_clusters))
display(critic_issues_table(critic_clusters).head(50))


In [None]:
# PEEP (3/4) — Build candidate pairs (centroid cosine similarity)
import numpy as np
from histo_cartography.agentic import candidate_pairs_from_centroids

# conservative defaults for test notebook:
# safe mode keeps fewer neighbours and demands a higher similarity floor.
if SAFE_MODE:
    K, MIN_SIM = 5, 0.60
else:
    K, MIN_SIM = 15, 0.45

pair_settings = {
    "top_k": int(K),
    "min_similarity": float(MIN_SIM),
    "random_state": SEED,
}
candidate_pairs = candidate_pairs_from_centroids(centroids=centroids, **pair_settings)

print("candidate_pairs:", candidate_pairs.shape)
candidate_pairs.head()


In [None]:
# PEEP (4/4) — Candidate similarity histogram
import matplotlib.pyplot as plt

# Histogram of centroid cosine similarity over all candidate pairs.
pair_similarity = candidate_pairs["similarity"].astype(float)

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(pair_similarity, bins=30, edgecolor="black")
ax.set_title("Candidate pair centroid similarity (hist)")
ax.set_xlabel("cosine similarity")
ax.set_ylabel("count")
fig.tight_layout()

out_path = plots_dir / "peep_candidate_similarity_hist.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="peep_candidate_similarity_hist",
    title="Candidate pair similarity histogram",
    path=out_path,
    tags=["peep", "candidates", "similarity"],
    is_core=True,
)


**Interpretation**: Candidate pairs define which relationships Agent 2 will consider.

**Warning signs**: All similarities near threshold (weak signal) or extremely high for many pairs (collapsed centroids).

## Stage logic — Agent 2 linking (idempotent)

In [None]:
# --- Run Agent 2 on candidate pairs ---
import pandas as pd

from histo_cartography.agentic import run_agent2_cluster_linking

t0 = time.time()

# Re-run the agent when a rebuild is forced or either cached output is missing;
# otherwise reuse the parquet files written by a previous run.
need_agent_run = FORCE_REBUILD or not (out_links_path.exists() and out_memory_path.exists())

if need_agent_run:
    agent2_cfg = cfg.get("agentic", {})
    cluster_links, agent2_memory = run_agent2_cluster_linking(
        candidate_pairs=candidate_pairs,
        clusters_semantic=clusters_semantic,
        out_links_path=out_links_path,
        out_memory_path=out_memory_path,
        model=str(agent2_cfg.get("agent2_model", "gpt-4o-mini")),
        temperature=float(agent2_cfg.get("agent2_temperature", 0.2)),
        force_rebuild=FORCE_REBUILD,
    )
else:
    cluster_links = pd.read_parquet(out_links_path)
    agent2_memory = pd.read_parquet(out_memory_path)
    print("✅ Loaded existing agent2 outputs:", cluster_links.shape, agent2_memory.shape)

# Wall-clock time of this cell (covers either the agent run or the cache load).
runtime_sec = time.time() - t0
print("runtime_sec:", round(runtime_sec, 2))


## CHECKPOINT — Verification layer (evidence + flags)

In [None]:
# Build explicit evidence + flags for each link (glass-box)
import pandas as pd

from histo_cartography.relationship_verification import build_link_evidence, write_link_verification_artifacts

# Sample size forwarded as cross_sim_sample_n: smaller when FAST_MODE is on.
if FAST_MODE:
    cross_sample_size = 30
else:
    cross_sample_size = 60

# Column names the verification helper should read from the input tables.
evidence_columns = {
    "cluster_col": "cluster_id",
    "dataset_col": "source",
    "label_col": "label",
    "vector_col_items": "vector",
    "vector_col_centroids": "vector",
}

evidence_df, flags_df, verified_links = build_link_evidence(
    clusters=clusters_semantic,
    links=cluster_links,
    centroids=centroids,
    items=items_after_agent1,
    cross_sim_sample_n=cross_sample_size,
    seed=SEED,
    **evidence_columns,
)

# Write the evidence, flags and verified links into the stage directory.
paths = write_link_verification_artifacts(
    evidence_df=evidence_df,
    flags_df=flags_df,
    verified_links_df=verified_links,
    out_dir=stage_dir,
)

print("✅ wrote verification artifacts:", paths)
print("verified_links:", verified_links.shape)


In [None]:
# CHECKPOINT: critic gates on verified_links (must include evidence columns)
from IPython.display import display

# Identity columns of a link (must be non-null) plus the evidence columns that
# the verification layer is expected to have added.
link_key_cols = ["src_cluster_id", "dst_cluster_id", "relationship"]
link_evidence_cols = ["confidence", "centroid_similarity", "needs_more_evidence"]
checkpoint_report_path = qa_dir / "critic_checkpoint_cluster_links_verified.json"

critic_links = run_critic(
    df=verified_links,
    stage="stage_05_agent2_linking",
    gate="checkpoint_cluster_links_verified",
    required_cols=link_key_cols + link_evidence_cols,
    id_col=None,
    min_rows=1,
    key_nonnull_cols=list(link_key_cols),
)

# Persist the critic report, then show the summary and up to 50 issues.
write_critic_report(critic_links, checkpoint_report_path)
display(critic_result_table(critic_links))
display(critic_issues_table(critic_links).head(50))


In [None]:
# Register artifacts + stage manifest
schema_version = str(cfg.get("project", {}).get("schema_version", "0.1.0"))

# Run statistics shared by every registry entry and by the stage manifest.
stage_name = "stage_05_agent2_linking"
run_stats = {
    "schema_version": schema_version,
    "warnings_count": int(critic_links.warnings_count),
    "fails_count": int(critic_links.fails_count),
    "runtime_sec": float(runtime_sec),
}

# One row per artifact: (artifact name, output path, input paths, dataframe, notes).
artifact_specs = [
    (
        "cluster_links",
        out_links_path,
        [stage_in_clusters, stage_in_centroids],
        cluster_links,
        "Agent 2 proposed links (raw)",
    ),
    (
        "cluster_links_verified",
        out_verified_links_path,
        [out_links_path, stage_in_centroids],
        verified_links,
        "Agent 2 links + explicit evidence + flags",
    ),
    (
        "link_evidence",
        out_evidence_path,
        [out_links_path],
        evidence_df,
        "per-link evidence signals",
    ),
    (
        "link_flags",
        out_flags_path,
        [out_links_path],
        flags_df,
        "per-link flags + needs_more_evidence",
    ),
    (
        "agent2_memory",
        out_memory_path,
        [out_links_path],
        agent2_memory,
        "Agent 2 memory (prompt/response history)",
    ),
]

for artifact_name, artifact_path, artifact_inputs, artifact_df, artifact_notes in artifact_specs:
    register_artifact(
        project_root=PROJECT_ROOT,
        stage=stage_name,
        artifact=artifact_name,
        path=artifact_path,
        inputs=artifact_inputs,
        df=artifact_df,
        notes=artifact_notes,
        **run_stats,
    )

# Stage-level summary covering all inputs and outputs of this notebook.
append_stage_manifest(
    project_root=PROJECT_ROOT,
    stage=stage_name,
    inputs=[stage_in_clusters, stage_in_items, stage_in_centroids],
    outputs=[out_links_path, out_verified_links_path, out_evidence_path, out_flags_path, out_memory_path],
    notes="stage 05 run summary",
    **run_stats,
)


## Core relationship diagnostics (one plot per cell)

In [None]:
# Core plot 1 — Relationship type counts
import matplotlib.pyplot as plt

# Count verified links per relationship type (most frequent first).
vc = verified_links["relationship"].astype(str).value_counts()
bar_labels = vc.index.astype(str)
bar_heights = vc.values

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(bar_labels, bar_heights)
plt.xticks(rotation=30, ha="right")
ax.set_title("Relationship type counts (verified_links)")
ax.set_ylabel("count")
fig.tight_layout()

out_path = plots_dir / "relationship_type_counts.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="relationship_type_counts",
    title="Relationship type counts",
    path=out_path,
    tags=["core", "relationships"],
    is_core=True,
)


**Interpretation**: Expect a mix, but `related_to` often dominates. Too many `same_as` may indicate duplicate clusters.

**Warning signs**: many `unrelated` edges (candidate threshold too low), or many `same_as` with low similarity.

In [None]:
# Core plot 2 — Confidence histogram
import matplotlib.pyplot as plt

# Distribution of the confidence values Agent 2 attached to each link.
link_confidence = verified_links["confidence"].astype(float)

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(link_confidence, bins=20, edgecolor="black")
ax.set_title("Agent 2 confidence histogram")
ax.set_xlabel("confidence")
ax.set_ylabel("count")
fig.tight_layout()

out_path = plots_dir / "confidence_hist.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="confidence_hist",
    title="Confidence histogram",
    path=out_path,
    tags=["core", "relationships", "confidence"],
    is_core=True,
)


**Interpretation**: Confidence should correlate with similarity/evidence.

**Warning signs**: very high confidence on weak-similarity pairs.

In [None]:
# Core plot 3 — Centroid similarity histogram (verified_links)
import matplotlib.pyplot as plt

# Centroid similarity of the verified links; rows without a value are dropped.
link_similarity = verified_links["centroid_similarity"].astype(float).dropna()

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(link_similarity, bins=30, edgecolor="black")
ax.set_title("Centroid similarity histogram (verified_links)")
ax.set_xlabel("cosine similarity")
ax.set_ylabel("count")
fig.tight_layout()

out_path = plots_dir / "verified_centroid_similarity_hist.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="verified_centroid_similarity_hist",
    title="Centroid similarity histogram (verified_links)",
    path=out_path,
    tags=["core", "relationships", "similarity"],
    is_core=True,
)


In [None]:
# Core plot 4 — Similarity vs confidence scatter
import matplotlib.pyplot as plt

# One point per verified link: x = centroid similarity, y = Agent 2 confidence.
x_similarity = verified_links["centroid_similarity"].astype(float)
y_confidence = verified_links["confidence"].astype(float)

fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(x_similarity, y_confidence, s=20, alpha=0.6)
ax.set_xlabel("centroid_similarity")
ax.set_ylabel("confidence")
ax.set_title("Similarity vs confidence (Agent 2)")
fig.tight_layout()

out_path = plots_dir / "scatter_similarity_vs_confidence.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="scatter_similarity_vs_confidence",
    title="Similarity vs confidence scatter",
    path=out_path,
    tags=["core", "relationships", "calibration"],
    is_core=True,
)


**Interpretation**: Expect a loose upward trend — Agent 2's confidence should generally rise with centroid similarity.

**Warning signs**: high-confidence points at low similarity; those should be reviewed with montages.

In [None]:
# Core plot 5 — Needs-more-evidence rate by relationship
import matplotlib.pyplot as plt
import pandas as pd

# Work on a copy; a missing flag is treated as "needs more evidence".
tab = verified_links.copy()
flag_as_bool = tab["needs_more_evidence"].fillna(True).astype(bool)
tab["needs_more_evidence"] = flag_as_bool

# Fraction of flagged links per relationship type, highest first.
flags_by_relationship = tab.groupby("relationship")["needs_more_evidence"]
rate = flags_by_relationship.mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(7, 4))
ax.bar(rate.index.astype(str), rate.values)
plt.xticks(rotation=30, ha="right")
ax.set_title("Needs-more-evidence rate by relationship")
ax.set_ylabel("fraction flagged")
fig.tight_layout()

out_path = plots_dir / "needs_more_evidence_rate.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_05_agent2_linking",
    plot_id="needs_more_evidence_rate",
    title="Needs-more-evidence rate by relationship",
    path=out_path,
    tags=["core", "relationships", "flags"],
    is_core=True,
)


**Interpretation**: Some relationships (e.g., `same_as`) should require strong evidence.

**Warning signs**: high flagged rate across all relationships indicates weak candidate generation or noisy centroids.

In [None]:
# Core diagnostic 6 — Show flagged edges (table)
from IPython.display import display

# Keep only links flagged as needing more evidence (a missing flag counts as flagged).
needs_review = verified_links["needs_more_evidence"].fillna(True).astype(bool)
flagged_edges = verified_links.loc[needs_review].copy()

# Columns shown in the preview of the first 50 flagged links.
preview_cols = [
    "src_cluster_id",
    "dst_cluster_id",
    "relationship",
    "confidence",
    "centroid_similarity",
    "dataset_overlap_jaccard",
    "label_overlap_jaccard",
    "flags",
]
display(flagged_edges[preview_cols].head(50))
print("flagged_edges:", flagged_edges.shape)


In [None]:
# Select a few edges for montage-pair review (core)
# prioritise: same_as/subtype_of + needs_more_evidence, highest confidence
cand = flagged_edges.copy()

# Lower rank = reviewed first; relationship types missing from the map get rank 9.
cand["rel_rank"] = cand["relationship"].astype(str).map({"same_as":0,"subtype_of":1,"overlaps_with":2,"related_to":3}).fillna(9)

# Sort on a float view of confidence, as the confidence plots do, so that values
# stored as strings/objects are ordered numerically rather than lexicographically.
cand["confidence_num"] = cand["confidence"].astype(float)
cand = cand.sort_values(["rel_rank", "confidence_num"], ascending=[True, False])

# Top three edges as plain dicts, consumed by the montage cells.
review_edges = cand.head(3)[["src_cluster_id","dst_cluster_id","relationship"]].to_dict(orient="records")
print("review_edges:", review_edges)


In [None]:
# Montage pair 1 — Visual evidence for review_edges[0]
from histo_cartography.image_viz import montage_pair_for_link

# Render and register a side-by-side montage for the first selected edge, if any.
if len(review_edges) < 1:
    print("not enough edges for montage review")
else:
    e = review_edges[0]
    a = int(e["src_cluster_id"])
    b = int(e["dst_cluster_id"])
    pair_tag = f"montage_pair_{a}_vs_{b}"
    out_path = plots_dir / f"{pair_tag}.png"
    montage_pair_for_link(
        items_after_agent1,
        cluster_a=a,
        cluster_b=b,
        out_path=out_path,
        image_col="image_path",
        cluster_col="cluster_id",
        n_each=18,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_05_agent2_linking",
        plot_id=pair_tag,
        title=f"Montage pair {a} vs {b}",
        path=out_path,
        tags=["core", "montage", "edges"],
        is_core=True,
        notes=str(e.get("relationship")),
    )


**Interpretation**: For `same_as`, montages should look nearly identical. For `subtype_of`, one should look like a specialized subset of the other.

**Warning signs**: visually dissimilar montages with high-confidence `same_as`.

In [None]:
# Montage pair 2 — Visual evidence for review_edges[1]
from histo_cartography.image_viz import montage_pair_for_link

# Render and register a side-by-side montage for the second selected edge, if any.
if len(review_edges) < 2:
    print("not enough edges for montage review")
else:
    e = review_edges[1]
    a = int(e["src_cluster_id"])
    b = int(e["dst_cluster_id"])
    pair_tag = f"montage_pair_{a}_vs_{b}"
    out_path = plots_dir / f"{pair_tag}.png"
    montage_pair_for_link(
        items_after_agent1,
        cluster_a=a,
        cluster_b=b,
        out_path=out_path,
        image_col="image_path",
        cluster_col="cluster_id",
        n_each=18,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_05_agent2_linking",
        plot_id=pair_tag,
        title=f"Montage pair {a} vs {b}",
        path=out_path,
        tags=["core", "montage", "edges"],
        is_core=True,
        notes=str(e.get("relationship")),
    )


**Interpretation**: For `same_as`, montages should look nearly identical. For `subtype_of`, one should look like a specialized subset of the other.

**Warning signs**: visually dissimilar montages with high-confidence `same_as`.

In [None]:
# Montage pair 3 — Visual evidence for review_edges[2]
from histo_cartography.image_viz import montage_pair_for_link

# Render and register a side-by-side montage for the third selected edge, if any.
if len(review_edges) < 3:
    print("not enough edges for montage review")
else:
    e = review_edges[2]
    a = int(e["src_cluster_id"])
    b = int(e["dst_cluster_id"])
    pair_tag = f"montage_pair_{a}_vs_{b}"
    out_path = plots_dir / f"{pair_tag}.png"
    montage_pair_for_link(
        items_after_agent1,
        cluster_a=a,
        cluster_b=b,
        out_path=out_path,
        image_col="image_path",
        cluster_col="cluster_id",
        n_each=18,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_05_agent2_linking",
        plot_id=pair_tag,
        title=f"Montage pair {a} vs {b}",
        path=out_path,
        tags=["core", "montage", "edges"],
        is_core=True,
        notes=str(e.get("relationship")),
    )


**Interpretation**: For `same_as`, montages should look nearly identical. For `subtype_of`, one should look like a specialized subset of the other.

**Warning signs**: visually dissimilar montages with high-confidence `same_as`.

In [None]:
# Human-in-the-loop review template (CSV)
import pandas as pd

# Evidence columns copied from verified_links, followed by blank columns for
# the human reviewer to fill in.
evidence_cols = [
    "src_cluster_id",
    "dst_cluster_id",
    "relationship",
    "confidence",
    "centroid_similarity",
    "dataset_overlap_jaccard",
    "label_overlap_jaccard",
    "needs_more_evidence",
    "flags",
]
human_cols = ["human_relationship", "human_approved", "human_notes"]
cols = evidence_cols + human_cols

template = verified_links.copy()
for blank_col in human_cols:
    template[blank_col] = ""
template = template[cols]

out_csv = stage_dir / "human_review_template.csv"
template.to_csv(out_csv, index=False)
print("✅ wrote:", out_csv)


In [None]:
# Write viz index (parquet + csv) + show preview
from IPython.display import display
from histo_cartography.viz import write_viz_index, viz_records_to_df

# Persist the plot registry collected in viz_records as parquet and CSV.
viz_index_path = stage_dir / "viz_index.parquet"
viz_index_csv = stage_dir / "viz_index.csv"
write_viz_index(viz_records, out_parquet=viz_index_path, out_csv=viz_index_csv)

# Preview up to 200 registered plots.
viz_df = viz_records_to_df(viz_records)
display(viz_df.head(200))
print("✅ wrote viz_index:", viz_index_path)


## Next actions
- Proceed to Stage 06 to build the knowledge graph.
- If many edges are flagged, adjust candidate generation (MIN_SIM, top_k) or revisit embedding/clustering.