# Stage 04 — Agent 1 cluster cleanup (semantic enrichment, glass-box)

This notebook reads **Stage 03** `items_with_clusters.parquet` and runs **Agent 1** to produce **semantic cluster labels**.

Outputs (parquet-first):
- `exports/stage_04_agent1_cleanup/clusters_semantic.parquet`
- `exports/stage_04_agent1_cleanup/agent1_memory.parquet`
- `exports/stage_04_agent1_cleanup/items_after_agent1.parquet`

**Critical rule**: semantics must be *inspectable*:
- representative montages for clusters
- tables showing label/dataset mixture
- uniqueness stabilization (no duplicate `cluster_name`)


In [None]:
# --- Colab-first setup ---
import os, sys, time
from pathlib import Path

# Run-control flags for this notebook.
# FORCE_REBUILD: when True the Agent 1 cell ignores cached parquet outputs and
# also forwards force_rebuild=True to run_agent1_cluster_cleanup.
FORCE_REBUILD = False
# NOTE(review): FAST_MODE is not read by any cell visible in this notebook — confirm it is still needed.
FAST_MODE = True
# Printed in the setup summary; allowed values are listed on the right.
EDA_LEVEL = "core"  # "core" | "standard" | "deep"

# NOTE(review): SHOW_PLOTS / SAVE_PLOTS are not read by any cell visible here;
# presumably consumed by plotting helpers — verify.
SHOW_PLOTS = True
SAVE_PLOTS = True

# Root directory searched (recursively) for pipeline_config.yaml when HISTO_PROJECT_ROOT is not set.
DRIVE_SEARCH_BASE = "/content/drive/MyDrive"

def _is_colab() -> bool:
    """Return True when the `google.colab` module is already loaded in this interpreter."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

# Only when running inside Colab: mount Google Drive at /content/drive so the
# project-root search under DRIVE_SEARCH_BASE can see the user's files.
if _is_colab():
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")

def _resolve_project_root() -> Path:
    """Locate the project root directory.

    Resolution order:
      1. The HISTO_PROJECT_ROOT environment variable, if it names an existing path.
      2. Under DRIVE_SEARCH_BASE: any directory holding both pipeline_config.yaml
         and label_taxonomy.yaml; the one with the newest directory mtime wins.
      3. The current working directory or one of its ancestors (at most 10
         levels checked) that contains pipeline_config.yaml.

    Raises:
        FileNotFoundError: if none of the strategies finds a root.
    """
    env_root = os.environ.get("HISTO_PROJECT_ROOT")
    if env_root:
        env_path = Path(env_root)
        if env_path.exists():
            return env_path

    drive_base = Path(DRIVE_SEARCH_BASE)
    if drive_base.exists():
        drive_hits = [
            config_file.parent
            for config_file in drive_base.glob("**/pipeline_config.yaml")
            if (config_file.parent / "label_taxonomy.yaml").exists()
        ]
        if drive_hits:
            # Most recently modified candidate directory; first one wins on ties.
            return max(drive_hits, key=lambda d: d.stat().st_mtime)

    start = Path.cwd()
    # cwd plus up to nine ancestors == ten probes in total.
    for ancestor in [start, *start.parents][:10]:
        if (ancestor / "pipeline_config.yaml").exists():
            return ancestor

    raise FileNotFoundError("Could not resolve PROJECT_ROOT. Set HISTO_PROJECT_ROOT env var.")

# Resolve the project root and make the project package importable.
PROJECT_ROOT = _resolve_project_root()
sys.path.insert(0, str(PROJECT_ROOT))
print("PROJECT_ROOT:", PROJECT_ROOT)

# Install deps
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", "-r", str(PROJECT_ROOT / "requirements.txt")])

import yaml
# yaml.safe_load returns None for an empty document; fall back to {} so the
# .get() lookups below do not fail with AttributeError.
cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text()) or {}

# `paths:` / `project:` written with no value also load as None, hence the `or {}`.
EXPORTS_DIR = PROJECT_ROOT / str((cfg.get("paths") or {}).get("exports_dir", "exports"))
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

SAFE_MODE = bool((cfg.get("project") or {}).get("safe_mode", True))
SEED = int((cfg.get("project") or {}).get("seed", 1337))

print("SAFE_MODE:", SAFE_MODE, "| EDA_LEVEL:", EDA_LEVEL)


In [None]:
# --- Secrets (do NOT print tokens) ---
import os
import sys

# Agentic stages require OPENAI_API_KEY.
# In Colab: Runtime → Secrets → add OPENAI_API_KEY
# Colab secrets are not exported to os.environ on their own; they have to be
# read through google.colab.userdata. Copy the secret into the environment so
# the check below (and downstream code reading the env var) can see it.
if not os.environ.get("OPENAI_API_KEY") and "google.colab" in sys.modules:
    from google.colab import userdata  # type: ignore

    try:
        _colab_secret = userdata.get("OPENAI_API_KEY")
    except (userdata.SecretNotFoundError, userdata.NotebookAccessError):
        # Secret absent or notebook access not granted: fall through to the assert.
        _colab_secret = None
    if _colab_secret:
        os.environ["OPENAI_API_KEY"] = _colab_secret

assert os.environ.get("OPENAI_API_KEY"), "Missing OPENAI_API_KEY. Set it in Colab Secrets or env vars."
print("✅ OPENAI_API_KEY is set (value not printed).")


In [None]:
# --- Stage paths + registries ---
from pathlib import Path
import pandas as pd
import yaml

from histo_cartography.viz import ensure_dir, save_and_display, register_plot, display_image
from histo_cartography.artifact_registry import register_artifact, append_stage_manifest
from histo_cartography.critic import run_critic, write_critic_report, critic_result_table, critic_issues_table

# Upstream (Stage 03) inputs — both must exist before this stage can run.
upstream_dir = EXPORTS_DIR / "stage_03_clustering"
stage_in_items = upstream_dir / "items_with_clusters.parquet"
stage_in_centroids = upstream_dir / "cluster_centroids.parquet"
for upstream_file in (stage_in_items, stage_in_centroids):
    assert upstream_file.exists(), f"missing upstream parquet: {upstream_file}"

# Stage 04 output layout: plots/, qa/ and eda/ sub-directories under the stage folder.
stage_dir = EXPORTS_DIR / "stage_04_agent1_cleanup"
plots_dir, qa_dir, eda_dir = (
    ensure_dir(stage_dir / folder) for folder in ("plots", "qa", "eda")
)

# Parquet outputs written by this stage.
clusters_semantic_path = stage_dir / "clusters_semantic.parquet"
agent1_memory_path = stage_dir / "agent1_memory.parquet"
items_after_agent1_path = stage_dir / "items_after_agent1.parquet"

# Accumulates one record per registered plot; written out as the viz index at the end.
viz_records = []

print("stage_dir:", stage_dir)


In [None]:
# --- Load upstream data (Stage 03) ---
import pandas as pd
from IPython.display import display

# Read both Stage 03 tables, then show a small preview and their shapes.
items, centroids = (
    pd.read_parquet(parquet_file)
    for parquet_file in (stage_in_items, stage_in_centroids)
)

display(items.head(3))
for frame_name, frame in (("items", items), ("centroids", centroids)):
    print(frame_name + " shape:", frame.shape)


## PEEP — Preflight health + EDA

In [None]:
# PEEP (1/4) — Overview table
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Build the overview table (max_cols=45) and show its first 45 rows.
overview = df_overview_table(items, max_cols=45)
display(overview.head(45))


In [None]:
# PEEP (2/4) — Cluster size distribution (excluding -1)
import matplotlib.pyplot as plt

# Items per cluster; cluster id -1 is dropped before plotting.
vc = items["cluster_id"].astype(int).value_counts()
vc_no_noise = vc.loc[vc.index != -1]

# Between 5 and 30 bins, depending on how many clusters there are.
n_bins = min(30, max(5, len(vc_no_noise)))

fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(vc_no_noise.values, bins=n_bins, edgecolor="black")
ax.set_title("Stage 04 PEEP: cluster size distribution (excluding -1)")
ax.set_xlabel("cluster size")
ax.set_ylabel("count of clusters")
fig.tight_layout()

out_path = plots_dir / "peep_cluster_size_hist.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_04_agent1_cleanup",
    plot_id="peep_cluster_size_hist",
    title="PEEP cluster size distribution (excluding -1)",
    path=out_path,
    tags=["peep", "clustering"],
    is_core=True,
)


**Interpretation**: Agent 1 will name clusters; tiny clusters may be noisy and hard to name.

**Warning signs**: many tiny clusters → consider revisiting Stage 03 parameters.

In [None]:
# PEEP (3/4) — Montage of the largest cluster (glass-box sanity check)
from histo_cartography.image_viz import montage_by_cluster

# value_counts() orders by count (descending), so the first remaining index
# entry is the largest non-noise cluster.
if len(vc_no_noise):
    largest_cluster_id = int(vc_no_noise.index[0])
    out_path = plots_dir / ("peep_montage_largest_cluster_" + str(largest_cluster_id) + ".png")
    montage_by_cluster(items, cluster_id=largest_cluster_id, out_path=out_path, image_col="image_path", n=36, random_state=SEED)
    display_image(out_path)

    register_plot(viz_records, stage="stage_04_agent1_cleanup", plot_id="peep_montage_largest_cluster_" + str(largest_cluster_id), title="PEEP montage: largest cluster " + str(largest_cluster_id), path=out_path, tags=["peep","montage","cluster"], is_core=True)
else:
    # Without any non-noise cluster there is nothing to draw; asking for a
    # montage of cluster 0 would target a cluster that cannot exist.
    largest_cluster_id = None
    print("not enough clusters for montage")


**Interpretation**: A quick visual check that clusters correspond to meaningful visual motifs.

**Warning signs**: montage looks random/noisy/artifact-heavy.

In [None]:
# PEEP (4/4) — Critic gates (items_with_clusters)
from IPython.display import display

# Gate on the Stage 03 item table before spending any Agent 1 calls.
# The row threshold is 10 in SAFE_MODE and 100 otherwise.
peep_min_rows = 10 if SAFE_MODE else 100
peep_required_cols = ["item_id", "cluster_id", "image_path", "label", "source", "vector"]
peep_nonnull_cols = ["item_id", "cluster_id", "image_path"]

critic_in = run_critic(
    df=items,
    stage="stage_04_agent1_cleanup",
    gate="peep_items_with_clusters",
    required_cols=peep_required_cols,
    id_col="item_id",
    min_rows=peep_min_rows,
    key_nonnull_cols=peep_nonnull_cols,
    vector_col="vector",
)

# Persist the report, then show the summary and up to 50 issues inline.
peep_report_path = qa_dir / "critic_peep_items_with_clusters.json"
write_critic_report(critic_in, peep_report_path)
display(critic_result_table(critic_in))
peep_issues = critic_issues_table(critic_in)
display(peep_issues.head(50))


## Stage logic — Agent 1 semantic enrichment (idempotent)

In [None]:
# --- Build per-cluster evidence summaries + run Agent 1 ---
import pandas as pd
import yaml

from histo_cartography.exports import save_parquet
from histo_cartography.agentic import run_agent1_cluster_cleanup

# Start of the stage timer; runtime_sec is derived from it after Agent 1 finishes.
t0 = time.time()

# Optional label taxonomy for human-readable names in prompts
tax_path = PROJECT_ROOT / "label_taxonomy.yaml"
label_map = {}
if tax_path.exists():
    # safe_load returns None for an empty file; treat that as "no labels".
    tax = yaml.safe_load(tax_path.read_text()) or {}
    for code, spec in (tax.get("labels", {}) or {}).items():
        if isinstance(spec, dict):
            # Full form: `code: {name: ...}`; a missing name falls back to the code.
            display_name = spec.get("name", code)
        elif spec is None:
            # `code:` with no value — nothing better than the code itself.
            display_name = code
        else:
            # Shorthand form: `code: "Readable name"`.
            display_name = spec
        label_map[str(code)] = str(display_name)

def dominant_labels(df: pd.DataFrame, top_k: int = 5):
    """Return the `top_k` most frequent values of `df["label"]`.

    Labels are compared as strings. Each entry of the returned list is a dict
    with keys "label", "label_name" (looked up in the module-level `label_map`,
    falling back to the label itself), "count" and "frac" (count divided by
    the number of rows in `df`; the divisor is 1.0 for an empty frame).
    """
    n_rows = len(df)
    denominator = float(n_rows) if n_rows else 1.0
    top_counts = df["label"].astype(str).value_counts(dropna=False).head(top_k)
    return [
        {
            "label": str(lbl),
            "label_name": label_map.get(str(lbl), str(lbl)),
            "count": int(cnt),
            "frac": float(cnt) / denominator,
        }
        for lbl, cnt in top_counts.items()
    ]

# Build one-row-per-cluster evidence summaries used by Agent 1 (cheap, inspectable)
def _cluster_evidence_row(cluster_key, members):
    """Summarise one cluster: id, size, top-5 labels and their display names."""
    top_labels = dominant_labels(members, top_k=5)
    return {
        "cluster_id": int(cluster_key),
        "n_items": int(len(members)),
        "dominant_labels": top_labels,
        # keep prompts short
        "sample_texts": [entry["label_name"] for entry in top_labels],
    }


summ_rows = [
    _cluster_evidence_row(cluster_key, members)
    for cluster_key, members in items.groupby("cluster_id")
]

clusters_summary = pd.DataFrame(summ_rows)
clusters_summary = clusters_summary.sort_values("cluster_id").reset_index(drop=True)

# Agent 1 settings live under agentic.agent1 in pipeline_config.yaml. Read the
# section once; `or {}` covers keys that are present but empty (None).
agent1_cfg = (cfg.get("agentic") or {}).get("agent1") or {}

# Idempotency: reuse the three parquet outputs when all exist, unless a rebuild is forced.
outputs_cached = (
    clusters_semantic_path.exists()
    and agent1_memory_path.exists()
    and items_after_agent1_path.exists()
)

if outputs_cached and not FORCE_REBUILD:
    clusters_semantic = pd.read_parquet(clusters_semantic_path)
    agent1_memory = pd.read_parquet(agent1_memory_path)
    items_after_agent1 = pd.read_parquet(items_after_agent1_path)
    print("✅ Loaded existing agent1 outputs:", clusters_semantic.shape, agent1_memory.shape, items_after_agent1.shape)
else:
    clusters_semantic, agent1_memory = run_agent1_cluster_cleanup(
        clusters_summary=clusters_summary,
        out_clusters_path=clusters_semantic_path,
        out_memory_path=agent1_memory_path,
        model=str(agent1_cfg.get("model", "gpt-4o-mini")),
        temperature=float(agent1_cfg.get("temperature", 0.2)),
        max_clusters=int(agent1_cfg.get("max_clusters", 999999)),
        force_rebuild=FORCE_REBUILD,
    )

    # Join back to items
    # validate="many_to_one" makes pandas raise MergeError if clusters_semantic
    # repeats a cluster_id, instead of silently duplicating item rows.
    items_after_agent1 = items.merge(
        clusters_semantic[["cluster_id","cluster_name","cluster_description","cluster_keywords"]],
        on="cluster_id",
        how="left",
        validate="many_to_one",
    )
    save_parquet(items_after_agent1, items_after_agent1_path)

runtime_sec = time.time() - t0
print("runtime_sec:", round(runtime_sec, 2))


## CHECKPOINT — After Agent 1

In [None]:
# CHECKPOINT: critic on clusters_semantic (must have unique names)
from IPython.display import display

# Columns every semantic cluster row must carry.
required = ["cluster_id", "cluster_name", "cluster_description", "cluster_keywords"]

critic_clusters = run_critic(
    df=clusters_semantic,
    stage="stage_04_agent1_cleanup",
    gate="checkpoint_clusters_semantic",
    required_cols=required,
    id_col="cluster_id",
    min_rows=2,
    key_nonnull_cols=["cluster_id", "cluster_name"],
)

# Hard uniqueness check (must pass): count names that repeat an earlier name
# (compared as strings) and record a failure on the critic result if any exist.
cluster_names = clusters_semantic["cluster_name"].astype(str)
name_dups = int(cluster_names.duplicated().sum())
if name_dups > 0:
    critic_clusters.fails.append(f"cluster_name_not_unique: duplicate_count={name_dups}")
    critic_clusters.passed = False

checkpoint_report_path = qa_dir / "critic_checkpoint_clusters_semantic.json"
write_critic_report(critic_clusters, checkpoint_report_path)
display(critic_result_table(critic_clusters))
checkpoint_issues = critic_issues_table(critic_clusters)
display(checkpoint_issues.head(50))


In [None]:
# Register artifacts + stage manifest
schema_version = str(cfg.get("project", {}).get("schema_version", "0.1.0"))

stage_name = "stage_04_agent1_cleanup"

# Metadata shared by every registry entry and by the stage manifest.
shared_meta = dict(
    schema_version=schema_version,
    warnings_count=int(critic_clusters.warnings_count),
    fails_count=int(critic_clusters.fails_count),
    runtime_sec=float(runtime_sec),
)

# (artifact name, parquet path, input paths, dataframe, notes) — one tuple per output.
artifact_specs = [
    (
        "clusters_semantic",
        clusters_semantic_path,
        [stage_in_items],
        clusters_semantic,
        "Agent 1 semantic cluster table",
    ),
    (
        "agent1_memory",
        agent1_memory_path,
        [stage_in_items],
        agent1_memory,
        "Agent 1 memory (prompt/response history)",
    ),
    (
        "items_after_agent1",
        items_after_agent1_path,
        [stage_in_items, clusters_semantic_path],
        items_after_agent1,
        "items + semantic cluster columns",
    ),
]

for artifact_name, artifact_path, artifact_inputs, artifact_df, artifact_notes in artifact_specs:
    register_artifact(
        project_root=PROJECT_ROOT,
        stage=stage_name,
        artifact=artifact_name,
        path=artifact_path,
        inputs=artifact_inputs,
        df=artifact_df,
        notes=artifact_notes,
        **shared_meta,
    )

# One manifest line summarising the whole stage run.
append_stage_manifest(
    project_root=PROJECT_ROOT,
    stage=stage_name,
    inputs=[stage_in_items, stage_in_centroids],
    outputs=[clusters_semantic_path, agent1_memory_path, items_after_agent1_path],
    notes="stage 04 run summary",
    **shared_meta,
)


## POST — Semantic health + glass-box checks

In [None]:
# POST (1/4) — Cluster semantics table (length diagnostics)
from IPython.display import display

def _keywords_len(value) -> int:
    """Size of a `cluster_keywords` cell.

    Sized containers (list, tuple, array, ...) report their number of
    elements; strings and anything else report the length of their text form.
    """
    if isinstance(value, str):
        return len(value)
    if hasattr(value, "__len__"):
        return len(value)
    return len(str(value))


df = clusters_semantic.copy()
df["name_len"] = df["cluster_name"].astype(str).map(len)
df["desc_len"] = df["cluster_description"].astype(str).map(len)
# Map over the raw values: casting to str first would turn every keyword list
# into its repr, so the container branch could never be taken.
df["kw_len"] = df["cluster_keywords"].map(_keywords_len)
display(df[["cluster_id","cluster_name","name_len","desc_len","kw_len"]].head(50))


In [None]:
# POST (2/4) — Name uniqueness / collision report (should be all unique)
import matplotlib.pyplot as plt

# How often each cluster_name (as a string) occurs; anything above 1 is a collision.
vc = clusters_semantic["cluster_name"].astype(str).value_counts()
dups = vc.loc[vc > 1]

fig, ax = plt.subplots(figsize=(8, 3))
if dups.empty:
    # Single bar showing the total number of clusters.
    ax.bar(["unique"], [len(clusters_semantic)])
    ax.set_title("✅ cluster_name uniqueness check passed")
else:
    # One bar per colliding name, height = number of clusters sharing it.
    ax.bar(dups.index.astype(str), dups.values)
    plt.xticks(rotation=45, ha="right")
    ax.set_title("⚠️ Duplicate cluster_name collisions (should be empty)")

fig.tight_layout()

out_path = plots_dir / "name_collision_report.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_04_agent1_cleanup",
    plot_id="name_collision_report",
    title="Name collision report",
    path=out_path,
    tags=["post", "semantic", "uniqueness"],
    is_core=True,
)


**Interpretation**: Agent 1 must stabilize names so each cluster has a unique short label.

**Warning signs**: any duplicate name means stabilization failed; set `FORCE_REBUILD = True` in the setup cell and rerun the Agent 1 cell.

In [None]:
# POST (3/4) — "Junk semantics" detector (heuristic flags)
from IPython.display import display

def is_junk(name: str) -> bool:
    """Heuristically decide whether a cluster name carries no real semantics.

    A name is junk when, after trimming and lower-casing, it
      * is shorter than 3 characters (this covers empty and None),
      * is the text form of a missing value ("nan", "none", "null", "<na>"),
        which is what a missing name becomes after `astype(str)`, or
      * contains one of the generic terms in `junk_terms`.

    Non-string input is converted with `str()` instead of raising.
    """
    if name is None:
        name = ""
    elif not isinstance(name, str):
        name = str(name)
    n = name.strip().lower()
    if len(n) < 3:
        return True
    # Exact match only, so real names that merely contain "none" are not flagged.
    if n in ("nan", "none", "null", "<na>"):
        return True
    junk_terms = ("unknown", "other", "misc", "artifact", "noise", "junk", "background")
    return any(term in n for term in junk_terms)

# Two heuristic flags per cluster: a junk-looking name, or a description
# shorter than 20 characters once trimmed.
flags = clusters_semantic.assign(
    junk_name=clusters_semantic["cluster_name"].astype(str).map(is_junk),
    short_desc=clusters_semantic["cluster_description"].astype(str).map(
        lambda text: len(text.strip()) < 20
    ),
)

needs_review = flags["junk_name"] | flags["short_desc"]
flagged = flags.loc[needs_review].copy()
display(flagged[["cluster_id", "cluster_name", "junk_name", "short_desc"]].head(50))

# Persist the flagged rows next to the other QA artifacts.
flag_path = qa_dir / "junk_semantics_flags.parquet"
flagged.to_parquet(flag_path, index=False)


In [None]:
# POST (4/4) — Critic summary (clusters_semantic)
from IPython.display import display

# Re-show the checkpoint critic result: the summary first, then up to 50 issues.
critic_summary = critic_result_table(critic_clusters)
display(critic_summary)
critic_issues = critic_issues_table(critic_clusters)
display(critic_issues.head(50))


## Core semantic visual verification (one montage per cell)
Montages are the primary evidence backing semantic labels.

In [None]:
# Select a few clusters for flashcard review (core)
# Cluster sizes after the Agent 1 join, with cluster id -1 removed.
cluster_sizes = items_after_agent1["cluster_id"].astype(int).value_counts()
vc_sizes = cluster_sizes.loc[cluster_sizes.index != -1]

# Candidates: the two largest clusters, plus the first flagged cluster if any.
candidate_ids = [int(x) for x in vc_sizes.index[:2]]
if len(flagged):
    candidate_ids.append(int(flagged["cluster_id"].iloc[0]))

# Drop repeats while keeping first-seen order, then cap at three.
review_cluster_ids = []
for candidate in candidate_ids:
    if candidate not in review_cluster_ids:
        review_cluster_ids.append(candidate)
review_cluster_ids = review_cluster_ids[:3]
print("review_cluster_ids:", review_cluster_ids)


In [None]:
# Flashcard 1 — cluster semantics (if review_cluster_ids[0] exists)
from IPython.display import display, Markdown

# Render name / description / keywords of the first review cluster as markdown.
if len(review_cluster_ids) > 0:
    cid = review_cluster_ids[0]
    matches = clusters_semantic[clusters_semantic["cluster_id"].astype(int) == int(cid)]
    if matches.empty:
        # The id comes from the item table; it may have no semantic row
        # (presumably when the max_clusters cap skipped it), so report that
        # instead of failing on .iloc[0].
        print("cluster " + str(cid) + " has no row in clusters_semantic")
    else:
        row = matches.iloc[0]
        md = (
            f"### Cluster {cid}: {row['cluster_name']}\n\n"
            f"**Description**: {row['cluster_description']}\n\n"
            f"**Keywords**: {row['cluster_keywords']}\n"
        )
        display(Markdown(md))
else:
    print("not enough clusters for review")


In [None]:
# Montage 1 — Representative images for review_cluster_ids[0]
from histo_cartography.image_viz import montage_by_cluster

# Montage for the first review cluster: draw it, show it, register it.
if not review_cluster_ids:
    print("not enough clusters for montage")
else:
    cid = review_cluster_ids[0]
    montage_id = f"montage_agent1_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_after_agent1,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_04_agent1_cleanup",
        plot_id=montage_id,
        title=f"Agent1 cluster montage: {cid}",
        path=out_path,
        tags=["core", "montage", "agent1"],
        is_core=True,
    )


**Interpretation**: Does the montage visually match the semantic label? If not, either clustering is off (Stage 03) or Agent 1 needs better evidence.

**Warning signs**: montage contradicts label; re-run with better prompts/evidence or adjust clustering.

In [None]:
# Flashcard 2 — cluster semantics (if review_cluster_ids[1] exists)
from IPython.display import display, Markdown

# Render name / description / keywords of the second review cluster as markdown.
if len(review_cluster_ids) > 1:
    cid = review_cluster_ids[1]
    matches = clusters_semantic[clusters_semantic["cluster_id"].astype(int) == int(cid)]
    if matches.empty:
        # The id comes from the item table; it may have no semantic row
        # (presumably when the max_clusters cap skipped it), so report that
        # instead of failing on .iloc[0].
        print("cluster " + str(cid) + " has no row in clusters_semantic")
    else:
        row = matches.iloc[0]
        md = (
            f"### Cluster {cid}: {row['cluster_name']}\n\n"
            f"**Description**: {row['cluster_description']}\n\n"
            f"**Keywords**: {row['cluster_keywords']}\n"
        )
        display(Markdown(md))
else:
    print("not enough clusters for review")


In [None]:
# Montage 2 — Representative images for review_cluster_ids[1]
from histo_cartography.image_viz import montage_by_cluster

# Montage for the second review cluster: draw it, show it, register it.
if len(review_cluster_ids) < 2:
    print("not enough clusters for montage")
else:
    cid = review_cluster_ids[1]
    montage_id = f"montage_agent1_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_after_agent1,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_04_agent1_cleanup",
        plot_id=montage_id,
        title=f"Agent1 cluster montage: {cid}",
        path=out_path,
        tags=["core", "montage", "agent1"],
        is_core=True,
    )


**Interpretation**: Does the montage visually match the semantic label? If not, either clustering is off (Stage 03) or Agent 1 needs better evidence.

**Warning signs**: montage contradicts label; re-run with better prompts/evidence or adjust clustering.

In [None]:
# Flashcard 3 — cluster semantics (if review_cluster_ids[2] exists)
from IPython.display import display, Markdown

# Render name / description / keywords of the third review cluster as markdown.
if len(review_cluster_ids) > 2:
    cid = review_cluster_ids[2]
    matches = clusters_semantic[clusters_semantic["cluster_id"].astype(int) == int(cid)]
    if matches.empty:
        # The id comes from the item table; it may have no semantic row
        # (presumably when the max_clusters cap skipped it), so report that
        # instead of failing on .iloc[0].
        print("cluster " + str(cid) + " has no row in clusters_semantic")
    else:
        row = matches.iloc[0]
        md = (
            f"### Cluster {cid}: {row['cluster_name']}\n\n"
            f"**Description**: {row['cluster_description']}\n\n"
            f"**Keywords**: {row['cluster_keywords']}\n"
        )
        display(Markdown(md))
else:
    print("not enough clusters for review")


In [None]:
# Montage 3 — Representative images for review_cluster_ids[2]
from histo_cartography.image_viz import montage_by_cluster

# Montage for the third review cluster: draw it, show it, register it.
if len(review_cluster_ids) < 3:
    print("not enough clusters for montage")
else:
    cid = review_cluster_ids[2]
    montage_id = f"montage_agent1_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_after_agent1,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_04_agent1_cleanup",
        plot_id=montage_id,
        title=f"Agent1 cluster montage: {cid}",
        path=out_path,
        tags=["core", "montage", "agent1"],
        is_core=True,
    )


**Interpretation**: Does the montage visually match the semantic label? If not, either clustering is off (Stage 03) or Agent 1 needs better evidence.

**Warning signs**: montage contradicts label; re-run with better prompts/evidence or adjust clustering.

In [None]:
# Core plot — Dataset mix per cluster (top clusters)
import matplotlib.pyplot as plt

# Restrict to the ten largest clusters (vc_sizes is ordered by size).
top_clusters = vc_sizes.index[:10].tolist()
in_top_clusters = items_after_agent1["cluster_id"].astype(int).isin(top_clusters)
sub = items_after_agent1.loc[in_top_clusters].copy()

# Rows: cluster id, columns: source; each row is normalised to fractions.
ct = pd.crosstab(sub["cluster_id"].astype(int), sub["source"].astype(str))
row_totals = ct.sum(axis=1)
ctn = ct.div(row_totals, axis=0)

fig, ax = plt.subplots(figsize=(8, 5))
x_labels = ctn.index.astype(str)
bottom = None  # the first series sits on the axis; later ones stack on the running total
for col in ctn.columns:
    vals = ctn[col].values
    ax.bar(x_labels, vals, bottom=bottom, label=col)
    bottom = vals if bottom is None else bottom + vals

ax.set_title("Dataset mix per cluster (top 10 clusters)")
plt.xticks(rotation=45, ha="right")
ax.set_ylabel("fraction")
ax.legend(fontsize=7, bbox_to_anchor=(1.05, 1), loc="upper left")
fig.tight_layout()

out_path = plots_dir / "dataset_mix_top_clusters.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_04_agent1_cleanup",
    plot_id="dataset_mix_top_clusters",
    title="Dataset mix per cluster (top clusters)",
    path=out_path,
    tags=["core", "semantic", "dataset_mix"],
    is_core=True,
)


**Interpretation**: Mixed clusters across datasets are often more robust; single-dataset clusters may reflect domain artifacts.

**Warning signs**: many clusters dominated by one dataset.

In [None]:
# Core plot — Label mix per cluster (top clusters)
import matplotlib.pyplot as plt

# Rows: cluster id, columns: label; each row is normalised to fractions.
# `sub` is the top-cluster subset built by the dataset-mix cell.
ct = pd.crosstab(sub["cluster_id"].astype(int), sub["label"].astype(str))
ctn = ct.div(ct.sum(axis=1), axis=0)

# Plot at most 10 labels, chosen by total item count so the dominant labels
# are the ones shown (slicing the first 10 columns would pick them by sort
# order of the label text). The crosstab column order is kept for plotting.
label_totals = ct.sum(axis=0).sort_values(ascending=False)
top_labels = set(label_totals.head(10).index)
plot_cols = [col for col in ctn.columns if col in top_labels]

fig = plt.figure(figsize=(8, 5))
bottom = None  # the first series sits on the axis; later ones stack on the running total
for col in plot_cols:
    vals = ctn[col].values
    plt.bar(ctn.index.astype(str), vals, bottom=bottom, label=col)
    bottom = vals if bottom is None else bottom + vals

plt.title("Label mix per cluster (top 10 clusters)")
plt.xticks(rotation=45, ha="right")
plt.ylabel("fraction")
plt.legend(fontsize=7, bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()

out_path = plots_dir / "label_mix_top_clusters.png"
save_and_display(fig, out_path)
register_plot(viz_records, stage="stage_04_agent1_cleanup", plot_id="label_mix_top_clusters", title="Label mix per cluster (top clusters)", path=out_path, tags=["core","semantic","label_mix"], is_core=True)


**Interpretation**: Label mix is a diagnostic; mixed clusters can be valid, but large highly mixed clusters may be too broad.

**Warning signs**: no dominant signal across labels in many clusters.

In [None]:
# Semantic-vs-visual consistency spot check (nearest clusters by centroid similarity)
from IPython.display import display
import numpy as np

# Query cluster: the first review cluster, else the largest cluster, else none.
if len(review_cluster_ids):
    cid = int(review_cluster_ids[0])
elif len(vc_sizes):
    cid = int(vc_sizes.index[0])
else:
    cid = None

C = centroids.sort_values("cluster_id").copy()
ids = C["cluster_id"].astype(int).tolist()

if not ids:
    print("no centroids available; skipping nearest-cluster check")
else:
    if cid not in ids:
        # Keep the fallback to the first centroid, but say so: otherwise the
        # table would read as the neighbours of a cluster it does not describe.
        print("cluster", cid, "has no centroid; showing neighbours of cluster", ids[0], "instead")
        cid = ids[0]

    # L2-normalise the centroid vectors so the dot product is cosine similarity;
    # the small epsilon avoids division by zero for an all-zero vector.
    vecs = np.asarray(C["vector"].tolist(), dtype=np.float32)
    vecs = vecs / (np.linalg.norm(vecs, axis=1, keepdims=True) + 1e-12)

    i = ids.index(cid)
    sims = vecs @ vecs[i]
    # Ten most similar centroids; the query cluster itself is part of the ranking.
    order = np.argsort(-sims)[:10]
    nn = pd.DataFrame({"cluster_id":[ids[j] for j in order], "similarity":[float(sims[j]) for j in order]})
    nn = nn.merge(clusters_semantic[["cluster_id","cluster_name"]], on="cluster_id", how="left")
    display(nn)


In [None]:
# Write viz index (parquet + csv) + show preview
from IPython.display import display
from histo_cartography.viz import write_viz_index, viz_records_to_df

# Persist every registered plot record as parquet + csv, then preview the table.
viz_index_path = stage_dir / "viz_index.parquet"
viz_index_csv_path = stage_dir / "viz_index.csv"
write_viz_index(viz_records, out_parquet=viz_index_path, out_csv=viz_index_csv_path)

viz_df = viz_records_to_df(viz_records)
viz_preview = viz_df.head(150)
display(viz_preview)
print("✅ wrote viz_index:", viz_index_path)


## Next actions
- Proceed to Stage 05 for Agent 2 linking + relationship verification.
- If semantic labels look wrong: improve evidence (more representative samples) or revisit Stage 03 clustering.