# Stage 03 — Cluster embeddings / cartography (parquet-first, visual-first)

This notebook reads **Stage 02** `items_with_embeddings.parquet`, performs clustering (UMAP → HDBSCAN sweep with safe fallback), and writes:

- `exports/stage_03_clustering/items_with_clusters.parquet` (adds `cluster_id` and 2D coords `x`,`y`)
- `exports/stage_03_clustering/cluster_centroids.parquet`

**Quality focus**
- Cluster quality is extremely sensitive to embedding health (Stage 02).
- This stage adds **core cartography plots** + cluster purity/mix checks before agentic enrichment.


In [None]:
# --- Colab-first setup (safe defaults) ---
import os, sys, time
from pathlib import Path

# When False, the clustering cell reuses existing Stage 03 parquet outputs if both are present.
FORCE_REBUILD = False
# When True, the silhouette diagnostic samples at most 2000 rows (8000 otherwise).
FAST_MODE = True
EDA_LEVEL = "core"  # "core" | "standard" | "deep"

# NOTE(review): SHOW_PLOTS / SAVE_PLOTS are not read by any visible cell —
# presumably consumed by plotting helpers; confirm before relying on them.
SHOW_PLOTS = True
SAVE_PLOTS = True

# Base directory searched recursively for pipeline_config.yaml when resolving the project root.
DRIVE_SEARCH_BASE = "/content/drive/MyDrive"

def _is_colab() -> bool:
    """Return True when the ``google.colab`` module is already loaded in this interpreter."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

# Mount Google Drive only when running inside Colab; the import is kept inside
# the guard because the google.colab package is only importable there.
if _is_colab():
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")

def _resolve_project_root() -> Path:
    """Locate the project root directory.

    Resolution order:
      1. ``HISTO_PROJECT_ROOT`` environment variable, if it names an existing path.
      2. The most recently modified directory under ``DRIVE_SEARCH_BASE`` that
         holds both ``pipeline_config.yaml`` and ``label_taxonomy.yaml``.
      3. The current working directory or one of its ancestors (ten levels
         checked in total) containing ``pipeline_config.yaml``.

    Raises:
        FileNotFoundError: if none of the strategies finds a root.
    """
    override = os.environ.get("HISTO_PROJECT_ROOT")
    if override and Path(override).exists():
        return Path(override)

    drive_base = Path(DRIVE_SEARCH_BASE)
    if drive_base.exists():
        drive_roots = [
            cfg_file.parent
            for cfg_file in drive_base.glob("**/pipeline_config.yaml")
            if (cfg_file.parent / "label_taxonomy.yaml").exists()
        ]
        if drive_roots:
            # Newest directory wins; ties keep the first one found.
            return max(drive_roots, key=lambda d: d.stat().st_mtime)

    here = Path.cwd()
    # cwd plus up to nine ancestors, nearest first.
    for folder in [here, *here.parents][:10]:
        if (folder / "pipeline_config.yaml").exists():
            return folder
    raise FileNotFoundError("Could not resolve PROJECT_ROOT. Set HISTO_PROJECT_ROOT env var.")

PROJECT_ROOT = _resolve_project_root()
# Put the project root first on sys.path so the project package can be imported.
sys.path.insert(0, str(PROJECT_ROOT))
print("PROJECT_ROOT:", PROJECT_ROOT)

# Install deps
import subprocess

_PIP_INSTALL = [sys.executable, "-m", "pip", "-q", "install"]
# Required dependencies: a failure here is fatal (check_call raises).
subprocess.check_call([*_PIP_INSTALL, "-r", str(PROJECT_ROOT / "requirements.txt")])

# Optional clustering extras: best-effort, a failed install only prints a warning.
for _pkg in ("umap-learn", "hdbscan"):
    try:
        subprocess.check_call([*_PIP_INSTALL, _pkg])
    except (subprocess.SubprocessError, OSError) as e:
        print(f"⚠️ optional install failed ({_pkg}):", e)

import yaml

# safe_load returns None for an empty file; fall back to {} so the lookups
# below use their defaults instead of failing with AttributeError.
cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text(encoding="utf-8")) or {}

# `or {}` also covers sections that are present but empty (parsed as None).
_paths_cfg = cfg.get("paths") or {}
_project_cfg = cfg.get("project") or {}

EXPORTS_DIR = PROJECT_ROOT / str(_paths_cfg.get("exports_dir", "exports"))
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

SAFE_MODE = bool(_project_cfg.get("safe_mode", True))
SEED = int(_project_cfg.get("seed", 1337))

print("SAFE_MODE:", SAFE_MODE, "| EDA_LEVEL:", EDA_LEVEL)


In [None]:
# --- Stage paths + registries ---
from pathlib import Path
import pandas as pd

from histo_cartography.viz import ensure_dir, save_and_display, register_plot, display_image
from histo_cartography.artifact_registry import register_artifact, append_stage_manifest
from histo_cartography.critic import run_critic, write_critic_report, critic_result_table, critic_issues_table

# Upstream artifact written by Stage 02. Raise a real exception instead of
# using `assert`, which is removed when Python runs with -O.
stage_in = EXPORTS_DIR / "stage_02_embeddings" / "items_with_embeddings.parquet"
if not stage_in.exists():
    raise FileNotFoundError(f"missing upstream parquet: {stage_in}")

# Output layout for this stage: plots, QA reports and EDA each get a sub-folder.
stage_dir = EXPORTS_DIR / "stage_03_clustering"
plots_dir = ensure_dir(stage_dir / "plots")
qa_dir = ensure_dir(stage_dir / "qa")
eda_dir = ensure_dir(stage_dir / "eda")

items_with_clusters_path = stage_dir / "items_with_clusters.parquet"
centroids_path = stage_dir / "cluster_centroids.parquet"

# Filled by register_plot() calls throughout the notebook; written out in the last cell.
viz_records = []

print("stage_in:", stage_in)
print("stage_dir:", stage_dir)


In [None]:
# --- Load upstream parquet (Stage 02) ---
import pandas as pd
from IPython.display import display

# Read the Stage 02 table, then show a three-row preview and its dimensions.
items = pd.read_parquet(stage_in)
preview_rows = items.head(3)
display(preview_rows)
print(f"items shape: {items.shape}")


## PEEP — Preflight health + EDA

In [None]:
# PEEP (1/4) — Overview table
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Column-level overview of the upstream table (first 40 rows of the report).
peep_overview = df_overview_table(items, max_cols=40)
display(peep_overview.head(40))


In [None]:
# PEEP (2/4) — Missingness plot
from histo_cartography.eda_reports import plot_missingness

# Missingness chart for the upstream table, saved under plots/ and added to the viz index.
out_path = plots_dir / "peep_missingness_top25.png"
fig = plot_missingness(
    items,
    top_k=25,
    title="Stage 03 PEEP: items_with_embeddings missingness (top 25)",
)
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="peep_missingness_top25",
    title="PEEP missingness (top 25 columns)",
    path=out_path,
    tags=["peep", "missingness"],
    is_core=True,
)


**Interpretation**: Every item must have an embedding; a missing `vector` or `dim` will break clustering.

**Warning signs**: Any null vectors, inconsistent `dim`, or NaNs in embeddings.

In [None]:
# PEEP (3/4) — Label distribution (bar)
import matplotlib.pyplot as plt

# Item counts for the 30 most frequent labels (labels compared as strings).
vc = items["label"].astype(str).value_counts().head(30)
bar_names = vc.index.astype(str)
bar_heights = vc.values

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(bar_names, bar_heights)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_title("Stage 03 PEEP: label distribution (top 30)")
ax.set_ylabel("n items")
fig.tight_layout()

out_path = plots_dir / "peep_label_distribution_top30.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="peep_label_distribution_top30",
    title="PEEP label distribution (top 30)",
    path=out_path,
    tags=["peep", "distribution", "label"],
    is_core=True,
)


**Interpretation**: In mixed datasets, labels can be imbalanced; clustering should not just replicate label skew.

**Warning signs**: missing labels, or a single label dominating 95%+ of rows.

In [None]:
# PEEP (4/4) — Critic gates (items_with_embeddings)
from IPython.display import display

# Row-count floor is relaxed in SAFE_MODE.
peep_min_rows = 10 if SAFE_MODE else 100
peep_required_cols = ["item_id", "source", "label", "image_path", "vector", "dim"]
peep_nonnull_cols = ["item_id", "image_path", "vector"]

critic_in = run_critic(
    df=items,
    stage="stage_03_clustering",
    gate="peep_items_with_embeddings",
    required_cols=peep_required_cols,
    id_col="item_id",
    min_rows=peep_min_rows,
    key_nonnull_cols=peep_nonnull_cols,
    vector_col="vector",
    finite_cols=[],
)

# Persist the gate result, then show the summary and up to 50 issues.
peep_report_path = qa_dir / "critic_peep_items_with_embeddings.json"
write_critic_report(critic_in, peep_report_path)
peep_summary = critic_result_table(critic_in)
display(peep_summary)
peep_issues = critic_issues_table(critic_in)
display(peep_issues.head(50))


## Stage logic — Clustering (idempotent)
Default: UMAP → HDBSCAN parameter sweep. Falls back to KMeans when the sweep returns no cluster assignment. Existing outputs are reused unless `FORCE_REBUILD` is set.

In [None]:
# --- Main stage logic ---
from pathlib import Path
import json
import numpy as np
import pandas as pd

from histo_cartography.exports import save_parquet
from histo_cartography import clustering as clus

t0 = time.time()

cached_outputs_exist = items_with_clusters_path.exists() and centroids_path.exists()
if cached_outputs_exist and not FORCE_REBUILD:
    # Idempotent path: reuse the parquet outputs written by a previous run.
    items_with_clusters = pd.read_parquet(items_with_clusters_path)
    cluster_centroids = pd.read_parquet(centroids_path)
    print(f"✅ Loaded existing clustering outputs: items={items_with_clusters.shape}, centroids={cluster_centroids.shape}")
else:
    fused = items[["item_id", "vector"]].copy()
    labels = items["label"].astype(str)

    cart_cfg = cfg.get("cartography", {})
    sweep_cfg = cart_cfg.get("clustering_sweep", {})
    umap_cfg = cart_cfg.get("umap", {})
    default_n_neighbors = int(umap_cfg.get("n_neighbors", 15))
    default_min_dist = float(umap_cfg.get("min_dist", 0.1))

    def _safe_seq(x, fallback):
        """Return *x* when it is a non-empty list, otherwise *fallback*."""
        if isinstance(x, list) and len(x) > 0:
            return x
        return fallback

    if SAFE_MODE:
        # Single configuration: one UMAP setting and one HDBSCAN setting.
        umap_n_neighbors = [default_n_neighbors]
        umap_min_dist = [default_min_dist]
        hdbscan_mcs = [int(cart_cfg.get("hdbscan", {}).get("min_cluster_size", 8))]
        hdbscan_ms = [None]
    else:
        # Full grid; each axis falls back to a built-in list when the config has none.
        umap_n_neighbors = [int(v) for v in _safe_seq(sweep_cfg.get("umap_n_neighbors"), [10, 20, 50])]
        umap_min_dist = [float(v) for v in _safe_seq(sweep_cfg.get("umap_min_dist"), [0.0, 0.1, 0.3])]
        hdbscan_mcs = [int(v) for v in _safe_seq(sweep_cfg.get("hdbscan_min_cluster_size"), [5, 8, 10, 15])]
        hdbscan_ms = [None if v is None else int(v) for v in _safe_seq(sweep_cfg.get("hdbscan_min_samples"), [None, 5, 10])]

    out_sweep_dir = stage_dir / "hdbscan_sweep"
    results_df, best_run, best_clusters_df = clus.hdbscan_parameter_sweep(
        fused=fused,
        labels=labels,
        out_dir=out_sweep_dir,
        umap_n_neighbors=umap_n_neighbors,
        umap_min_dist=umap_min_dist,
        hdbscan_min_cluster_size=hdbscan_mcs,
        hdbscan_min_samples=hdbscan_ms,
        random_state=SEED,
        make_plots=not SAFE_MODE,  # sweep plots can be heavy; core plots are generated below
        max_noise_ratio=float(cart_cfg.get("clustering", {}).get("best_run", {}).get("max_noise_ratio", 0.40)),
    )

    if best_clusters_df is not None:
        clusters_df = best_clusters_df.copy()
        (stage_dir / "best_run.json").write_text(json.dumps(best_run, indent=2) if best_run is not None else "{}")
        print("✅ Using best HDBSCAN run:", best_run.get("run_name") if best_run else None)
    else:
        # The sweep returned no cluster assignment: fall back to KMeans with k from config.
        k = int(cart_cfg.get("kmeans", {}).get("k", 9))
        y_pred, meta = clus.run_kmeans(clus._to_matrix(fused), k=k, random_state=SEED)
        clusters_df = pd.DataFrame({"item_id": fused["item_id"].tolist(), "cluster_id": y_pred.tolist()})
        (stage_dir / "kmeans_meta.json").write_text(json.dumps(meta, indent=2))
        print("✅ Using KMeans fallback: k=", k)

    # 2D coords for cartography (UMAP preferred; PCA fallback inside compute_umap)
    X = clus._to_matrix(fused)
    coords2d, meta2d = clus.compute_umap(
        X,
        n_neighbors=default_n_neighbors,
        min_dist=default_min_dist,
        n_components=2,
        metric="cosine",
        random_state=SEED,
    )
    coords_df = pd.DataFrame({"item_id": fused["item_id"].tolist(), "x": coords2d[:, 0], "y": coords2d[:, 1]})
    (stage_dir / "cartography_2d_meta.json").write_text(json.dumps(meta2d, indent=2))

    # Merge
    items_with_clusters = items.merge(clusters_df, on="item_id", how="left").merge(coords_df, on="item_id", how="left")
    # Explicit check rather than `assert`, which is removed under -O; the
    # message keeps the original wording and adds the number of affected rows.
    n_unassigned = int(items_with_clusters["cluster_id"].isna().sum())
    if n_unassigned:
        raise ValueError(f"missing cluster_id assignments ({n_unassigned} rows)")

    # Centroids: mean vector per cluster_id. A noise cluster (-1), if present,
    # gets a centroid row too; the plots below exclude it from summaries.
    rows = []
    for cid, g in items_with_clusters.groupby("cluster_id"):
        cid = int(cid)
        vecs = np.asarray(g["vector"].tolist(), dtype=np.float32)
        centroid = vecs.mean(axis=0).astype(np.float32)
        rows.append({"cluster_id": cid, "n_items": int(len(g)), "vector": centroid.tolist()})
    cluster_centroids = pd.DataFrame(rows).sort_values("cluster_id").reset_index(drop=True)

    save_parquet(items_with_clusters, items_with_clusters_path)
    save_parquet(cluster_centroids, centroids_path)

runtime_sec = time.time() - t0
print("runtime_sec:", round(runtime_sec, 2))


## CHECKPOINT — After clustering + cartography

In [None]:
# CHECKPOINT: critic on items_with_clusters (cluster_id + coords must be finite)
from IPython.display import display

# Row-count floor is relaxed in SAFE_MODE.
checkpoint_min_rows = 10 if SAFE_MODE else 100
checkpoint_required_cols = ["item_id", "cluster_id", "x", "y", "vector", "image_path", "label", "source"]
checkpoint_nonnull_cols = ["item_id", "cluster_id", "x", "y"]

critic_clusters = run_critic(
    df=items_with_clusters,
    stage="stage_03_clustering",
    gate="checkpoint_items_with_clusters",
    required_cols=checkpoint_required_cols,
    id_col="item_id",
    min_rows=checkpoint_min_rows,
    key_nonnull_cols=checkpoint_nonnull_cols,
    vector_col="vector",
    finite_cols=["x", "y"],
)

# Persist the gate result, then show the summary and up to 50 issues.
checkpoint_report_path = qa_dir / "critic_checkpoint_items_with_clusters.json"
write_critic_report(critic_clusters, checkpoint_report_path)
checkpoint_summary = critic_result_table(critic_clusters)
display(checkpoint_summary)
checkpoint_issues = critic_issues_table(critic_clusters)
display(checkpoint_issues.head(50))


In [None]:
# Register artifacts + stage manifest
schema_version = str(cfg.get("project", {}).get("schema_version", "0.1.0"))

# Fields that are identical for both artifact records and the stage manifest.
run_fields = {
    "project_root": PROJECT_ROOT,
    "stage": "stage_03_clustering",
    "schema_version": schema_version,
    "warnings_count": int(critic_clusters.warnings_count),
    "fails_count": int(critic_clusters.fails_count),
    "runtime_sec": float(runtime_sec),
}

# One registry entry per parquet output of this stage.
artifact_specs = (
    ("items_with_clusters", items_with_clusters_path, items_with_clusters, "items + cluster assignments + 2d coords"),
    ("cluster_centroids", centroids_path, cluster_centroids, "cluster centroids (mean vector)"),
)
for artifact_name, artifact_path, artifact_df, artifact_notes in artifact_specs:
    register_artifact(
        artifact=artifact_name,
        path=artifact_path,
        inputs=[stage_in],
        df=artifact_df,
        notes=artifact_notes,
        **run_fields,
    )

append_stage_manifest(
    inputs=[stage_in],
    outputs=[items_with_clusters_path, centroids_path],
    notes="stage 03 run summary",
    **run_fields,
)


## POST — Postflight health + EDA

In [None]:
# POST (1/3) — Overview table (items_with_clusters)
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Column-level overview of the clustered table (first 45 rows of the report).
post_overview = df_overview_table(items_with_clusters, max_cols=45)
display(post_overview.head(45))


In [None]:
# POST (2/3) — Missingness plot (items_with_clusters)
from histo_cartography.eda_reports import plot_missingness

# Missingness chart for the clustered table, saved under plots/ and added to the viz index.
out_path = plots_dir / "post_missingness_top25.png"
fig = plot_missingness(
    items_with_clusters,
    top_k=25,
    title="Stage 03 POST: items_with_clusters missingness (top 25)",
)
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="post_missingness_top25",
    title="POST missingness (top 25 columns)",
    path=out_path,
    tags=["post", "missingness"],
    is_core=True,
)


**Interpretation**: `cluster_id`, `x`, `y` should be complete for all items.

**Warning signs**: missing coords indicate dimensionality reduction failure.

In [None]:
# POST (3/3) — Critic summary
from IPython.display import display

# Re-display the checkpoint critic result: summary first, then up to 50 issues.
post_summary = critic_result_table(critic_clusters)
display(post_summary)
post_issues = critic_issues_table(critic_clusters)
display(post_issues.head(50))


## Core clustering/cartography diagnostics (one plot per cell)

In [None]:
import json
# Core plot 1 — Cluster size distribution (excluding -1)
import numpy as np
import matplotlib.pyplot as plt

# Items per cluster (largest first); vc_no_noise drops the noise id -1 and is
# reused by later cells.
vc = items_with_clusters["cluster_id"].astype(int).value_counts()
vc_no_noise = vc[vc.index != -1]

n_bins = min(30, max(5, len(vc_no_noise)))
fig, ax = plt.subplots(figsize=(7, 4))
ax.hist(vc_no_noise.values, bins=n_bins, edgecolor="black")
ax.set_title("Cluster size distribution (excluding -1)")
ax.set_xlabel("cluster size")
ax.set_ylabel("count of clusters")
fig.tight_layout()

out_path = plots_dir / "cluster_size_hist_excluding_noise.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="cluster_size_hist_excluding_noise",
    title="Cluster size distribution (excluding -1)",
    path=out_path,
    tags=["core", "clustering"],
    is_core=True,
)

# gini coefficient (imbalance diagnostic) over ascending cluster sizes;
# None when there is no non-noise cluster.
sizes_sorted = np.sort(vc_no_noise.values.astype(float))
n_sizes = len(sizes_sorted)
if n_sizes > 0:
    rank_weights = 2 * np.arange(1, n_sizes + 1) - n_sizes - 1
    gini = float(rank_weights.dot(sizes_sorted) / (n_sizes * sizes_sorted.sum() + 1e-12))
else:
    gini = None
gini_path = qa_dir / "cluster_size_gini.json"
gini_path.write_text(json.dumps({"gini": gini}, indent=2))


**Interpretation**: Healthy clustering has a mix of sizes; extreme imbalance can indicate overly aggressive clustering.

**Warning signs**: Many tiny clusters or one giant cluster; very high Gini.

In [None]:
# Core plot 2 — Top 30 clusters by size (excluding -1)
import matplotlib.pyplot as plt

# Bar chart of the 30 largest non-noise clusters.
top = vc_no_noise.head(30)
cluster_names = top.index.astype(str)

fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(cluster_names, top.values)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.set_title("Top 30 clusters by size (excluding -1)")
ax.set_ylabel("n items")
fig.tight_layout()

out_path = plots_dir / "cluster_sizes_top30.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="cluster_sizes_top30",
    title="Top 30 clusters by size",
    path=out_path,
    tags=["core", "clustering"],
    is_core=True,
)


**Interpretation**: Large clusters often dominate semantics; consider reviewing them first.

**Warning signs**: One cluster dwarfs all others; may need different HDBSCAN settings.

In [None]:
# Core plot 3 — 2D map colored by tissue label
import matplotlib.pyplot as plt

# One scatter layer per label so that each label gets its own legend entry.
df = items_with_clusters.copy()
df["label_str"] = df["label"].astype(str)

fig, ax = plt.subplots(figsize=(7, 6))
for label_name, label_rows in df.groupby("label_str"):
    ax.scatter(label_rows["x"], label_rows["y"], s=8, alpha=0.6, label=str(label_name))
ax.legend(markerscale=2, bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=7)
ax.set_title("2D map colored by tissue label")
fig.tight_layout()

out_path = plots_dir / "map2d_by_label.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="map2d_by_label",
    title="2D map colored by tissue label",
    path=out_path,
    tags=["core", "cartography", "label"],
    is_core=True,
)


**Interpretation**: Labels should show local coherence, not necessarily perfect separation.

**Warning signs**: Completely mixed labels everywhere (no structure) or perfect separation (possible leakage).

In [None]:
# Core plot 4 — 2D map colored by cluster_id (proxy for “named clusters”)
import matplotlib.pyplot as plt

# Up to this many clusters are drawn one layer each with a legend; above it
# the legend is dropped but the points are still drawn. Previously the loop
# broke on its first iteration in that case and the saved map was empty.
MAX_LEGEND_CLUSTERS = 30

df = items_with_clusters.copy()
cluster_int = df["cluster_id"].astype(int)
n_clusters = cluster_int.nunique()

fig, ax = plt.subplots(figsize=(7, 6))
if n_clusters <= MAX_LEGEND_CLUSTERS:
    # Grouping on the integer id orders the legend numerically (-1, 0, 1, 2, ...).
    for cid, g in df.groupby(cluster_int):
        ax.scatter(g["x"], g["y"], s=8, alpha=0.6, label=str(cid))
    ax.legend(markerscale=2, bbox_to_anchor=(1.05, 1), loc="upper left", fontsize=7)
else:
    # Too many clusters for a readable legend: noise (-1) in light gray,
    # every other cluster coloured by cycling through the 20 "tab20" colours.
    is_noise = (cluster_int == -1).to_numpy()
    ax.scatter(df["x"][is_noise], df["y"][is_noise], s=8, alpha=0.3, color="lightgray")
    ax.scatter(
        df["x"][~is_noise],
        df["y"][~is_noise],
        c=cluster_int[~is_noise].to_numpy() % 20,
        cmap="tab20",
        vmin=0,
        vmax=19,
        s=8,
        alpha=0.6,
    )
ax.set_title("2D map colored by cluster_id")
fig.tight_layout()

out_path = plots_dir / "map2d_by_cluster.png"
save_and_display(fig, out_path)
register_plot(viz_records, stage="stage_03_clustering", plot_id="map2d_by_cluster", title="2D map colored by cluster_id", path=out_path, tags=["core","cartography","cluster"], is_core=True)


**Interpretation**: Clusters should correspond to coherent regions in the map.

**Warning signs**: Clusters are spatially scattered everywhere (unstable clustering).

In [None]:
# Core plot 5 — Cluster × Label composition heatmap (row-normalized, top clusters)
import matplotlib.pyplot as plt
import numpy as np

# Restrict to the 15 largest non-noise clusters so the heatmap stays readable.
top_clusters = vc_no_noise.head(15).index.tolist()
in_top_clusters = items_with_clusters["cluster_id"].astype(int).isin(top_clusters)
sub = items_with_clusters[in_top_clusters].copy()

# Counts per (cluster, label), then each row divided by its total.
ct = pd.crosstab(sub["cluster_id"].astype(int), sub["label"].astype(str))
row_totals = ct.sum(axis=1)
ctn = ct.div(row_totals, axis=0)

fig_width = max(8, 0.6 * len(ctn.columns))
fig_height = max(4, 0.3 * len(ctn.index))
fig, ax = plt.subplots(figsize=(fig_width, fig_height))
heat = ax.imshow(ctn.values, aspect="auto")
fig.colorbar(heat, ax=ax, label="fraction within cluster")
ax.set_xticks(range(len(ctn.columns)))
ax.set_xticklabels(ctn.columns.tolist(), rotation=45, ha="right")
ax.set_yticks(range(len(ctn.index)))
ax.set_yticklabels([str(cluster) for cluster in ctn.index.tolist()])
ax.set_title("Cluster × Label composition (row-normalized, top clusters)")
fig.tight_layout()

out_path = plots_dir / "cluster_label_heatmap_top.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_03_clustering",
    plot_id="cluster_label_heatmap_top",
    title="Cluster × Label composition heatmap (top clusters)",
    path=out_path,
    tags=["core", "clustering", "purity"],
    is_core=True,
)


**Interpretation**: Pure clusters (one dominant label) can be easier to name; mixed clusters may represent transitions or confounds.

**Warning signs**: Large clusters that are highly mixed may need higher embedding quality or different clustering resolution.

In [None]:
# Core plot 6 — Cluster label purity distribution (excluding -1, all remaining clusters)
import matplotlib.pyplot as plt
import numpy as np

# Purity of a cluster = share of its items carrying its most frequent label.
# Computed for every non-noise cluster. Local names are used so the
# notebook-level `vc` (cluster sizes from core plot 1) is not overwritten.
cluster_int = items_with_clusters["cluster_id"].astype(int)
label_str = items_with_clusters["label"].astype(str)
not_noise = cluster_int != -1

purity = [
    float(cluster_labels.value_counts().iloc[0] / len(cluster_labels))
    for _, cluster_labels in label_str[not_noise].groupby(cluster_int[not_noise])
]

fig = plt.figure(figsize=(7, 4))
plt.hist(purity, bins=20, edgecolor="black")
plt.title("Cluster label purity distribution (excluding -1)")
plt.xlabel("purity (majority label fraction)")
plt.ylabel("count of clusters")
plt.tight_layout()

out_path = plots_dir / "cluster_label_purity_hist.png"
save_and_display(fig, out_path)
register_plot(viz_records, stage="stage_03_clustering", plot_id="cluster_label_purity_hist", title="Cluster label purity distribution (excluding -1)", path=out_path, tags=["core","clustering","purity"], is_core=True)


**Interpretation**: Purity is a diagnostic, not a target. Mixed clusters can be valid.

**Warning signs**: Very low purity across most clusters may indicate embeddings not capturing tissue signal.

In [None]:
# Core diagnostic 7 — Silhouette score distribution on a sample (optional)
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import silhouette_samples

# sample for speed
n_sample = min(len(items_with_clusters), 2000 if FAST_MODE else 8000)
idx = np.random.default_rng(SEED).choice(len(items_with_clusters), size=n_sample, replace=False)
sample = items_with_clusters.iloc[idx]
y_sampled = sample["cluster_id"].astype(int).to_numpy()

# Score only real clusters with at least two sampled members: noise (-1) is
# not a cluster, and a label seen once in the sample has no within-cluster
# distance. Dropping those rows keeps the diagnostic available; previously a
# single singleton label in the sample skipped the whole plot.
label_ids, label_counts = np.unique(y_sampled, return_counts=True)
usable_labels = label_ids[(label_ids != -1) & (label_counts >= 2)]
keep = np.isin(y_sampled, usable_labels)

X = np.asarray(sample["vector"][keep].tolist(), dtype=np.float32)
y = y_sampled[keep]

fig = plt.figure(figsize=(7, 4))
# Every kept label has >= 2 members, so two or more labels satisfies
# scikit-learn's requirement 2 <= n_labels <= n_samples - 1.
if len(usable_labels) >= 2:
    sil = silhouette_samples(X, y, metric="cosine")
    plt.hist(sil, bins=30, edgecolor="black")
    plt.title("Silhouette distribution (sampled, cosine, excluding -1)")
    plt.xlabel("silhouette")
    plt.ylabel("count")
    plt.tight_layout()
else:
    plt.title("Silhouette distribution (not enough clusters / singletons)")

out_path = plots_dir / "silhouette_hist_sampled.png"
save_and_display(fig, out_path)
register_plot(viz_records, stage="stage_03_clustering", plot_id="silhouette_hist_sampled", title="Silhouette distribution (sampled)", path=out_path, tags=["core","clustering","silhouette"], is_core=True)


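**Interpretation**: Silhouette values near +1 indicate well-separated points, values near 0 indicate points on a cluster boundary, and negative values indicate points that sit closer to a neighbouring cluster (overlap).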

**Warning signs**: Most values near 0 or negative might mean clusters are not well-separated.

In [None]:
# Choose clusters for representative montages (core)
# We prioritize largest non-noise clusters.
# vc_no_noise is ordered largest-first: head -> biggest clusters, tail -> smallest.
largest_ids = vc_no_noise.head(3).index.tolist()
smallest_ids = vc_no_noise.tail(3).index.tolist()  # smallest clusters (diagnostic)
core_cluster_ids = list(map(int, largest_ids))
flagged_cluster_ids = list(map(int, smallest_ids))
print("core_cluster_ids:", core_cluster_ids)
print("flagged_cluster_ids:", flagged_cluster_ids)


In [None]:
# Core plot montage — Representative images for core_cluster_ids[0] (if available)
from histo_cartography.image_viz import montage_by_cluster

# Montage for the first entry of core_cluster_ids; skipped when the list is empty.
if not core_cluster_ids:
    print("not enough clusters for montage")
else:
    cid = core_cluster_ids[0]
    montage_id = f"montage_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_with_clusters,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_03_clustering",
        plot_id=montage_id,
        title=f"Representative montage: cluster {cid}",
        path=out_path,
        tags=["core", "montage", "cluster"],
        is_core=True,
    )


**Interpretation**: Montages are the most important “glass box” evidence for cluster coherence.

**Warning signs**: Mixed morphologies within a single cluster; obvious artifacts; blank/failed image loads.

In [None]:
# Core plot montage — Representative images for core_cluster_ids[1] (if available)
from histo_cartography.image_viz import montage_by_cluster

# Montage for the second entry of core_cluster_ids; skipped when there are fewer than two.
if len(core_cluster_ids) < 2:
    print("not enough clusters for montage")
else:
    cid = core_cluster_ids[1]
    montage_id = f"montage_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_with_clusters,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_03_clustering",
        plot_id=montage_id,
        title=f"Representative montage: cluster {cid}",
        path=out_path,
        tags=["core", "montage", "cluster"],
        is_core=True,
    )


**Interpretation**: Montages are the most important “glass box” evidence for cluster coherence.

**Warning signs**: Mixed morphologies within a single cluster; obvious artifacts; blank/failed image loads.

In [None]:
# Core plot montage — Representative images for core_cluster_ids[2] (if available)
from histo_cartography.image_viz import montage_by_cluster

# Montage for the third entry of core_cluster_ids; skipped when there are fewer than three.
if len(core_cluster_ids) < 3:
    print("not enough clusters for montage")
else:
    cid = core_cluster_ids[2]
    montage_id = f"montage_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_with_clusters,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_03_clustering",
        plot_id=montage_id,
        title=f"Representative montage: cluster {cid}",
        path=out_path,
        tags=["core", "montage", "cluster"],
        is_core=True,
    )


**Interpretation**: Montages are the most important “glass box” evidence for cluster coherence.

**Warning signs**: Mixed morphologies within a single cluster; obvious artifacts; blank/failed image loads.

In [None]:
# Diagnostic montage — Representative images for flagged_cluster_ids[0] (small/tail clusters)
from histo_cartography.image_viz import montage_by_cluster

# Montage for the first entry of flagged_cluster_ids; skipped when the list is empty.
if not flagged_cluster_ids:
    print("not enough clusters for montage")
else:
    cid = flagged_cluster_ids[0]
    montage_id = f"montage_flagged_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_with_clusters,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_03_clustering",
        plot_id=montage_id,
        title=f"Flagged montage: cluster {cid}",
        path=out_path,
        tags=["diagnostic", "montage", "cluster", "flagged"],
        is_core=True,
    )


**Interpretation**: Tail clusters are often noise/outliers; review before agentic naming.

**Warning signs**: Many tiny clusters → consider increasing min_cluster_size or improving embeddings.

In [None]:
# Diagnostic montage — Representative images for flagged_cluster_ids[1] (small/tail clusters)
from histo_cartography.image_viz import montage_by_cluster

# Montage for the second entry of flagged_cluster_ids; skipped when there are fewer than two.
if len(flagged_cluster_ids) < 2:
    print("not enough clusters for montage")
else:
    cid = flagged_cluster_ids[1]
    montage_id = f"montage_flagged_cluster_{cid}"
    out_path = plots_dir / f"{montage_id}.png"
    montage_by_cluster(
        items_with_clusters,
        cluster_id=cid,
        out_path=out_path,
        image_col="image_path",
        n=36,
        random_state=SEED,
    )
    display_image(out_path)
    register_plot(
        viz_records,
        stage="stage_03_clustering",
        plot_id=montage_id,
        title=f"Flagged montage: cluster {cid}",
        path=out_path,
        tags=["diagnostic", "montage", "cluster", "flagged"],
        is_core=True,
    )


**Interpretation**: Tail clusters are often noise/outliers; review before agentic naming.

**Warning signs**: Many tiny clusters → consider increasing min_cluster_size or improving embeddings.

In [None]:
# Write viz index (parquet + csv) + show preview
from IPython.display import display
from histo_cartography.viz import write_viz_index, viz_records_to_df

# Persist every registered plot record as parquet and CSV, then preview the table.
viz_index_path = stage_dir / "viz_index.parquet"
viz_index_csv_path = stage_dir / "viz_index.csv"
write_viz_index(viz_records, out_parquet=viz_index_path, out_csv=viz_index_csv_path)

viz_df = viz_records_to_df(viz_records)
viz_preview = viz_df.head(80)
display(viz_preview)
print("✅ wrote viz_index:", viz_index_path)


## Next actions
- If montages look incoherent: revisit Stage 02 (embeddings) or adjust HDBSCAN sweep.
- If cluster purity is too low for your use case: try a different fusion strategy or different clustering parameters.
- Proceed to Stage 04 for **agentic semantic enrichment** (Agent 1).