# Stage 02 — Compute embeddings (parquet-first, visual-first)

This notebook reads **Stage 01** `exports/stage_01_prepare/items.parquet`, computes **hybrid embeddings** (image + optional morphology/text), fuses them into a single vector per item, and writes:

- `exports/stage_02_embeddings/items_with_embeddings.parquet`
- plus intermediate parquet artifacts: `image_embeddings.parquet`, `morph_features.parquet` and `morph_embeddings.parquet` (best effort; skipped if morphology extraction fails), `text_embeddings.parquet` (only if the text modality is enabled), `fused_embeddings.parquet`

**New in this refactor**
- **PEEP/POST** health gates (critic checks)
- **Visual-first**: key plots are shown inline *and* saved to `exports/stage_02_embeddings/plots/`
- `viz_index.parquet` is written for easy browsing of plots per stage


In [None]:
# --- Colab-first setup (safe defaults) ---
import os, sys, time
from pathlib import Path

# Notebook toggles (edit as needed). These are plain module-level flags that
# later cells read directly.
FORCE_REBUILD = False        # False: reuse items_with_embeddings.parquet when it already exists; True: recompute
FAST_MODE = True             # True: smaller sample sizes in the 2D-map and separability diagnostics
EDA_LEVEL = "core"           # "core" | "standard" | "deep"; "standard"/"deep" install umap-learn, "deep" also runs ydata-profiling
SHOW_PLOTS = True            # always True for this repo (NOTE(review): not referenced by the cells in this notebook)
SAVE_PLOTS = True            # always True for this repo (NOTE(review): not referenced by the cells in this notebook)

# Root directory searched recursively for pipeline_config.yaml when resolving the project root.
DRIVE_SEARCH_BASE = "/content/drive/MyDrive"  # adjust if needed

def _is_colab() -> bool:
    """Report whether the ``google.colab`` module is loaded in this interpreter."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

# Only on Colab: mount Google Drive at /content/drive (DRIVE_SEARCH_BASE points
# below this mount point). The import is local because google.colab is only
# importable inside a Colab runtime.
if _is_colab():
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")

def _resolve_project_root() -> Path:
    """Locate the project directory.

    Resolution order:
      1. ``HISTO_PROJECT_ROOT`` environment variable, if set and the path exists.
      2. Under ``DRIVE_SEARCH_BASE`` (if it exists): the most recently modified
         directory that holds both ``pipeline_config.yaml`` and
         ``label_taxonomy.yaml``.
      3. The current working directory or one of its ancestors (10 directories
         are checked in total) that holds ``pipeline_config.yaml``.

    Raises:
        FileNotFoundError: if none of the strategies finds a project root.
    """
    # 1) explicit override
    override = os.environ.get("HISTO_PROJECT_ROOT")
    if override:
        override_path = Path(override)
        if override_path.exists():
            return override_path

    # 2) best-effort recursive search on Drive
    drive_base = Path(DRIVE_SEARCH_BASE)
    if drive_base.exists():
        hits = [
            cfg_file.parent
            for cfg_file in drive_base.glob("**/pipeline_config.yaml")
            if (cfg_file.parent / "label_taxonomy.yaml").exists()
        ]
        if hits:
            # newest directory wins; on ties the first one found is kept
            return max(hits, key=lambda d: d.stat().st_mtime)

    # 3) local fallback: climb from the working directory
    here = Path.cwd()
    for _ in range(10):
        if (here / "pipeline_config.yaml").exists():
            return here
        here = here.parent

    raise FileNotFoundError("Could not resolve PROJECT_ROOT. Set HISTO_PROJECT_ROOT env var.")

# Resolve the project directory once and put it at the front of sys.path.
# Presumably this is what makes the `histo_cartography` imports in later cells
# resolvable — confirm the package lives under PROJECT_ROOT.
PROJECT_ROOT = _resolve_project_root()
sys.path.insert(0, str(PROJECT_ROOT))
print("PROJECT_ROOT:", PROJECT_ROOT)

# Install deps (fast, uses requirements.txt)
import subprocess

_pip_install = [sys.executable, "-m", "pip", "-q", "install"]

# Required packages: a failure here is fatal (check_call raises).
subprocess.check_call(_pip_install + ["-r", str(PROJECT_ROOT / "requirements.txt")])

# Optional extras: (package, EDA levels that want it). Failures are reported
# and the notebook carries on.
_optional_extras = (
    ("umap-learn", ("standard", "deep")),
    ("ydata-profiling", ("deep",)),
)
for _extra, _levels in _optional_extras:
    if EDA_LEVEL not in _levels:
        continue
    try:
        subprocess.check_call(_pip_install + [_extra])
    except Exception as e:
        print(f"⚠️ optional install failed ({_extra}):", e)

import yaml

# Load the pipeline config. `yaml.safe_load` gives None for an empty document,
# and a section written with no value (e.g. a bare `paths:`) also loads as
# None, so every lookup falls back to an empty dict instead of raising
# AttributeError on `.get`.
cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text(encoding="utf-8")) or {}
_paths_cfg = cfg.get("paths") or {}
_project_cfg = cfg.get("project") or {}

# Stage outputs live under <PROJECT_ROOT>/<exports_dir> (default "exports").
EXPORTS_DIR = PROJECT_ROOT / str(_paths_cfg.get("exports_dir", "exports"))
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)

SAFE_MODE = bool(_project_cfg.get("safe_mode", True))
SEED = int(_project_cfg.get("seed", 1337))

print("SAFE_MODE:", SAFE_MODE, "| EDA_LEVEL:", EDA_LEVEL)


In [None]:
# --- Stage paths + registries ---
from pathlib import Path
import pandas as pd

from histo_cartography.viz import ensure_dir
from histo_cartography.artifact_registry import register_artifact, append_stage_manifest
from histo_cartography.critic import run_critic, write_critic_report, critic_result_table, critic_issues_table

# Upstream input: the Stage 01 parquet. Fail with a real exception rather than
# `assert`, which is skipped when Python runs with -O.
stage_in = EXPORTS_DIR / "stage_01_prepare" / "items.parquet"
if not stage_in.exists():
    raise FileNotFoundError(f"missing upstream parquet: {stage_in}")

# Output locations for this stage (plots / QA reports / EDA reports).
stage_dir = EXPORTS_DIR / "stage_02_embeddings"
plots_dir = ensure_dir(stage_dir / "plots")
qa_dir = ensure_dir(stage_dir / "qa")
eda_dir = ensure_dir(stage_dir / "eda")

out_items_path = stage_dir / "items_with_embeddings.parquet"

viz_records = []  # appended throughout the notebook

print("stage_in:", stage_in)
print("stage_dir:", stage_dir)


In [None]:
# --- PEEP: load upstream items (Stage 01) ---
import pandas as pd
from IPython.display import display

# Read the Stage 01 parquet, preview the first three rows, report the shape.
items = pd.read_parquet(stage_in)
_items_preview = items.head(3)
display(_items_preview)
print(f"items shape: {items.shape}")


## PEEP — Preflight health + EDA

In [None]:
# PEEP (1/4) — Overview table (schema / missingness / examples)
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Build the overview table for `items` (max_cols=40) and show its first 40 rows.
_items_overview = df_overview_table(items, max_cols=40)
display(_items_overview.head(40))


In [None]:
# PEEP (2/4) — Missingness plot (top columns)
from histo_cartography.eda_reports import plot_missingness
from histo_cartography.viz import save_and_display, register_plot

# Render the missingness figure, save + show it, then record it in viz_records.
out_path = plots_dir / "peep_missingness_top25.png"
fig = plot_missingness(
    items,
    top_k=25,
    title="Stage 02 PEEP: items missingness (top 25)",
)
save_and_display(fig, out_path)

_plot_entry = {
    "stage": "stage_02_embeddings",
    "plot_id": "peep_missingness_top25",
    "title": "PEEP missingness (top 25 columns)",
    "path": out_path,
    "tags": ["peep", "missingness"],
    "is_core": True,
}
register_plot(viz_records, **_plot_entry)


**Interpretation**: Large missingness in required columns (e.g., `image_path`, `label`) usually indicates a broken ingest or an incomplete staging step.

**Warning signs**: `image_path` missingness > 0, or `item_id` duplicates.

In [None]:
# PEEP (3/4) — Dataset/source distribution (bar)
import matplotlib.pyplot as plt
from histo_cartography.viz import save_and_display, register_plot

# Count items per source (as strings) and keep the 30 most frequent.
vc = items["source"].astype(str).value_counts().head(30)

# Same bar chart, drawn through an explicit Axes instead of pyplot's implicit one.
fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(1, 1, 1)
ax.bar(vc.index.astype(str), vc.values)
plt.xticks(rotation=45, ha="right")
ax.set_title("Stage 02 PEEP: source/dataset distribution (top 30)")
ax.set_ylabel("n items")
fig.tight_layout()

out_path = plots_dir / "peep_source_distribution_top30.png"
save_and_display(fig, out_path)

_plot_entry = {
    "stage": "stage_02_embeddings",
    "plot_id": "peep_source_distribution_top30",
    "title": "PEEP source/dataset distribution (top 30)",
    "path": out_path,
    "tags": ["peep", "distribution", "dataset"],
    "is_core": True,
}
register_plot(viz_records, **_plot_entry)


**Interpretation**: In SAFE_MODE you may only see a subset. In full mode, this should reflect all enabled datasets.

**Warning signs**: Only one dataset shows up when multiple were expected; or extremely imbalanced datasets leading to dataset islands.

In [None]:
# PEEP (4/4) — Critic hard/soft gates (items)
from IPython.display import display

# Gate parameters for the upstream items table.
_peep_required = ["item_id", "source", "split", "label", "text", "image_path", "width", "height", "mpp"]
_peep_min_rows = 10 if SAFE_MODE else 100  # lower row floor in SAFE_MODE

critic_items = run_critic(
    df=items,
    stage="stage_02_embeddings",
    gate="peep_items",
    required_cols=_peep_required,
    id_col="item_id",
    min_rows=_peep_min_rows,
    key_nonnull_cols=["item_id", "image_path"],
)

# Persist the report, then show the result table and up to 50 issues.
write_critic_report(critic_items, qa_dir / "critic_peep_items.json")
_peep_result = critic_result_table(critic_items)
display(_peep_result)
_peep_issues = critic_issues_table(critic_items)
display(_peep_issues.head(50))


## Stage logic — Compute + fuse embeddings

In [None]:
# --- Main stage logic: compute embeddings (idempotent) ---
from pathlib import Path
import pandas as pd

from histo_cartography.exports import save_parquet
from histo_cartography import embeddings as emb

t0 = time.time()

# Config sections. A section that is present but left empty in the YAML loads
# as None, so fall back to {} instead of crashing on `.get`.
emb_cfg = cfg.get("embeddings") or {}
img_cfg = emb_cfg.get("image") or {}
txt_cfg = emb_cfg.get("text") or {}
morph_cfg = emb_cfg.get("morphology") or {}
fusion_cfg = emb_cfg.get("fusion") or {}

batch_size = int(img_cfg.get("batch_size_safe", 64) if SAFE_MODE else img_cfg.get("batch_size_full", 256))
target_dim = int(fusion_cfg.get("target_dim", 256))
use_text_modality = bool(txt_cfg.get("use_text_modality", False))

if out_items_path.exists() and not FORCE_REBUILD:
    # Idempotent path: reuse the previous output instead of recomputing.
    items_with_embeddings = pd.read_parquet(out_items_path)
    print(f"✅ Loaded existing items_with_embeddings.parquet: {items_with_embeddings.shape}")
    # The cache is keyed on file existence only; flag an obvious mismatch with
    # the upstream table so a stale output is not used unnoticed.
    if len(items_with_embeddings) != len(items):
        print(
            "⚠️ cached items_with_embeddings.parquet has",
            len(items_with_embeddings),
            "rows but upstream items has",
            len(items),
            "rows; set FORCE_REBUILD = True to recompute",
        )
else:
    # 1) image embeddings
    img_emb = emb.embed_images_resnet50(
        items,
        image_col="image_path",
        batch_size=batch_size,
        device=None,
        max_items=None,  # stage 01 already sampled in SAFE_MODE
    )
    save_parquet(img_emb, stage_dir / "image_embeddings.parquet")

    emb_dfs = [img_emb]

    # 2) morphology embeddings (best effort: any failure only skips this modality)
    try:
        morph_feats = emb.compute_morphology_features(items, image_col="image_path")
        save_parquet(morph_feats, stage_dir / "morph_features.parquet")
        morph_emb = emb.embed_morphology_features(morph_feats)
        save_parquet(morph_emb, stage_dir / "morph_embeddings.parquet")
        emb_dfs.append(morph_emb)
    except Exception as e:
        print("⚠️ morphology embedding skipped:", e)

    # 3) text embeddings (optional; default off to avoid label leakage)
    if use_text_modality:
        txt_emb = emb.embed_text_tfidf_svd(
            items,
            text_col="text",
            max_features=int(txt_cfg.get("max_features", 8192)),
            svd_dim=int(txt_cfg.get("svd_dim", 128)),
        )
        save_parquet(txt_emb, stage_dir / "text_embeddings.parquet")
        emb_dfs.append(txt_emb)
    else:
        print("text modality disabled (recommended default)")

    # 4) fuse -> one vector per item
    if len(emb_dfs) == 1:
        # Only one modality available: pass it through and tag the model id.
        fused = emb_dfs[0][["item_id", "model_id", "dim", "vector"]].copy()
        fused["model_id"] = fused["model_id"].astype(str) + "|single"
    else:
        fused = emb.fuse_embeddings_concat_pca(emb_dfs, target_dim=target_dim)

    save_parquet(fused, stage_dir / "fused_embeddings.parquet")

    # 5) join back to items.
    # validate="many_to_one" makes pandas raise if `fused` has more than one
    # row per item_id. Without it, duplicated vectors could offset missing
    # ones and leave the row count unchanged, hiding dropped items.
    items_with_embeddings = items.merge(fused, on="item_id", how="inner", validate="many_to_one")
    if len(items_with_embeddings) != len(items):
        raise ValueError(
            f"embedding join dropped rows: {len(items)} items in, {len(items_with_embeddings)} out"
        )

    save_parquet(items_with_embeddings, out_items_path)

runtime_sec = time.time() - t0
print("runtime_sec:", round(runtime_sec, 2))


## CHECKPOINT — After embedding fusion

In [None]:
# CHECKPOINT: critic gates on items_with_embeddings
from IPython.display import display

# Gate parameters for the fused output table.
_ckpt_required = ["item_id", "source", "label", "image_path", "vector", "dim"]
_ckpt_nonnull = ["item_id", "image_path", "vector"]
_ckpt_min_rows = 10 if SAFE_MODE else 100  # lower row floor in SAFE_MODE

critic_out = run_critic(
    df=items_with_embeddings,
    stage="stage_02_embeddings",
    gate="checkpoint_items_with_embeddings",
    required_cols=_ckpt_required,
    id_col="item_id",
    min_rows=_ckpt_min_rows,
    key_nonnull_cols=_ckpt_nonnull,
    vector_col="vector",
)

# Persist the report, then show the result table and up to 50 issues.
write_critic_report(critic_out, qa_dir / "critic_checkpoint_items_with_embeddings.json")
_ckpt_result = critic_result_table(critic_out)
display(_ckpt_result)
_ckpt_issues = critic_issues_table(critic_out)
display(_ckpt_issues.head(50))


In [None]:
# Register stage artifacts in manifest (parquet-first DAG wiring)
schema_version = str(cfg.get("project", {}).get("schema_version", "0.1.0"))

# Fields that are identical for the artifact row and the stage-run row.
_run_fields = {
    "project_root": PROJECT_ROOT,
    "stage": "stage_02_embeddings",
    "schema_version": schema_version,
    "warnings_count": int(critic_out.warnings_count),
    "fails_count": int(critic_out.fails_count),
    "runtime_sec": float(runtime_sec),
}

# Artifact-level registry (keeps backward compatibility)
register_artifact(
    artifact="items_with_embeddings",
    path=out_items_path,
    inputs=[stage_in],
    df=items_with_embeddings,
    notes="items + fused embedding vector",
    **_run_fields,
)

# Stage-run summary row
append_stage_manifest(
    inputs=[stage_in],
    outputs=[out_items_path],
    notes="stage 02 run summary",
    **_run_fields,
)


## POST — Postflight health + EDA

In [None]:
# POST (1/3) — Overview table (items_with_embeddings)
from IPython.display import display
from histo_cartography.eda_reports import df_overview_table

# Build the overview table for the fused output (max_cols=40) and show its first 40 rows.
_post_overview = df_overview_table(items_with_embeddings, max_cols=40)
display(_post_overview.head(40))


In [None]:
# POST (2/3) — Missingness plot (items_with_embeddings)
from histo_cartography.eda_reports import plot_missingness
from histo_cartography.viz import save_and_display, register_plot

# Render the missingness figure for the fused output, save + show it, then record it.
out_path = plots_dir / "post_missingness_top25.png"
fig = plot_missingness(
    items_with_embeddings,
    top_k=25,
    title="Stage 02 POST: items_with_embeddings missingness (top 25)",
)
save_and_display(fig, out_path)

_plot_entry = {
    "stage": "stage_02_embeddings",
    "plot_id": "post_missingness_top25",
    "title": "POST missingness (top 25 columns)",
    "path": out_path,
    "tags": ["post", "missingness"],
    "is_core": True,
}
register_plot(viz_records, **_plot_entry)


**Interpretation**: After fusion, `vector` should be present for every row.

**Warning signs**: Any null vectors or inconsistent `dim`.

In [None]:
# POST (3/3) — Critic summary (should match checkpoint)
from IPython.display import display

# Re-display the checkpoint critic result and up to 50 of its issues.
_post_result = critic_result_table(critic_out)
display(_post_result)
_post_issues = critic_issues_table(critic_out)
display(_post_issues.head(50))


## Core embedding diagnostics (one plot per cell)
These are the most important plots for cluster quality downstream (Stage 03+).

In [None]:
# Core plot 1 — Embedding norm histogram
from histo_cartography.eda_reports import plot_embedding_norms
from histo_cartography.viz import save_and_display, register_plot

# Slim copy (id + vector) reused by the embedding diagnostics in later cells.
emb_df = items_with_embeddings.loc[:, ["item_id", "vector"]].copy()

out_path = plots_dir / "embedding_norm_hist.png"
fig = plot_embedding_norms(
    emb_df,
    vector_col="vector",
    title="Embedding norm histogram (fused)",
)
save_and_display(fig, out_path)

_plot_entry = {
    "stage": "stage_02_embeddings",
    "plot_id": "embedding_norm_hist",
    "title": "Embedding norm histogram (fused)",
    "path": out_path,
    "tags": ["core", "embeddings", "norms"],
    "is_core": True,
}
register_plot(viz_records, **_plot_entry)


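**Interpretation**: Norms should show some spread rather than collapsing to a single value (unless the fused vectors are deliberately L2-normalized, in which case a spike at 1.0 is expected).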

**Warning signs**: Extremely narrow distribution (collapsed embeddings) or heavy spikes at 0 (bad vectors).

In [None]:
import json
# Core plot 2 — PCA explained variance curve (cumulative)
from histo_cartography.eda_reports import plot_pca_explained_variance
from histo_cartography.viz import save_and_display, register_plot
from histo_cartography.stats_tests import effective_rank
import numpy as np

fig, meta = plot_pca_explained_variance(emb_df, vector_col="vector", n_components=50, title="PCA cumulative explained variance (fused)")
out_path = plots_dir / "pca_explained_variance_cumsum.png"
save_and_display(fig, out_path)

# Effective rank from a singular-value approximation. Best effort: on failure
# the reason is reported (not silently dropped) and the value is stored as null.
try:
    # Convert explained variance ratio -> approximate singular value weights
    evr = np.asarray(meta.get("explained_variance_ratio", []), dtype=float)
    # float(): keep the value a plain Python float so json.dumps below can
    # serialise it whatever numeric type effective_rank returns.
    er = float(effective_rank(np.sqrt(np.maximum(evr, 0))))
except Exception as e:
    print("⚠️ effective rank skipped:", e)
    er = None

meta_path = qa_dir / "pca_meta.json"
meta_path.write_text(json.dumps({**meta, "effective_rank_approx": er}, indent=2))

register_plot(
    viz_records,
    stage="stage_02_embeddings",
    plot_id="pca_explained_variance_cumsum",
    title="PCA cumulative explained variance (fused)",
    path=out_path,
    tags=["core", "embeddings", "pca"],
    is_core=True,
    notes=f"effective_rank_approx={er}",
)


**Interpretation**: If a tiny number of components explain almost all variance, embeddings may be low-rank or collapsed.

**Warning signs**: Effective rank extremely small relative to embedding dim.

In [None]:
# Core plot 3 — Pairwise cosine similarity distribution (sampled)
from histo_cartography.eda_reports import plot_cosine_similarity_distribution
from histo_cartography.viz import save_and_display, register_plot

# Imported here so this cell does not depend on an earlier cell having
# imported json (running it on its own used to raise NameError).
import json

fig, meta = plot_cosine_similarity_distribution(emb_df, vector_col="vector", sample_pairs=2000, title="Pairwise cosine similarity (sampled, fused)")
out_path = plots_dir / "cosine_similarity_distribution.png"
save_and_display(fig, out_path)

# Keep the metadata returned by the plot helper next to the other QA files.
(qa_dir / "cosine_similarity_meta.json").write_text(json.dumps(meta, indent=2))

register_plot(
    viz_records,
    stage="stage_02_embeddings",
    plot_id="cosine_similarity_distribution",
    title="Pairwise cosine similarity distribution (sampled)",
    path=out_path,
    tags=["core", "embeddings", "similarity"],
    is_core=True,
)


**Interpretation**: Similarity distribution should be broad enough to allow clustering.

**Warning signs**: Nearly all pairs have very high similarity (collapsed) or near-zero similarity (no structure).

In [None]:
# Core plot 4 — 2D map (UMAP/PCA) colored by dataset/source
from histo_cartography.eda_reports import compute_umap_2d, plot_category_scatter_2d
from histo_cartography.viz import save_and_display, register_plot

# Imported here so this cell does not depend on an earlier cell having
# imported json (running it on its own used to raise NameError).
import json

# 2D projection of a sample (smaller in FAST_MODE). `xy` and `meta` are reused
# by the label-colored map in the next cell.
xy, meta = compute_umap_2d(items_with_embeddings, vector_col="vector", sample_n=3000 if FAST_MODE else 8000, random_state=SEED)
# xy["sample_idx"] is used as positional row indices into items_with_embeddings.
cats = items_with_embeddings.iloc[xy["sample_idx"].tolist()]["source"].astype(str).tolist()

fig = plot_category_scatter_2d(xy, cats, title=f"2D map colored by dataset/source ({meta['method']})", max_legend=20)
out_path = plots_dir / "map2d_by_source.png"
save_and_display(fig, out_path)

(qa_dir / "map2d_meta_source.json").write_text(json.dumps(meta, indent=2))
register_plot(
    viz_records,
    stage="stage_02_embeddings",
    plot_id="map2d_by_source",
    title="2D map colored by dataset/source",
    path=out_path,
    tags=["core", "embeddings", "umap", "dataset"],
    is_core=True,
    notes=f"method={meta.get('method')}",
)


**Interpretation**: Ideally, structure is driven by morphology, not dataset.

**Warning signs**: Strong dataset islands → potential domain shift or leakage (e.g., different stain/scan artifacts).

In [None]:
# Core plot 5 — 2D map (UMAP/PCA) colored by tissue label
from histo_cartography.eda_reports import plot_category_scatter_2d
from histo_cartography.viz import save_and_display, register_plot

# Reuse the 2D projection (`xy`, `meta`) from the previous cell; only the
# coloring changes: label of each sampled row, as strings.
_sample_positions = xy["sample_idx"].tolist()
_sampled_rows = items_with_embeddings.iloc[_sample_positions]
cats = _sampled_rows["label"].astype(str).tolist()

out_path = plots_dir / "map2d_by_label.png"
fig = plot_category_scatter_2d(
    xy,
    cats,
    title=f"2D map colored by tissue label ({meta['method']})",
    max_legend=25,
)
save_and_display(fig, out_path)

_plot_entry = {
    "stage": "stage_02_embeddings",
    "plot_id": "map2d_by_label",
    "title": "2D map colored by tissue label",
    "path": out_path,
    "tags": ["core", "embeddings", "umap", "label"],
    "is_core": True,
}
register_plot(viz_records, **_plot_entry)


**Interpretation**: Labels (if meaningful) should show local coherence, but not perfect separation.

**Warning signs**: Perfect separation may indicate leakage; no separation may indicate embeddings not capturing tissue structure.

In [None]:
# Core diagnostic 6A — Dataset separability probe (table)
from IPython.display import display
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Imported here so this cell does not depend on earlier cells for json / pd.
import json
from collections import Counter

import pandas as pd

# build sample: one row per item, target = source (as string)
X = np.asarray(items_with_embeddings["vector"].tolist(), dtype=np.float32)
y = items_with_embeddings["source"].astype(str).to_numpy()

# small subsample for speed
n = min(len(X), 5000 if FAST_MODE else 20000)
idx = np.random.default_rng(SEED).choice(len(X), size=n, replace=False)
Xs = X[idx]
ys = y[idx]

# Stratify only when it is possible: train_test_split raises if any class has
# fewer than 2 members, so a rare source must not abort the diagnostic.
class_counts = Counter(ys.tolist())
can_stratify = len(class_counts) > 1 and min(class_counts.values()) >= 2

Xtr, Xte, ytr, yte = train_test_split(Xs, ys, test_size=0.25, random_state=SEED, stratify=ys if can_stratify else None)

if len(set(ytr)) < 2:
    # A single source is a normal situation in SAFE_MODE, but
    # LogisticRegression cannot be fit on one class. Skip the probe and leave
    # `pred` / `acc` defined so the confusion-matrix cell still runs.
    print("⚠️ dataset separability probe skipped: fewer than 2 sources in the training sample")
    pred = np.array(yte, copy=True)
    acc = float("nan")
else:
    clf = LogisticRegression(max_iter=500, n_jobs=None)
    clf.fit(Xtr, ytr)
    pred = clf.predict(Xte)
    acc = accuracy_score(yte, pred)

tab = pd.DataFrame([{"metric":"accuracy","value":float(acc),"n_train":len(Xtr),"n_test":len(Xte)}])
display(tab)

# NaN is not valid JSON; store a skipped probe as null.
record = tab.to_dict(orient="records")[0]
if np.isnan(record["value"]):
    record["value"] = None
(qa_dir / "dataset_separability.json").write_text(json.dumps(record, indent=2))


In [None]:
# Core diagnostic 6B — Dataset separability confusion matrix (plot)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Imported here (as the other plot cells do) so this cell does not depend on
# an earlier cell having imported the viz helpers.
from histo_cartography.viz import save_and_display, register_plot

# Uses `yte`, `pred` and `acc` from the separability probe cell.
labels = sorted(set(yte))
cm = confusion_matrix(yte, pred, labels=labels)

fig = plt.figure(figsize=(6, 5))
plt.imshow(cm, aspect="auto")
plt.colorbar(label="count")
plt.xticks(range(len(labels)), labels, rotation=45, ha="right")
plt.yticks(range(len(labels)), labels)
plt.title("Dataset separability (confusion matrix)")
plt.tight_layout()

out_path = plots_dir / "dataset_separability_confusion.png"
save_and_display(fig, out_path)

register_plot(
    viz_records,
    stage="stage_02_embeddings",
    plot_id="dataset_separability_confusion",
    title="Dataset separability confusion matrix",
    path=out_path,
    tags=["core", "embeddings", "dataset", "separability"],
    is_core=True,
    notes=f"accuracy={float(acc):.3f}",
)


**Interpretation**: If dataset is *too* predictable from embeddings, clustering may reflect dataset artifacts.

**Warning signs**: Very high accuracy → domain shift; consider stain normalization, domain adaptation, or rebalancing.

In [None]:
# Core diagnostic 7A — Drift test across datasets (KS test on embedding norms)
from IPython.display import display
import numpy as np
from histo_cartography.stats_tests import ks_2samp

# Imported here so this cell does not depend on an earlier cell for pd.
import pandas as pd

# norms: L2 norm of every fused vector, stored as a helper column because the
# ECDF cell that follows reads items_with_embeddings["_norm"].
X = np.asarray(items_with_embeddings["vector"].tolist(), dtype=np.float32)
norms = np.linalg.norm(X, axis=1)
items_with_embeddings["_norm"] = norms

# pairwise KS vs first dataset (diagnostic)
source_str = items_with_embeddings["source"].astype(str)  # computed once, reused for every mask
datasets = source_str.unique().tolist()
datasets = datasets[:5]  # keep core output small

rows = []
if len(datasets) >= 2:
    base = datasets[0]
    a = items_with_embeddings.loc[source_str == base, "_norm"].tolist()
    for ds in datasets[1:]:
        b = items_with_embeddings.loc[source_str == ds, "_norm"].tolist()
        tr = ks_2samp(a, b, name=f"ks_norm__{base}__vs__{ds}")
        rows.append({"base":base,"compare":ds,"statistic":tr.statistic,"p_value":tr.p_value,"n_a":tr.n_a,"n_b":tr.n_b,"notes":tr.notes})

# Explicit columns keep the table schema stable even when there is nothing to
# compare (a single dataset gives zero rows).
tab = pd.DataFrame(rows, columns=["base", "compare", "statistic", "p_value", "n_a", "n_b", "notes"])
display(tab)

tab_path = qa_dir / "drift_ks_norms_table.parquet"
tab.to_parquet(tab_path, index=False)


In [None]:
# Core diagnostic 7B — Drift visualization: embedding norm ECDF by dataset
import numpy as np
import matplotlib.pyplot as plt

# Imported here (as the other plot cells do) so this cell does not depend on
# an earlier cell having imported the viz helpers.
from histo_cartography.viz import save_and_display, register_plot

# Uses `datasets` and the "_norm" helper column from the KS drift cell.
fig = plt.figure(figsize=(7, 4))

source_str = items_with_embeddings["source"].astype(str)  # computed once, reused for every mask
for ds in datasets:
    x = items_with_embeddings.loc[source_str == ds, "_norm"].to_numpy()
    x = x[np.isfinite(x)]
    if x.size == 0:
        continue
    # Empirical CDF: sorted values against their rank fraction. Local names
    # are deliberately not `xs` / `ys`, which the separability probe cell
    # uses for its own arrays.
    norm_sorted = np.sort(x)
    ecdf = np.arange(1, norm_sorted.size + 1) / norm_sorted.size
    plt.plot(norm_sorted, ecdf, label=str(ds))

plt.title("Embedding norm ECDF by dataset (diagnostic)")
plt.xlabel("norm")
plt.ylabel("ECDF")
plt.legend(fontsize=7)
plt.tight_layout()

out_path = plots_dir / "drift_norm_ecdf_by_dataset.png"
save_and_display(fig, out_path)

register_plot(
    viz_records,
    stage="stage_02_embeddings",
    plot_id="drift_norm_ecdf_by_dataset",
    title="Embedding norm ECDF by dataset",
    path=out_path,
    tags=["core", "embeddings", "drift", "dataset"],
    is_core=True,
)


**Interpretation**: Large ECDF shifts indicate dataset drift.

**Warning signs**: Datasets with clearly separated ECDF curves; consider normalizing or rebalancing.

In [None]:
# Extended (deep) — YData Profiling report (HTML) (optional heavy)
from histo_cartography.eda_reports import ydata_profiling_report

# The profiling report is only produced at the "deep" EDA level.
if EDA_LEVEL != "deep":
    print("EDA_LEVEL != deep (skipping profiling)")
else:
    out_html = eda_dir / "ydata_profile_items_with_embeddings.html"
    # The vector column is left out of the profiled frame.
    _profile_input = items_with_embeddings.drop(columns=["vector"], errors="ignore")
    p = ydata_profiling_report(
        _profile_input,
        out_html=out_html,
        sample_rows=2000,
        minimal=True,
    )
    # A falsy return value is reported as "not available".
    if not p:
        print("⚠️ ydata-profiling not available")
    else:
        print("✅ wrote profiling report:", p)


In [None]:
# Write viz index (parquet + csv) + show a small preview
from IPython.display import display
from histo_cartography.viz import write_viz_index, viz_records_to_df

# Write the plot index as parquet + csv side by side, then preview up to 50 rows.
viz_index_path = stage_dir / "viz_index.parquet"
_viz_index_csv = viz_index_path.with_suffix(".csv")
write_viz_index(viz_records, out_parquet=viz_index_path, out_csv=_viz_index_csv)

viz_df = viz_records_to_df(viz_records)
_viz_preview = viz_df.head(50)
display(_viz_preview)
print(f"✅ wrote viz_index: {viz_index_path}")


## Next actions

- If **dataset separability** is very high:
  - expect “dataset islands” in Stage 03 clustering.
  - consider stain normalization, domain balancing, or enabling morphology/text fusion (carefully; avoid label leakage).
- If embeddings look **collapsed** (very low variance / narrow norm histogram):
  - verify image paths are valid
  - check ResNet preprocessing and image decoding
  - try re-running Stage 02 with a different embedding model.
