# Mantis CSV Export (V3)

This notebook fixes the main causes of **dataset islands** and **junk cluster names**:
- decodes HF extra columns (LC25000 organ 0/1 -> Lung/Colorectal)
- aligns ontology to the spec (5-level core + 7-level extended)
- exports a **fixed-length `vector` JSON column** so Mantis doesn't compute mixed-size embeddings
- adds **anchor nodes** + kNN neighbors to create connectivity.


In [1]:
#@title 1) Colab setup (deps + project root + HF auth so imports work)

# ----------------------------
# A) Mount Google Drive (Colab)
# ----------------------------
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
    IN_COLAB = True
except Exception as e:
    IN_COLAB = False
    print("Not running in Colab or Drive mount not available:", e)

# ----------------------------
# B) Install dependencies
# ----------------------------
# NOTE: torch/torchvision are usually preinstalled on Colab.
!pip -q install --upgrade pip
!pip -q install pandas numpy pillow tqdm scikit-learn umap-learn sentence-transformers datasets medmnist requests gdown huggingface_hub

# ----------------------------
# C) Fix PYTHONPATH to find your repo package (histo_cartography/)
# ----------------------------
import os, sys, importlib
from pathlib import Path
from collections import deque

# >>> USER CONFIG: this is the folder you told me is your project root
PROJECT_BASE = Path("/content/drive/MyDrive/mit/histopathology_mantis_20260115")

# Directories we should NOT crawl (can be huge)
IGNORE_DIRS = {
    ".git", "__pycache__", ".ipynb_checkpoints",
    "data_raw", "data_staging", "staging", "exports", "checkpoints", "_colab_data", "_cache",
    ".venv", "venv", "site-packages", "node_modules"
}

def _looks_like_repo_root(p: Path) -> bool:
    return (p / "histo_cartography").is_dir() and (p / "histo_cartography" / "__init__.py").exists()

def find_repo_root(base: Path, max_depth: int = 6) -> Path:
    """Find a folder containing histo_cartography/ under base (handles nested unzip levels)."""
    base = base.resolve()
    if _looks_like_repo_root(base):
        return base
    if not base.exists():
        raise FileNotFoundError(f"PROJECT_BASE does not exist: {base}")

    q = deque([(base, 0)])
    seen = {str(base)}

    while q:
        cur, depth = q.popleft()
        if depth >= max_depth:
            continue

        try:
            children = list(cur.iterdir())
        except Exception:
            continue

        for sub in children:
            if not sub.is_dir():
                continue
            if sub.name in IGNORE_DIRS:
                continue

            sub_res = str(sub.resolve())
            if sub_res in seen:
                continue
            seen.add(sub_res)

            if _looks_like_repo_root(sub):
                return sub

            q.append((sub, depth + 1))

    raise FileNotFoundError(
        f"Could not find a folder containing 'histo_cartography/' under: {base}.\n"
        "Fix: unzip/copy your repo folder under PROJECT_BASE, or change PROJECT_BASE to the correct Drive path.\n"
        "Tip: ensure your Drive folder contains something like: <...>/revamped_project/histo_cartography/"
    )

# Resolve PROJECT_ROOT (repo root)
PROJECT_ROOT = find_repo_root(PROJECT_BASE) if PROJECT_BASE.exists() else Path.cwd()

# Put repo on sys.path and cd into it so relative paths work
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
os.chdir(PROJECT_ROOT)
importlib.invalidate_caches()

# Keep caches inside the repo (Drive persists downloads)
CACHE_ROOT = PROJECT_ROOT / "_cache"
CACHE_ROOT.mkdir(parents=True, exist_ok=True)
os.environ.setdefault("HF_HOME", str(CACHE_ROOT / "hf_home"))
os.environ.setdefault("HF_DATASETS_CACHE", str(CACHE_ROOT / "hf_datasets"))
os.environ.setdefault("TRANSFORMERS_CACHE", str(CACHE_ROOT / "hf_transformers"))
os.environ.setdefault("TORCH_HOME", str(CACHE_ROOT / "torch"))

# ----------------------------
# D) HuggingFace token via Colab "Secrets"
# ----------------------------
# In Colab: click the key icon (Secrets) → add HF_TOKEN.
HF_TOKEN = None
if IN_COLAB:
    try:
        from google.colab import userdata  # type: ignore
        HF_TOKEN = userdata.get("HF_TOKEN")
    except Exception as e:
        HF_TOKEN = None

if HF_TOKEN:
    # Do NOT print the token.
    os.environ["HF_TOKEN"] = HF_TOKEN
    os.environ["HUGGINGFACE_HUB_TOKEN"] = HF_TOKEN
    os.environ["HUGGING_FACE_HUB_TOKEN"] = HF_TOKEN  # some libs use this variant

    try:
        from huggingface_hub import login  # type: ignore
        login(HF_TOKEN, add_to_git_credential=False)
        print("✅ HF_TOKEN loaded from Colab userdata and huggingface_hub.login() succeeded.")
    except Exception as e:
        print("⚠️ HF_TOKEN set in env, but huggingface_hub.login() failed (continuing):", e)
else:
    print("ℹ️ HF_TOKEN not found in Colab userdata. Public HF datasets will still work.")

print("✅ PROJECT_ROOT:", PROJECT_ROOT.resolve())
print("✅ sys.path[0]:", sys.path[0])

# Sanity check import (this is what was failing for you)
import histo_cartography
print("✅ histo_cartography imported from:", histo_cartography.__file__)


Mounted at /content/drive
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[?25h

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


✅ HF_TOKEN loaded from Colab userdata and huggingface_hub.login() succeeded.
✅ PROJECT_ROOT: /content/drive/MyDrive/mit/histopathology_mantis_20260115
✅ sys.path[0]: /content/drive/MyDrive/mit/histopathology_mantis_20260115
✅ histo_cartography imported from: /content/drive/MyDrive/mit/histopathology_mantis_20260115/histo_cartography/__init__.py


In [2]:
#@title 2) Configuration

from pathlib import Path
import json
import numpy as np
import pandas as pd
from tqdm import tqdm

# --------------------------
# Core switches
# --------------------------
SAFE_MODE = True              # sample aggressively for quick iteration
SEED = 1337
np.random.seed(SEED)

# Data locations (your project root MUST contain histo_cartography/)
PROJECT_BASE = Path("/content/drive/MyDrive/mit/histopathology_mantis_20260115")  # <-- CHANGE IF NEEDED
DATA_ROOT = PROJECT_BASE

# IMPORTANT: Use ONE staging root, BUT stage each dataset into its own subfolder:
#   STAGING_DIR/<DATASET_KEY>/images/...
# This prevents silent dataset-mixing that causes "island" clusters.
STAGING_DIR = PROJECT_BASE / "data_staging"
EXPORT_DIR = PROJECT_BASE / "exports"
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# Dataset toggles (SAFE_MODE disables huge zips by default)
ENABLE = {
    "CRC_VAL_HE_7K": True,
    "NCT_CRC_HE_100K": False,   # will be forced False in SAFE_MODE
    "MEDMNIST_PATHMNIST": True,
    "HF_PCAM": False,           # optional; enable if you want (HF token needed)
    # Additional datasets
    "HF_LC25000": True,
    "HF_BACH": True,
    "HF_BREAKHIS_RCL_7500": True,
    "ORCA_ORAL_ANNOTATED_100": False,  # optional (Google Drive download)
}

if SAFE_MODE:
    ENABLE["NCT_CRC_HE_100K"] = False

# SAFE_MODE sampling controls (per dataset)
MAX_ITEMS_PER_DATASET = 800 if SAFE_MODE else None
SPLIT = "train"

# --------------------------
# Embedding controls
# --------------------------
VISION_BACKBONE = "resnet50"      # "resnet50" (default). Optional: add Phikon later.
TEXT_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Fusion
FUSION_MODE = "concat"            # "concat" recommended (visual || semantic)
CONCAT_IMAGE_WEIGHT = 1.0
CONCAT_TEXT_WEIGHT = 3.0           # bump text weight to reduce dataset-style dominance

# Dimensionality reduction
REDUCE_DIM = 512
BATCH_ALIGN_BY_DATASET = True      # stronger than mean-centering: mean+std alignment per dataset

# 2D layout controls (optional; you can upload the "no_xy" CSV to let Mantis compute layout)
COMPUTE_2D_LAYOUT = True
UMAP_N_NEIGHBORS = 60
UMAP_MIN_DIST = 0.10
UMAP_SUPERVISE_ON = "cluster_l2"  # supervise on tissue superclass for cross-dataset mixing

# Ontology anchor nodes
ADD_ANCHORS = True

# How node titles should be shown in Mantis
#  - "semantic_plus_id": e.g., "Adenocarcinoma - CRC_VAL_HE_7K::TUM::ADI-TCGA-..."
#  - "id_only": full id
TITLE_MODE = "semantic_plus_id"

# Output filenames
EXPORT_BASENAME = "mantis_unified_atlas_multimodal_v5"

print("PROJECT_BASE:", PROJECT_BASE)
print("STAGING_DIR:", STAGING_DIR)
print("EXPORT_DIR:", EXPORT_DIR)


PROJECT_BASE: /content/drive/MyDrive/mit/histopathology_mantis_20260115
STAGING_DIR: /content/drive/MyDrive/mit/histopathology_mantis_20260115/data_staging
EXPORT_DIR: /content/drive/MyDrive/mit/histopathology_mantis_20260115/exports


## 3) Dataset registry (sources + licenses)

We keep dataset metadata **separate** from the ontology labels.  
This metadata is injected into the **`metadata` JSON column** and also contributes to the **semantic text** used for embedding.


In [3]:
#@title Dataset registry

DATASET_REGISTRY = {
    # Zenodo CRC datasets (Kather et al.)
    "CRC_VAL_HE_7K": {
        "provider": "zenodo",
        "license": "CC-BY-4.0 (Zenodo record 1214456)",
        "url": "https://zenodo.org/records/1214456",
        "citation": "Kather et al. (2016) / Zenodo CRC-VAL-HE-7K",
        "notes": "Colorectal histology tiles; 9 tissue classes."
    },
    "NCT_CRC_HE_100K": {
        "provider": "zenodo",
        "license": "CC-BY-4.0 (Zenodo record 1214456)",
        "url": "https://zenodo.org/records/1214456",
        "citation": "Kather et al. (2016) / Zenodo NCT-CRC-HE-100K",
        "notes": "Large colorectal histology tile dataset; 9 tissue classes."
    },

    # MedMNIST
    "MEDMNIST_PATHMNIST": {
        "provider": "medmnist",
        "license": "see MedMNIST site (dataset-specific)",
        "url": "https://medmnist.com/",
        "citation": "MedMNIST v2 PathMNIST",
        "notes": "Derived from colorectal histopathology patches; 9 classes."
    },

    # HF datasets
    "HF_PCAM": {
        "provider": "huggingface",
        "license": "see HuggingFace dataset card",
        "url": "https://huggingface.co/datasets/pcam",
        "citation": "PatchCamelyon (PCam)",
        "notes": "Lymph node metastasis (binary).",
        "hf_id": "pcam",
        "image_col": "image",
        "label_col": "label",
    },

    # Additional diversity datasets
    "HF_LC25000": {
        "provider": "huggingface",
        "license": "unlicense (per dataset card)",
        "url": "https://huggingface.co/datasets/1aurent/LC25000",
        "citation": "LC25000 via HuggingFace (1aurent/LC25000)",
        "notes": "Lung + colon histopathology (classification).",
        "hf_id": "1aurent/LC25000",
        "image_col": "image",
        "label_col": "label",
        "extra_cols": ["organ"],   # IMPORTANT: needed for ontology
    },
    "HF_BACH": {
        "provider": "huggingface",
        "license": "cc-by-nc-nd-4.0 (per dataset card)",
        "url": "https://huggingface.co/datasets/1aurent/BACH",
        "citation": "BACH (breast histology) via HuggingFace (1aurent/BACH)",
        "notes": "Breast histology microscopy images (4-way).",
        "hf_id": "1aurent/BACH",
        "image_col": "image",
        "label_col": "label",
    },
    "HF_BREAKHIS_RCL_7500": {
        "provider": "huggingface",
        "license": "cc-by-4.0 (per dataset card)",
        "url": "https://huggingface.co/datasets/LuminaAI/RCL-Breast-Cancer-Biopsy-7500",
        "citation": "BreakHis-derived Breast Cancer Biopsy 7500 (LuminaAI)",
        "notes": "Breast histopathology (benign vs malignant).",
        "hf_id": "LuminaAI/RCL-Breast-Cancer-Biopsy-7500",
        "image_col": "image",
        "label_col": "label",
    },

    # ORCA (oral)
    "ORCA_ORAL_ANNOTATED_100": {
        "provider": "gdrive",
        "license": "research/educational; see ORCA site",
        "url": "https://sites.google.com/unibas.it/orca",
        "citation": "Martino et al. Applied Sciences 2020 (ORCA dataset)",
        "notes": "Oral Cancer Annotated dataset: 100 OSCC samples (validation set).",
        # Google Drive link from ORCA website (validation set) — may change
        "gdrive_url": "https://drive.google.com/drive/folders/1XfplgYK5JWzzYWXQhrPUQujXNKUDK-WR?usp=sharing",
    },
}

print("Registered datasets:", list(DATASET_REGISTRY.keys()))


Registered datasets: ['CRC_VAL_HE_7K', 'NCT_CRC_HE_100K', 'MEDMNIST_PATHMNIST', 'HF_PCAM', 'HF_LC25000', 'HF_BACH', 'HF_BREAKHIS_RCL_7500', 'ORCA_ORAL_ANNOTATED_100']


## 4) Data ingestion to a single Unified Atlas table

Each dataset is materialized to a common schema:

- `item_id` (unique)
- `image_path`
- `dataset_key`
- `raw_label`
- `split`
- `width`, `height`, `mpp`
- `metadata` (dict; later JSON)

This step intentionally **does not** assign ontology yet.


In [4]:
#@title 4A) Helpers: robust HF export (+tiling) + ORCA download

import os, shutil, hashlib, subprocess, math, re
from pathlib import Path
from PIL import Image
import numpy as np
import pandas as pd
from tqdm import tqdm

def slugify(s: str) -> str:
    s = str(s).strip().lower()
    s = re.sub(r"[^a-z0-9]+", "_", s)
    return s.strip("_") or "unknown"

def normalize_folder_label(lbl: str) -> str:
    # keep hierarchy tokens like "__" but sanitize each component
    parts = str(lbl).split("__")
    parts = [slugify(p) for p in parts]
    return "__".join(parts)

def save_pil(pil_img: Image.Image, out_path: Path) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    pil_img.save(out_path)

def _label_to_str(ds, label_col: str, label_val) -> str:
    """If label column is a HuggingFace ClassLabel, convert int->name."""
    try:
        feat = ds.features[label_col]
        # HuggingFace datasets uses ClassLabel for categorical ints
        if hasattr(feat, "int2str"):
            return str(feat.int2str(int(label_val)))
    except Exception:
        pass
    return str(label_val)

def _extract_patches(
    img: Image.Image,
    *,
    tile_size: int = 224,
    tile_mode: str = "auto",          # "auto" | "grid" | "center"
    max_patches: int = 9,
) -> list[Image.Image]:
    """Return a list of 224x224 patches to reduce dataset-specific scale bias.

    - Small images are upsampled
    - Large images are either center-cropped or gridded into patches
    """
    img = img.convert("RGB")
    w, h = img.size

    # tiny images -> upscale
    if min(w, h) < tile_size:
        return [img.resize((tile_size, tile_size), resample=Image.BILINEAR)]

    # already patch-sized
    if w == tile_size and h == tile_size:
        return [img]

    # choose mode
    if tile_mode == "auto":
        # If moderately larger than tile, center crop; if much larger, grid
        tile_mode = "center" if max(w, h) <= 512 else "grid"

    if tile_mode == "center":
        left = int((w - tile_size) / 2)
        top = int((h - tile_size) / 2)
        patch = img.crop((left, top, left + tile_size, top + tile_size))
        return [patch]

    # grid mode
    # Make an n x n grid such that n^2 <= max_patches
    n = max(1, int(math.sqrt(max_patches)))
    xs = np.linspace(0, w - tile_size, n, dtype=int)
    ys = np.linspace(0, h - tile_size, n, dtype=int)
    patches: list[Image.Image] = []
    for yy in ys:
        for xx in xs:
            patches.append(img.crop((int(xx), int(yy), int(xx) + tile_size, int(yy) + tile_size)))
            if len(patches) >= max_patches:
                return patches
    return patches


def export_hf_dataset_to_staging(
    dataset_key: str,
    staging_root: Path,
    *,
    split: str = "train",
    max_items: int | None = None,     # interpreted as MAX PATCHES to export
    seed: int = 1337,
    hf_token: str | None = None,
    hf_id: str | None = None,
    img_col: str = "image",
    label_col: str = "label",
    label_folder_fn=None,             # function(row_dict)-> str label folder (before normalize_folder_label)
    tile_mode: str = "auto",          # "auto" | "grid" | "center"
    max_patches_per_image: int = 9,
    tile_size: int = 224,
) -> Path:
    """Export (and patchify) a HF dataset to:
      staging_root/dataset_key/images/<label_folder>/*.png

    IMPORTANT: We export PATCHES, not full-size images, to reduce domain shift.
    """
    from datasets import load_dataset  # type: ignore

    images_dir = staging_root / dataset_key / "images"
    marker = images_dir / ".complete"

    if marker.exists():
        print(f"[cache] {dataset_key} already exported:", images_dir)
        return images_dir

    meta = DATASET_REGISTRY[dataset_key]
    hf_id = hf_id or meta.get("hf_id")
    assert hf_id, f"Missing hf_id for {dataset_key}"

    print(f"Loading HF dataset {hf_id} split={split} ...")
    try:
        ds = load_dataset(hf_id, split=split, token=hf_token)
    except TypeError:
        ds = load_dataset(hf_id, split=split, use_auth_token=hf_token)


    n = len(ds)
    idxs = np.arange(n)
    rng = np.random.default_rng(seed)
    rng.shuffle(idxs)

    exported = 0
    for i in tqdm(idxs, desc=f"Export {dataset_key}"):
        row = ds[int(i)]
        img = row[img_col]
        label_val = row[label_col]
        label_name = _label_to_str(ds, label_col, label_val)

        row_dict = dict(row)
        row_dict[label_col] = label_name  # replace numeric with string label for folder naming

        # --- V3: decode any ClassLabel extra columns (e.g., organ) to strings so ontology doesn't see 0/1 ---
        # This fixes LC25000 'organ' being exported as 0/1 instead of Lung/Colorectal.
        for _col, _val in list(row_dict.items()):
            if _col in (img_col,):
                continue
            try:
                _feat = ds.features.get(_col)
                if hasattr(_feat, "int2str") and isinstance(_val, (int, np.integer, str)) and str(_val).isdigit():
                    row_dict[_col] = str(_feat.int2str(int(_val)))
            except Exception:
                pass

        if label_folder_fn is None:
            lbl = str(label_name)
        else:
            lbl = str(label_folder_fn(row_dict))

        lbl = normalize_folder_label(lbl)

        patches = _extract_patches(
            img,
            tile_size=tile_size,
            tile_mode=tile_mode,
            max_patches=max_patches_per_image,
        )

        for p_idx, patch in enumerate(patches):
            out_path = images_dir / lbl / f"{dataset_key}_{split}_i{int(i)}_p{p_idx}.png"
            save_pil(patch, out_path)
            exported += 1
            if max_items is not None and exported >= max_items:
                break

        if max_items is not None and exported >= max_items:
            break

    marker.write_text(
        f"done\nhf_id={hf_id}\nsplit={split}\nn_patches={exported}\n"
    )
    print(f"Exported patches: {exported}")
    return images_dir


def download_orca_to_staging(dataset_key: str, staging_root: Path, *, overwrite: bool=False) -> Path:
    """Downloads ORCA Google Drive folder and normalizes to:
      staging_root / dataset_key / images / oral__oscc / *.png
    """
    meta = DATASET_REGISTRY[dataset_key]
    gdrive_url = meta["gdrive_url"]

    out_root = staging_root / dataset_key
    images_dir = out_root / "images"
    marker = images_dir / ".complete"
    if marker.exists() and not overwrite:
        print(f"[cache] {dataset_key} already downloaded:", images_dir)
        return images_dir

    out_root.mkdir(parents=True, exist_ok=True)
    tmp_dir = out_root / "_tmp_orca"
    if tmp_dir.exists():
        shutil.rmtree(tmp_dir)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    print("Downloading ORCA from Google Drive ...")
    # requires gdown
    import gdown  # type: ignore
    gdown.download_folder(gdrive_url, output=str(tmp_dir), quiet=False, use_cookies=False)

    # find images
    imgs = []
    for ext in ("*.png","*.jpg","*.jpeg","*.tif","*.tiff"):
        imgs.extend(tmp_dir.rglob(ext))

    target = images_dir / "oral__oscc"
    target.mkdir(parents=True, exist_ok=True)
    kept = 0
    for p in imgs:
        if "mask" in p.name.lower() or "label" in p.name.lower():
            continue
        shutil.copy2(p, target / p.name)
        kept += 1

    marker.write_text(f"done\nn_images={kept}\n")
    print(f"ORCA images kept: {kept}")
    return images_dir


def build_items_from_images_dir(images_dir: Path, dataset_key: str, split: str, mpp: float = 0.5) -> pd.DataFrame:
    exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff")
    paths = [p for p in images_dir.rglob("*") if p.suffix.lower() in exts]
    rows = []
    for p in paths:
        raw_label = p.parent.name
        rows.append(
            {
                "dataset_key": dataset_key,
                "split": split,
                "raw_label": raw_label,
                "label": raw_label,  # alias
                "image_path": str(p),
                "width": None,
                "height": None,
                "mpp": mpp,
                "item_id": p.stem,
            }
        )
    return pd.DataFrame(rows)

In [5]:
#@title 4B) Ingest all enabled datasets into a Unified Atlas table

from histo_cartography.datasets import prepare_dataset_to_staging  # re-use existing robust logic

unified_parts = []

# Always keep raw downloads under DATA_ROOT/data_raw (clean separation)
RAW_DIR = Path(DATA_ROOT) / "data_raw"
RAW_DIR.mkdir(parents=True, exist_ok=True)

# 1) CRC + MedMNIST + PCAM are handled by prepare_dataset_to_staging

def ingest_prepared(dataset_key: str) -> pd.DataFrame:
    ds_staging = Path(STAGING_DIR) / dataset_key
    items_df, images_dir = prepare_dataset_to_staging(
        dataset_key,
        raw_dir=RAW_DIR,
        staging_dir=ds_staging,
        split=SPLIT,
        safe_mode=SAFE_MODE,
        max_items=MAX_ITEMS_PER_DATASET,
        seed=SEED,
        overwrite=False,
        allow_large=False,
        verify_md5=False,
        mpp=0.5,
        use_text_modality=False,  # we build semantic_text later via ontology
    )
    # Ensure required columns
    items_df = items_df.copy()
    items_df["dataset_key"] = dataset_key
    items_df["staging_images_dir"] = str(images_dir)
    return items_df

for k in ["CRC_VAL_HE_7K", "NCT_CRC_HE_100K", "MEDMNIST_PATHMNIST", "HF_PCAM"]:
    if ENABLE.get(k, False):
        print(f"\n▶ Ingesting {k} ...")
        unified_parts.append(ingest_prepared(k))

# 2) Additional HF datasets use helper exporter (cell 4A)

for k in ["HF_LC25000", "HF_BACH", "HF_BREAKHIS_RCL_7500", "ORCA_ORAL_ANNOTATED_100"]:
    if not ENABLE.get(k, False):
        continue

    ds_staging = Path(STAGING_DIR) / k
    ds_staging.mkdir(parents=True, exist_ok=True)

    # Inline helpers to avoid NameError if not defined elsewhere
    def bach_folder_safe(row):
        return f"breast__{row.get('label', 'unknown')}"

    def breakhis_folder_safe(row):
        mag = row.get('magnification', 'unknown')
        lbl = row.get('label', 'unknown')
        return f"breast__{lbl}__{mag}"

    if k == "HF_LC25000":
        images_dir = export_hf_dataset_to_staging(
            dataset_key=k,
            staging_root=ds_staging,          # FIXED: staging_dir -> staging_root
            hf_id="1aurent/LC25000",
            split=SPLIT,
            img_col="image",                  # FIXED: image_col -> img_col
            label_col="label",
            max_items=MAX_ITEMS_PER_DATASET,  # FIXED: max_images -> max_items
            seed=SEED,
            tile_size=224,
            max_patches_per_image=4,          # FIXED: tiles_per_image -> max_patches_per_image
            label_folder_fn=lambda row: f"{row.get('organ','unknown')}__{row.get('label','unknown')}",
        )
        # FIXED: build_items_table_from_images_dir -> build_items_from_images_dir
        # FIXED: source -> dataset_key
        items_df = build_items_from_images_dir(images_dir, dataset_key=k, split=SPLIT, mpp=0.5)

    elif k == "HF_BACH":
        images_dir = export_hf_dataset_to_staging(
            dataset_key=k,
            staging_root=ds_staging,
            hf_id="1aurent/bach",
            split=SPLIT,
            img_col="image",
            label_col="label",
            max_items=MAX_ITEMS_PER_DATASET,
            seed=SEED,
            tile_size=224,
            max_patches_per_image=4,
            label_folder_fn=bach_folder_safe,
        )
        items_df = build_items_from_images_dir(images_dir, dataset_key=k, split=SPLIT, mpp=0.5)

    elif k == "HF_BREAKHIS_RCL_7500":
        images_dir = export_hf_dataset_to_staging(
            dataset_key=k,
            staging_root=ds_staging,
            hf_id="1aurent/breakhis",
            split=SPLIT,
            img_col="image",
            label_col="label",
            max_items=MAX_ITEMS_PER_DATASET,
            seed=SEED,
            tile_size=224,
            max_patches_per_image=4,
            label_folder_fn=breakhis_folder_safe,
        )
        items_df = build_items_from_images_dir(images_dir, dataset_key=k, split=SPLIT, mpp=0.5)

    elif k == "ORCA_ORAL_ANNOTATED_100":
        # Ensure download runs
        images_dir = download_orca_to_staging(k, STAGING_DIR, overwrite=False)
        items_df = build_items_from_images_dir(images_dir, dataset_key=k, split=SPLIT, mpp=0.5)

    else:
        continue

    items_df = items_df.copy()
    items_df["dataset_key"] = k
    items_df["staging_images_dir"] = str(images_dir)
    unified_parts.append(items_df)

# Merge
unified_atlas = pd.concat(unified_parts, ignore_index=True) if unified_parts else pd.DataFrame()
print("\nUnified atlas rows:", len(unified_atlas))

# Basic columns
if len(unified_atlas):
    unified_atlas["image_path"] = unified_atlas["image_path"].astype(str)
    unified_atlas["raw_label"] = unified_atlas.get("raw_label", unified_atlas.get("label")).astype(str)
    unified_atlas["label"] = unified_atlas.get("label", unified_atlas.get("raw_label")).astype(str)

unified_atlas.head()


▶ Ingesting CRC_VAL_HE_7K ...

▶ Ingesting MEDMNIST_PATHMNIST ...
[cache] HF_LC25000 already exported: /content/drive/MyDrive/mit/histopathology_mantis_20260115/data_staging/HF_LC25000/HF_LC25000/images
[cache] HF_BACH already exported: /content/drive/MyDrive/mit/histopathology_mantis_20260115/data_staging/HF_BACH/HF_BACH/images
[cache] HF_BREAKHIS_RCL_7500 already exported: /content/drive/MyDrive/mit/histopathology_mantis_20260115/data_staging/HF_BREAKHIS_RCL_7500/HF_BREAKHIS_RCL_7500/images

Unified atlas rows: 3712


Unnamed: 0,item_id,source,split,label,text,image_path,width,height,mpp,dataset_key,staging_images_dir,raw_label
0,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AEDALKHL,CRC_VAL_HE_7K,train,ADI,,/content/drive/MyDrive/mit/histopathology_mant...,224,224,0.5,CRC_VAL_HE_7K,/content/drive/MyDrive/mit/histopathology_mant...,
1,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AGWWSHFM,CRC_VAL_HE_7K,train,ADI,,/content/drive/MyDrive/mit/histopathology_mant...,224,224,0.5,CRC_VAL_HE_7K,/content/drive/MyDrive/mit/histopathology_mant...,
2,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AIQQNFEC,CRC_VAL_HE_7K,train,ADI,,/content/drive/MyDrive/mit/histopathology_mant...,224,224,0.5,CRC_VAL_HE_7K,/content/drive/MyDrive/mit/histopathology_mant...,
3,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AKVLMQER,CRC_VAL_HE_7K,train,ADI,,/content/drive/MyDrive/mit/histopathology_mant...,224,224,0.5,CRC_VAL_HE_7K,/content/drive/MyDrive/mit/histopathology_mant...,
4,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-ASWYCRFC,CRC_VAL_HE_7K,train,ADI,,/content/drive/MyDrive/mit/histopathology_mant...,224,224,0.5,CRC_VAL_HE_7K,/content/drive/MyDrive/mit/histopathology_mant...,


## 5) Supervised Ontology Mapper (>5 levels)

We **do not** use unsupervised cluster IDs as semantic labels.

Instead, we map every row to a **7‑level semantic path**:

1. **Domain** (Tissue vs Artifact)  
2. **Organ System** (Colon, Lung, Breast, Lymph Node, Oral Cavity, …)  
3. **General Pathology** (Neoplastic, Non‑Neoplastic, Inflammatory, Necrosis, Artifact)  
4. **Tissue Structure** (Epithelium, Stroma, Immune, Muscle, Extracellular, Background, …)  
5. **Specific Class** (Adenocarcinoma, Lymphocytes, Adipose Tissue, …)  
6. **Subtype / Context** (Invasive vs In‑situ; Mets+/−; dataset‑specific)  
7. **Diagnostic Modifier** (**mocked deterministically** if not provided)

All 7 columns are exported as `cluster_l1 ... cluster_l7`.


In [6]:
#@title 5) Ontology mapper (V5: strict 7-level, lowercase cluster_l*, NO Unknown/Unspecified)

import re
import numpy as np

# --- FIX: Bridge variable name from Step 4 ---
if 'unified_atlas' in locals():
    atlas = unified_atlas.copy()
    print(f"Initialized 'atlas' from 'unified_atlas' ({len(atlas)} rows).")
elif 'atlas' not in locals():
    raise NameError("Neither 'unified_atlas' nor 'atlas' is defined. Please run Step 4 first.")

# -------------------------
# Helpers
# -------------------------

def _norm(s) -> str:
    s = str(s) if s is not None else ""
    s = s.strip().lower()
    s = s.replace("_", " ")
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _title(s) -> str:
    s = str(s) if s is not None else ""
    s = re.sub(r"\s+", " ", s).strip()
    return s[:1].upper() + s[1:] if s else s

def _mock_grade(key: str) -> str:
    # deterministic, stable across runs for same id
    h = abs(hash(key)) % 3
    return ["Modifier: Low grade (mock)", "Modifier: Intermediate grade (mock)", "Modifier: High grade (mock)"][h]

# -------------------------
# Canonical CRC (Kather) label handling
# -------------------------
CRC_CODES = {"ADI","BACK","DEB","LYM","MUC","MUS","NORM","STR","TUM"}

def canonical_crc_code(label: str) -> str:
    t = _norm(label)
    t_up = t.upper()
    if t_up in CRC_CODES:
        return t_up

    # descriptive label → code (supports MedMNIST / non-code folder names)
    if any(x in t for x in ["background", "empty", "glass"]):
        return "BACK"
    if any(x in t for x in ["adipose", "fat", "adipocyte"]):
        return "ADI"
    if any(x in t for x in ["debris", "necros", "detrit", "dirt"]):
        return "DEB"
    if any(x in t for x in ["lymph", "immune", "lymphocyte"]):
        return "LYM"
    if any(x in t for x in ["mucus", "mucin"]):
        return "MUC"
    if "muscle" in t or "muscular" in t:
        return "MUS"
    if "stroma" in t or "fibro" in t or "connective" in t:
        return "STR"
    if ("normal" in t and "mucosa" in t) or "normal mucosa" in t or "normal colon" in t:
        return "NORM"
    if any(x in t for x in ["tumor", "adenocarcinoma", "carcinoma epithelium", "colorectal adenocarcinoma"]):
        return "TUM"

    # fallback: if label is a single token and close to a code
    if t_up.startswith("NORM"):
        return "NORM"

    return "TUM" if "carcinoma" in t else "BACK" if "no tissue" in t else "STR" if "strom" in t else "NORM"

# 7-level paths for CRC-like datasets (CRC_VAL_HE_7K, NCT_CRC_HE_100K, MedMNIST PathMNIST)
CRC_PATH = {
    "ADI":  ("Biological Tissue", "Mesenchymal", "Colorectal", "Normal/Benign", "Adipose Tissue", "Pattern: Adipocyte-rich stroma", "Modifier: Not applicable"),
    "BACK": ("Non-Tissue / Artifact", "Artifact", "Colorectal", "Not Applicable", "Background / Empty Slide", "Pattern: No tissue", "Modifier: Not applicable"),
    "DEB":  ("Biological Tissue", "Extracellular Substance", "Colorectal", "Inflammatory/Reactive", "Necrotic Debris", "Pattern: Detritus / necrosis", "Modifier: Not applicable"),
    "LYM":  ("Biological Tissue", "Hematolymphoid", "Colorectal", "Inflammatory/Reactive", "Lymphocytes", "Pattern: Lymphoid infiltrate", "Modifier: Not applicable"),
    "MUC":  ("Biological Tissue", "Extracellular Substance", "Colorectal", "Structural", "Mucus", "Pattern: Extracellular mucin", "Modifier: Not applicable"),
    "MUS":  ("Biological Tissue", "Mesenchymal", "Colorectal", "Structural", "Smooth Muscle", "Pattern: Muscularis", "Modifier: Not applicable"),
    "NORM": ("Biological Tissue", "Epithelial", "Colorectal", "Normal/Benign", "Normal Mucosa", "Pattern: Crypt architecture", "Modifier: Not applicable"),
    "STR":  ("Biological Tissue", "Mesenchymal", "Colorectal", "Normal/Benign", "Cancer-Associated Stroma", "Pattern: Fibroblast-rich stroma", "Modifier: Not applicable"),
    "TUM":  ("Biological Tissue", "Epithelial", "Colorectal", "Malignant/Invasive", "Adenocarcinoma", "Pattern: Gland-forming carcinoma", None),
}

# -------------------------
# Other dataset parsers
# -------------------------

def parse_pcam(label: str, item_id: str):
    t = _norm(label)
    is_pos = (t in {"1", "true", "tumor", "positive"})
    if is_pos:
        return (
            "Biological Tissue", "Epithelial", "Lymph Node", "Malignant/Invasive",
            "Metastatic Carcinoma", "Pattern: Metastatic focus", _mock_grade(item_id)
        )
    return (
        "Biological Tissue", "Epithelial", "Lymph Node", "Normal/Benign",
        "Normal Lymph Node Tissue", "Pattern: Lymph node parenchyma", "Modifier: Not applicable"
    )


def parse_lc25000(folder_label: str, item_id: str):
    s = _norm(folder_label)
    parts = s.split(" ")
    # original staging uses "<organ_code>__<class>" (underscores converted in _norm)
    # so reconstruct using raw string too
    raw = str(folder_label).lower()
    if "__" in raw:
        p = raw.split("__", 1)
        org_tok = p[0].strip()
        cls_tok = p[1].strip()
    else:
        org_tok, cls_tok = "", raw

    # Map organ code: by inspection, 0__squamous_carcinomas corresponds to Lung
    if org_tok in {"0", "lung"}:
        organ = "Lung"
    elif org_tok in {"1", "colon", "colorectal"}:
        organ = "Colorectal"
    else:
        # infer from class token
        organ = "Lung" if "squamous" in cls_tok else "Colorectal" if "colon" in cls_tok else "Generic"

    if "squamous" in cls_tok:
        return (
            "Biological Tissue", "Epithelial", organ, "Malignant/Invasive",
            "Squamous Cell Carcinoma", "Pattern: Squamous differentiation", _mock_grade(item_id)
        )
    if "adeno" in cls_tok:
        return (
            "Biological Tissue", "Epithelial", organ, "Malignant/Invasive",
            "Adenocarcinoma", "Pattern: Gland-forming carcinoma", _mock_grade(item_id)
        )
    # benign
    return (
        "Biological Tissue", "Epithelial", organ, "Normal/Benign",
        "Benign/Normal Tissue", "Pattern: Benign parenchyma", "Modifier: Not applicable"
    )


def parse_bach(folder_label: str, item_id: str):
    s = _norm(folder_label)
    # staging uses breast__{label}
    if "invasive" in s:
        return ("Biological Tissue", "Epithelial", "Breast", "Malignant/Invasive", "Invasive Carcinoma", "Pattern: Invasive nests", _mock_grade(item_id))
    if "in situ" in s or "insitu" in s:
        return ("Biological Tissue", "Epithelial", "Breast", "Pre-Malignant/In Situ", "Carcinoma in situ", "Pattern: In situ ducts", "Modifier: Not applicable")
    if "benign" in s:
        return ("Biological Tissue", "Epithelial", "Breast", "Normal/Benign", "Benign Breast Lesion", "Pattern: Benign glands", "Modifier: Not applicable")
    # normal
    return ("Biological Tissue", "Epithelial", "Breast", "Normal/Benign", "Normal Breast Tissue", "Pattern: Normal ducts/lobules", "Modifier: Not applicable")


def parse_breakhis(folder_label: str, item_id: str):
    s = str(folder_label).lower()
    is_benign = ("__benign" in s) or ("benign" in s and "malignant" not in s)
    organ = "Breast"

    # subtype tokens if present
    subtype = "unspecified"
    m = re.search(r"__([^_]+)__", s)
    # better: split
    parts = s.split("__")
    # expected: breast__{group}__{subtype}__{mag}
    if len(parts) >= 4:
        subtype = parts[2] or "unspecified"
        mag = parts[3] or "mag_unknown"
    else:
        mag = "mag_unknown"

    # magnification cleanup
    mag = mag.replace("x", "X")
    if mag in {"mag_unknown", "unknown", "unspecified"}:
        mag_label = "Acquisition: Magnification unknown"
    else:
        mag_label = f"Acquisition: {mag}"

    if is_benign:
        entity = "Benign Breast Lesion"
        path_state = "Normal/Benign"
        pattern = f"Pattern: Benign lesion ({subtype})" if subtype != "unspecified" else "Pattern: Benign lesion"
        return ("Biological Tissue", "Epithelial", organ, path_state, entity, pattern, "Modifier: Not applicable" if mag_label is None else mag_label)

    # malignant
    entity = "Invasive Carcinoma"
    # subtype refinement
    if "ductal" in subtype:
        entity = "Ductal Carcinoma"
    elif "lobular" in subtype:
        entity = "Lobular Carcinoma"
    elif "mucin" in subtype:
        entity = "Mucinous Carcinoma"
    elif "pap" in subtype:
        entity = "Papillary Carcinoma"

    pattern = f"Pattern: {entity} ({subtype})" if subtype != "unspecified" else f"Pattern: {entity}"
    return ("Biological Tissue", "Epithelial", organ, "Malignant/Invasive", entity, pattern, _mock_grade(item_id) if mag_label.startswith("Acquisition") else _mock_grade(item_id))


def parse_orca(folder_label: str, item_id: str):
    s = _norm(folder_label)
    organ = "Oral Cavity"
    if "oscc" in s or "squamous" in s or "carcinoma" in s:
        return ("Biological Tissue", "Epithelial", organ, "Malignant/Invasive", "Squamous Cell Carcinoma", "Pattern: Squamous differentiation", _mock_grade(item_id))
    if "dysplasia" in s or "in situ" in s:
        return ("Biological Tissue", "Epithelial", organ, "Pre-Malignant/In Situ", "Dysplasia", "Pattern: Atypical epithelium", "Modifier: Not applicable")
    return ("Biological Tissue", "Epithelial", organ, "Normal/Benign", "Normal Oral Mucosa", "Pattern: Normal squamous mucosa", "Modifier: Not applicable")


# -------------------------
# Main mapping router
# -------------------------

def map_to_ontology(row) -> tuple:
    dk = str(row.get("dataset_key", ""))
    lab = row.get("label", row.get("raw_label", ""))
    raw = row.get("raw_label", lab)
    item_id = str(row.get("item_id"))

    if dk in {"CRC_VAL_HE_7K", "NCT_CRC_HE_100K", "MEDMNIST_PATHMNIST"}:
        code = canonical_crc_code(lab)
        l1,l2,l3,l4,l5,l6,l7 = CRC_PATH.get(code, CRC_PATH["NORM"])
        if l7 is None:
            l7 = _mock_grade(item_id)
        return (l1,l2,l3,l4,l5,l6,l7)

    if dk == "HF_PCAM":
        return parse_pcam(lab, item_id)

    if dk == "HF_LC25000":
        return parse_lc25000(raw, item_id)

    if dk == "HF_BACH":
        return parse_bach(raw, item_id)

    if dk == "HF_BREAKHIS_RCL_7500":
        return parse_breakhis(raw, item_id)

    if dk == "ORCA_ORAL_ANNOTATED_100":
        return parse_orca(raw, item_id)

    # Fallback (should be rare)
    leaf = _title(_norm(lab) or "unknown")
    return (
        "Biological Tissue",
        "Other Tissue",
        "Generic",
        "Unknown State",
        leaf,
        f"Pattern: {leaf}",
        "Modifier: Not provided",
    )

# Apply mapping
mapped = atlas.apply(map_to_ontology, axis=1, result_type="expand")
mapped.columns = [f"cluster_l{i}" for i in range(1,8)]
atlas = pd.concat([atlas, mapped], axis=1)

# Build semantic_text / text (NO dataset tokens)

def build_semantic_text(r) -> str:
    return (
        f"Histopathology image of {r['cluster_l2']} tissue from the {r['cluster_l3']}, "
        f"representing {r['cluster_l4']} {r['cluster_l5']}. "
        f"{r['cluster_l6']}. {r['cluster_l7']}."
    )

atlas["semantic_text"] = atlas.apply(build_semantic_text, axis=1)

# Mantis "text" can be longer (flashcards) — still avoid dataset/split tokens
atlas["text"] = atlas.apply(
    lambda r: (
        f"{r['semantic_text']} "
        f"This node is organized under the ontology path: "
        f"{r['cluster_l1']} > {r['cluster_l2']} > {r['cluster_l3']} > {r['cluster_l4']} > {r['cluster_l5']} > {r['cluster_l6']} > {r['cluster_l7']}."
    ),
    axis=1,
)

# Temporary title base (final unique titles are set during export)
atlas["title"] = atlas.apply(lambda r: f"{r['cluster_l3']} | {r['cluster_l5']}", axis=1)

atlas[["dataset_key","raw_label","label"] + [f"cluster_l{i}" for i in range(1,8)]].head()

# --- (PATCH) Add ontology_path, stable cluster_id, and metadata JSON for downstream Mantis steps
import json

# 1) Deterministic ontology path (7-level)
cluster_cols = [f"cluster_l{i}" for i in range(1,8) if f"cluster_l{i}" in atlas.columns]
if cluster_cols:
    atlas["ontology_path"] = atlas[cluster_cols].astype(str).agg(" > ".join, axis=1)
else:
    atlas["ontology_path"] = ""

# 2) Canonical cluster key + stable numeric cluster_id
# Prefer the full ontology path; fall back to existing 'cluster' if present
if "ontology_path" in atlas.columns and atlas["ontology_path"].astype(str).str.len().gt(0).any():
    atlas["cluster_key"] = atlas["ontology_path"].astype(str)
elif "cluster" in atlas.columns:
    atlas["cluster_key"] = atlas["cluster"].astype(str)
else:
    atlas["cluster_key"] = "cluster_unknown"

atlas["cluster_id"] = pd.factorize(atlas["cluster_key"], sort=True)[0].astype(int)

# 3) Ensure a 'cluster' column exists (Mantis-friendly default)
# (We keep this as a human-readable coarse label; agentic naming later overwrites with cluster_name if desired.)
if "cluster" not in atlas.columns:
    # Prefer organ + pathology_state + entity
    parts = []
    for col in ["cluster_l3", "cluster_l5", "cluster_l4"]:
        if col in atlas.columns:
            parts.append(atlas[col].astype(str))
    if parts:
        atlas["cluster"] = pd.concat(parts, axis=1).agg(" — ".join, axis=1)
    else:
        atlas["cluster"] = atlas["cluster_key"]

# 4) Build metadata JSON column if missing
# We only include fields that exist to avoid KeyErrors
if "metadata" not in atlas.columns:
    def _row_meta(r):
        m = {}
        for k in [
            "dataset_key","source","split","label","raw_label",
            "image_path","width","height","mpp",
            "item_id","id"
        ]:
            if k in r and pd.notna(r[k]):
                # cast numpy scalars -> python
                v = r[k]
                try:
                    import numpy as _np
                    if isinstance(v, (_np.generic,)):
                        v = v.item()
                except Exception:
                    pass
                m[k] = v
        return json.dumps(m, ensure_ascii=False)

    atlas["metadata"] = [ _row_meta(row) for _, row in atlas.iterrows() ]
else:
    # Ensure metadata is a valid JSON string for every row
    def _safe_json_dump(x):
        if isinstance(x, str):
            return x
        try:
            return json.dumps(x, ensure_ascii=False)
        except Exception:
            return "{}"
    atlas["metadata"] = atlas["metadata"].apply(_safe_json_dump)

print("✅ PATCH applied: ontology_path, cluster_id, metadata ready")



Initialized 'atlas' from 'unified_atlas' (3712 rows).
✅ PATCH applied: ontology_path, cluster_id, metadata ready


## 6) Hybrid embeddings (Visual + Semantic)

We compute:

- **Visual embeddings:** ResNet50 (torchvision) by default  
- **Semantic embeddings:** Sentence‑Transformers on the ontology‑rich `text` field  

Then we fuse them (default **concatenation**) and reduce with PCA to a manageable dimension for CSV export.


In [7]:
#@title 6A) Visual embeddings (ResNet50) — batch mode

from histo_cartography.embeddings import embed_images_resnet50

img_emb = embed_images_resnet50(
    atlas[["item_id","image_path"]].copy(),
    image_col="image_path",
    batch_size=64,
    max_items=None,
)
img_emb = img_emb[["item_id","vector"]].rename(columns={"vector":"vec_img"})
print("img_emb:", img_emb.shape)
img_emb.head()


img_emb: (3712, 2)


Unnamed: 0,item_id,vec_img
0,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AEDALKHL,"[0.5608727335929871, 0.07642041891813278, 0.50..."
1,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AGWWSHFM,"[0.37905678153038025, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AIQQNFEC,"[0.09872574359178543, 0.031486280262470245, 0...."
3,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AKVLMQER,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0190953630954..."
4,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-ASWYCRFC,"[0.0, 0.015149412676692009, 0.1518165767192840..."


In [8]:
#@title 6B) Semantic embeddings (Sentence-Transformers) — IMPORTANT: use semantic_text (no dataset tokens)

from sentence_transformers import SentenceTransformer

text_model = SentenceTransformer(TEXT_MODEL)

texts = atlas["semantic_text"].tolist()
item_ids = atlas["item_id"].tolist()

vec_txt = text_model.encode(
    texts,
    batch_size=64,
    show_progress_bar=True,
    normalize_embeddings=True,
)

txt_emb = pd.DataFrame({"item_id": item_ids, "vec_txt": [v.astype(np.float32) for v in vec_txt]})
print("txt_emb:", txt_emb.shape)
txt_emb.head()




Batches:   0%|          | 0/58 [00:00<?, ?it/s]

txt_emb: (3712, 2)


Unnamed: 0,item_id,vec_txt
0,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AEDALKHL,"[0.064286225, 0.05904052, -0.016424108, 0.0262..."
1,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AGWWSHFM,"[0.064286225, 0.05904052, -0.016424108, 0.0262..."
2,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AIQQNFEC,"[0.064286225, 0.05904052, -0.016424108, 0.0262..."
3,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AKVLMQER,"[0.064286225, 0.05904052, -0.016424108, 0.0262..."
4,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-ASWYCRFC,"[0.064286225, 0.05904052, -0.016424108, 0.0262..."


In [9]:
#@title 6C) Fuse embeddings + reduce dimension (+dataset alignment) + kNN neighbors + anchors

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Merge atlas + embeddings
# img_emb: item_id, vec_img (2048)
# txt_emb: item_id, vec_txt (384)
df = atlas.merge(img_emb, on="item_id", how="left").merge(txt_emb, on="item_id", how="left")

def _to_vec(v, dim: int) -> np.ndarray:
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return np.zeros(dim, dtype=np.float32)
    arr = np.array(v, dtype=np.float32)
    if arr.ndim != 1 or arr.shape[0] != dim:
        return np.zeros(dim, dtype=np.float32)
    return arr

df["vec_img"] = df["vec_img"].apply(lambda v: _to_vec(v, 2048))
df["vec_txt"] = df["vec_txt"].apply(lambda v: _to_vec(v, 384))

V_img = np.stack(df["vec_img"].to_list())
V_txt = np.stack(df["vec_txt"].to_list())

# L2 normalize each modality
V_img = normalize(V_img)
V_txt = normalize(V_txt)

# Fuse
if FUSION_MODE != "concat":
    raise ValueError("Only FUSION_MODE='concat' is supported in this notebook.")

V = np.concatenate([CONCAT_IMAGE_WEIGHT * V_img, CONCAT_TEXT_WEIGHT * V_txt], axis=1)
print("Fused dim:", V.shape)

# Reduce dimension (whitened PCA helps de-emphasize style dimensions)
if REDUCE_DIM is not None and V.shape[1] > REDUCE_DIM:
    pca = PCA(n_components=REDUCE_DIM, random_state=SEED, whiten=True)
    V_red = pca.fit_transform(V)
else:
    V_red = V

# Global normalize
V_red = normalize(V_red)

# Stronger domain alignment: mean+std alignment per dataset
if BATCH_ALIGN_BY_DATASET:
    eps = 1e-6
    V_aligned = V_red.copy()
    g_mu = V_aligned.mean(axis=0, keepdims=True)
    g_std = V_aligned.std(axis=0, keepdims=True) + eps

    for dk in df["dataset_key"].unique():
        idx = (df["dataset_key"] == dk).to_numpy()
        if idx.sum() < 10:
            continue
        mu = V_aligned[idx].mean(axis=0, keepdims=True)
        std = V_aligned[idx].std(axis=0, keepdims=True) + eps
        V_aligned[idx] = (V_aligned[idx] - mu) / std * g_std + g_mu

    V_red = normalize(V_aligned)

df["vector"] = [v.astype(np.float32) for v in V_red]
print("Reduced dim:", V_red.shape)

# -------------------------
# Optional: Add ontology anchor nodes (helps Mantis connect continents)
# -------------------------

def _make_anchor_rows(df_in: pd.DataFrame) -> pd.DataFrame:
    rows = []
    V_stack = np.stack(df_in["vector"].to_list())

    def _mean_vec(mask):
        m = mask.to_numpy()
        return V_stack[m].mean(axis=0).astype(np.float32)

    # 1) Organ anchors
    for org, g in df_in.groupby("cluster_l3"):
        v = np.mean(np.stack(g["vector"].to_list()), axis=0)
        rows.append({
            "item_id": f"ANCHOR::ORG::{org}",
            "dataset_key": "ANCHOR",
            "raw_label": f"anchor_org::{org}",
            "label": f"anchor_org::{org}",
            **{f"cluster_l{i}": ("Ontology Anchor" if i==1 else ("Anchor" if i==2 else org if i==3 else "All States" if i==4 else f"{org} (Organ Anchor)" if i==5 else "Pattern: Anchor" if i==6 else "Modifier: Anchor")) for i in range(1,8)},
            "semantic_text": f"Ontology anchor node for the organ: {org}.",
            "title": f"ANCHOR | Organ | {org}",
            "text": f"Ontology anchor for organ {org}.",
            "vector": v.astype(np.float32),
        })

    # 2) Organ+Superclass anchors
    for (org, sc), g in df_in.groupby(["cluster_l3","cluster_l2"]):
        if len(g) < 25:
            continue
        v = np.mean(np.stack(g["vector"].to_list()), axis=0)
        rows.append({
            "item_id": f"ANCHOR::ORG_SUPER::{org}::{sc}",
            "dataset_key": "ANCHOR",
            "raw_label": f"anchor_org_super::{org}::{sc}",
            "label": f"anchor_org_super::{org}::{sc}",
            "cluster_l1": "Ontology Anchor",
            "cluster_l2": sc,
            "cluster_l3": org,
            "cluster_l4": "All States",
            "cluster_l5": f"{sc} in {org} (Anchor)",
            "cluster_l6": "Pattern: Anchor",
            "cluster_l7": "Modifier: Anchor",
            "semantic_text": f"Ontology anchor for {sc} tissue lineage in the {org}.",
            "title": f"ANCHOR | {org} | {sc}",
            "text": f"Ontology anchor for {sc} lineage in {org}.",
            "vector": v.astype(np.float32),
        })

    # 3) Global superclass anchors
    for sc, g in df_in.groupby("cluster_l2"):
        if len(g) < 25:
            continue
        v = np.mean(np.stack(g["vector"].to_list()), axis=0)
        rows.append({
            "item_id": f"ANCHOR::SUPER::{sc}",
            "dataset_key": "ANCHOR",
            "raw_label": f"anchor_super::{sc}",
            "label": f"anchor_super::{sc}",
            "cluster_l1": "Ontology Anchor",
            "cluster_l2": sc,
            "cluster_l3": "Generic",
            "cluster_l4": "All States",
            "cluster_l5": f"{sc} (Superclass Anchor)",
            "cluster_l6": "Pattern: Anchor",
            "cluster_l7": "Modifier: Anchor",
            "semantic_text": f"Ontology anchor for {sc} tissue lineage across organs.",
            "title": f"ANCHOR | Superclass | {sc}",
            "text": f"Ontology anchor for {sc}.",
            "vector": v.astype(np.float32),
        })

    return pd.DataFrame(rows)

if ADD_ANCHORS:
    anchors_df = _make_anchor_rows(df)
    df = pd.concat([df, anchors_df], ignore_index=True)
    print(f"Added {len(anchors_df)} anchor nodes.")
else:
    print("ADD_ANCHORS=False (skipping anchors)")

# -------------------------
# kNN neighbors (explicit relatedness, used by healthcheck KG)
# IMPORTANT: computed on final vectors
# -------------------------
from sklearn.neighbors import NearestNeighbors
V_for_knn = np.stack(df["vector"].to_list())
knn_k = 12
knn = NearestNeighbors(n_neighbors=knn_k+1, metric="cosine").fit(V_for_knn)
dists, nbr_idx = knn.kneighbors(V_for_knn)
ids = df["item_id"].astype(str).to_numpy()
for j in range(1, knn_k+1):
    df[f"nn{j}"] = ids[nbr_idx[:, j]]
    df[f"nn_dist{j}"] = dists[:, j].astype(np.float32)

print(f"Computed kNN neighbors (k={knn_k}).")

df[["item_id","dataset_key","raw_label","cluster_l2","cluster_l5"].copy()] if False else df[["item_id","dataset_key","raw_label","cluster_l2","cluster_l5"]].head()


Fused dim: (3712, 2432)
Reduced dim: (3712, 512)
Added 15 anchor nodes.
Computed kNN neighbors (k=12).


Unnamed: 0,item_id,dataset_key,raw_label,cluster_l2,cluster_l5
0,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AEDALKHL,CRC_VAL_HE_7K,,Mesenchymal,Adipose Tissue
1,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AGWWSHFM,CRC_VAL_HE_7K,,Mesenchymal,Adipose Tissue
2,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AIQQNFEC,CRC_VAL_HE_7K,,Mesenchymal,Adipose Tissue
3,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-AKVLMQER,CRC_VAL_HE_7K,,Mesenchymal,Adipose Tissue
4,CRC_VAL_HE_7K::train::ADI::ADI-TCGA-ASWYCRFC,CRC_VAL_HE_7K,,Mesenchymal,Adipose Tissue


## 7) (Optional) 2D layout

If your Mantis setup prefers precomputed coordinates, we compute `x`,`y` via UMAP on the fused vectors.

We also pass a **supervised target** (`cluster_l5`) to encourage separation by histologic class.


In [10]:
#@title 7) Compute 2D layout (UMAP) — supervised on tissue superclass for cross-dataset mixing

if COMPUTE_2D_LAYOUT:
    import umap

    # Handle case-sensitivity for the supervision column
    target_col = UMAP_SUPERVISE_ON
    if target_col not in df.columns:
        # Try to find a case-insensitive match (e.g. 'cluster_l2' -> 'cluster_L2')
        candidates = [c for c in df.columns if c.lower() == target_col.lower()]
        if candidates:
            print(f"⚠️ Column '{target_col}' not found. Using '{candidates[0]}' instead.")
            target_col = candidates[0]
        else:
            raise ValueError(f"UMAP_SUPERVISE_ON='{UMAP_SUPERVISE_ON}' not in df columns. Available: {list(df.columns)}")

    y_lbl = pd.Categorical(df[target_col]).codes

    reducer = umap.UMAP(
        n_neighbors=UMAP_N_NEIGHBORS,
        min_dist=UMAP_MIN_DIST,
        metric="cosine",
        random_state=SEED,
    )
    xy = reducer.fit_transform(np.stack(df["vector"].to_list()), y=y_lbl)
    df["x"] = xy[:,0].astype(float)
    df["y"] = xy[:,1].astype(float)
else:
    df["x"] = 0.0
    df["y"] = 0.0

df[["x","y"]].describe()

  warn(


Unnamed: 0,x,y
count,3727.0,3727.0
mean,23.99793,18.544479
std,12.227475,11.366479
min,-6.223482,-9.697048
25%,13.826935,12.0239
50%,30.644655,23.962517
75%,31.963781,25.911136
max,33.636322,27.836561


In [19]:

#@title 7B) Deterministic Tool Primitives (Demo1) — parsing, layout, neighbors, metadata

import os, json, re, math
import numpy as np
import pandas as pd

from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors

# Optional: UMAP (preferred). If missing, we fall back to PCA.
try:
    import umap
    _HAS_UMAP = True
except Exception:
    _HAS_UMAP = False

# ---------------------------
# JSON helpers
# ---------------------------

def _safe_json_load(x):
    if isinstance(x, dict):
        return x
    if not isinstance(x, str):
        return {}
    s = x.strip()
    if not s:
        return {}
    try:
        return json.loads(s)
    except Exception:
        return {}

def _safe_json_dump(obj):
    try:
        return json.dumps(obj, ensure_ascii=False)
    except Exception:
        return "{}"

# ---------------------------
# String cleanup / de-abbrev helpers
# ---------------------------

_ABBREV = {
    # Histopath / dataset-style tokens
    "he": "hematoxylin and eosin",
    "h&e": "hematoxylin and eosin",
    "adi": "adipose tissue",
    "lym": "lymphocyte",
    "deb": "debris",
    "tum": "tumor",
    "str": "stroma",
    "mus": "muscle",
    "muc": "mucosa",
    # Common high-level
    "crc": "colorectal cancer",
}

def _clean_text(s: str) -> str:
    s = "" if s is None else str(s)
    s = s.replace("\u00a0", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s

def _strip_special_chars(s: str, keep: str = r'A-Za-z0-9 _\-\.,:;\(\)\/\+\["]') -> str:
    """Remove exotic characters but keep punctuation helpful for Mantis tooltips."""
    s = _clean_text(s)
    # Replace anything not in allowed set with space
    s = re.sub(rf"[^{keep}]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def decode_tokens(s: str) -> str:
    """Best-effort de-abbreviation on identifiers like CRC_ADI_HE_7K."""
    s = _clean_text(s)
    # split on common separators
    toks = re.split(r"[_:\/\-\s]+", s)
    out = []
    for t in toks:
        if not t:
            continue
        key = t.lower()
        if key in _ABBREV:
            out.append(_ABBREV[key])
        else:
            out.append(t)
    # title-case but preserve TCGA-like codes
    def _title(tok):
        if re.fullmatch(r"TCGA|TCGA[\-0-9A-Z]+", tok):
            return tok
        if tok.isupper() and len(tok) <= 6:
            return tok
        return tok[:1].upper() + tok[1:]
    return " ".join(_title(x) for x in out)

# ---------------------------
# Vector parsing
# ---------------------------

def parse_vector_cell(v):
    """Accept list/np.ndarray or stringified list and return 1D float32 array."""
    if v is None or (isinstance(v, float) and math.isnan(v)):
        return None
    if isinstance(v, (list, tuple, np.ndarray)):
        arr = np.asarray(v, dtype=np.float32).ravel()
        return arr
    if not isinstance(v, str):
        try:
            arr = np.asarray(v, dtype=np.float32).ravel()
            return arr
        except Exception:
            return None
    s = v.strip()
    if not s:
        return None

    # JSON list
    if s.startswith("[") and s.endswith("]"):
        try:
            obj = json.loads(s)
            arr = np.asarray(obj, dtype=np.float32).ravel()
            return arr
        except Exception:
            pass

    # Space/comma separated numbers
    s2 = s.replace("[", " ").replace("]", " ").replace("(", " ").replace(")", " ")
    s2 = s2.replace("\n", " ").replace("\t", " ")
    parts = [p for p in re.split(r"[ ,]+", s2.strip()) if p]
    try:
        arr = np.asarray([float(p) for p in parts], dtype=np.float32).ravel()
        return arr
    except Exception:
        return None


def vectors_to_matrix(df: pd.DataFrame, vector_col: str = "vector"):
    """Parse df[vector_col] into a (n,d) float32 matrix; coerce mismatched dims via pad/truncate."""
    vecs = [parse_vector_cell(v) for v in df[vector_col].tolist()]
    lens = [len(v) if v is not None else 0 for v in vecs]
    # Choose the modal dimension
    from collections import Counter
    dim_counts = Counter([l for l in lens if l > 0])
    if not dim_counts:
        raise ValueError(f"No valid vectors in column '{vector_col}'.")
    d = dim_counts.most_common(1)[0][0]

    fixed = []
    bad = 0
    for v in vecs:
        if v is None or len(v) == 0:
            bad += 1
            fixed.append(np.zeros((d,), dtype=np.float32))
            continue
        if len(v) < d:
            bad += 1
            vv = np.zeros((d,), dtype=np.float32)
            vv[:len(v)] = v
            fixed.append(vv)
        elif len(v) > d:
            bad += 1
            fixed.append(v[:d].astype(np.float32))
        else:
            fixed.append(v.astype(np.float32))

    X = np.stack(fixed, axis=0)
    # Replace NaN/Inf
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    return X, d, bad


def matrix_to_vector_str(X: np.ndarray) -> list:
    """Serialize each row of X to a compact JSON list string for CSV."""
    return [json.dumps(row.tolist(), ensure_ascii=False) for row in X]

# ---------------------------
# Tool: promote metadata fields
# ---------------------------

def tool_promote_metadata(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    if "metadata" not in df.columns:
        # Create minimal metadata from present columns
        def _mk_meta(r):
            m = {}
            for k in ["dataset_key","source","split","label","raw_label","image_path","width","height","mpp"]:
                if k in r and pd.notna(r[k]):
                    m[k] = r[k]
            return _safe_json_dump(m)
        df["metadata"] = [ _mk_meta(row) for _, row in df.iterrows() ]

    meta = df["metadata"].astype(str).map(_safe_json_load)

    def _get(m, *keys, default=None):
        for k in keys:
            if k in m and m.get(k) not in [None, "", "nan", "NaN"]:
                return m.get(k)
        return default

    df["source"] = meta.map(lambda m: _get(m, "dataset_key", "source", default="unknown"))
    df["split"] = meta.map(lambda m: _get(m, "split", default="unknown"))

    # Label fallback order: label -> raw_label -> cluster_l5 -> cluster
    df["label"] = meta.map(lambda m: _get(m, "label", "raw_label"))
    if df["label"].isna().all() or (df["label"].astype(str).str.lower() == "nan").all():
        if "cluster_l5" in df.columns:
            df["label"] = df["cluster_l5"]
        elif "cluster" in df.columns:
            df["label"] = df["cluster"]
        else:
            df["label"] = "unknown"

    df["image_path"] = meta.map(lambda m: _get(m, "image_path"))
    df["width"] = meta.map(lambda m: _get(m, "width"))
    df["height"] = meta.map(lambda m: _get(m, "height"))
    df["mpp"] = meta.map(lambda m: _get(m, "mpp"))

    # write back cleaned metadata (ensures JSON validity)
    df["metadata"] = [ _safe_json_dump(m) for m in meta ]
    return df

# ---------------------------
# Tool: compute x,y layout (UMAP preferred, PCA fallback)
# ---------------------------

def tool_compute_layout(df: pd.DataFrame, vector_col: str = "vector", seed: int = 42) -> pd.DataFrame:
    df = df.copy()
    if "x" in df.columns and "y" in df.columns:
        # Ensure finite
        x = pd.to_numeric(df["x"], errors="coerce")
        y = pd.to_numeric(df["y"], errors="coerce")
        if x.notna().all() and y.notna().all() and np.isfinite(x).all() and np.isfinite(y).all():
            return df

    X, d, bad = vectors_to_matrix(df, vector_col=vector_col)
    X = normalize(X, norm="l2", axis=1)

    if _HAS_UMAP:
        reducer = umap.UMAP(
            n_components=2,
            n_neighbors=30,
            min_dist=0.15,
            metric="cosine",
            random_state=seed,
        )
        XY = reducer.fit_transform(X)
    else:
        XY = PCA(n_components=2, random_state=seed).fit_transform(X)

    XY = np.nan_to_num(XY, nan=0.0, posinf=0.0, neginf=0.0)
    df["x"] = XY[:,0].astype(float)
    df["y"] = XY[:,1].astype(float)
    return df

# ---------------------------
# Tool: neighbors_json from existing nn columns (fast) OR recompute
# ---------------------------

def tool_build_neighbors_json(df: pd.DataFrame, topn: int = 5) -> pd.DataFrame:
    df = df.copy()

    # If nn1.. doesn't exist, compute cosine kNN from vectors
    nn_cols = [c for c in df.columns if re.fullmatch(r"nn\d+", c)]
    dist_cols = [c for c in df.columns if re.fullmatch(r"nn_dist\d+", c)]

    if not nn_cols or not dist_cols:
        X, d, bad = vectors_to_matrix(df, vector_col="vector")
        X = normalize(X, norm="l2", axis=1)
        k = max(12, topn)
        nn = NearestNeighbors(n_neighbors=min(k+1, len(df)), metric="cosine")
        nn.fit(X)
        dists, idxs = nn.kneighbors(X)

        # Drop self neighbor at index 0
        dists = dists[:,1:]
        idxs = idxs[:,1:]

        # Ensure id exists
        if "id" in df.columns:
            ids = df["id"].astype(str).tolist()
        else:
            ids = df.index.astype(str).tolist()

        for j in range(dists.shape[1]):
            df[f"nn{j+1}"] = [ ids[ii] for ii in idxs[:,j] ]
            df[f"nn_dist{j+1}"] = dists[:,j].astype(float)

        nn_cols = [f"nn{i}" for i in range(1, dists.shape[1]+1)]
        dist_cols = [f"nn_dist{i}" for i in range(1, dists.shape[1]+1)]

    # Build top-N explicit columns like the reference schema
    for i in range(1, topn+1):
        if f"nn{i}" in df.columns:
            df[f"nn{i}_item_id"] = df[f"nn{i}"].astype(str)
        if f"nn_dist{i}" in df.columns:
            di = pd.to_numeric(df[f"nn_dist{i}"], errors="coerce").fillna(1e6).astype(float)
            df[f"nn{i}_dist"] = di
            df[f"nn{i}_weight"] = np.exp(-di)

    def _neighbors_row(r):
        out = []
        for i in range(1, topn+1):
            ncol = f"nn{i}"; dcol = f"nn_dist{i}"
            if ncol in r and dcol in r and pd.notna(r[ncol]) and pd.notna(r[dcol]):
                try:
                    dist = float(r[dcol])
                except Exception:
                    continue
                out.append({"id": str(r[ncol]), "dist": dist, "weight": float(math.exp(-dist))})
        return out

    df["neighbors_json"] = [ _safe_json_dump(_neighbors_row(r)) for _, r in df.iterrows() ]
    return df

# ---------------------------
# Tool: purity/entropy
# ---------------------------

def tool_purity_entropy(df: pd.DataFrame, label_col: str = "label", cluster_col: str = "cluster_id") -> pd.DataFrame:
    df = df.copy()
    if label_col not in df.columns:
        df[label_col] = "unknown"
    if cluster_col not in df.columns:
        # fallback to cluster key
        df[cluster_col] = pd.factorize(df.get("cluster", "cluster_unknown"), sort=True)[0].astype(int)

    grp = df.groupby(cluster_col)[label_col].value_counts(normalize=True)

    purity = grp.groupby(level=0).max().rename("purity")

    # entropy
    def _entropy(p):
        p = p[p>0]
        return float(-(p * np.log2(p)).sum())

    entropy = grp.groupby(level=0).apply(_entropy).rename("entropy")

    df = df.merge(purity.reset_index(), on=cluster_col, how="left")
    df = df.merge(entropy.reset_index(), on=cluster_col, how="left")

    return df

# ---------------------------
# Tool: title/body/labels + display fields (Mantis-friendly)
# ---------------------------

def tool_make_title_body_labels(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # Ensure identifiers
    if "id" not in df.columns:
        df["id"] = df.index.astype(str)
    df["item_id"] = df["id"].astype(str)

    # Ensure semantic_text exists and is non-empty
    if "semantic_text" not in df.columns:
        df["semantic_text"] = ""
    if "text" not in df.columns:
        df["text"] = ""
    if "title" not in df.columns:
        df["title"] = ""

    # Preserve title original
    if "title_original" not in df.columns:
        df["title_original"] = df["title"].astype(str)

    # Decode ID-like tokens
    def _decoded_id(r):
        raw = r.get("item_id", "")
        m = _safe_json_load(r.get("metadata", "{}"))
        # Prefer original_item_id if present
        if "original_item_id" in m:
            raw = str(m.get("original_item_id"))
        return decode_tokens(raw)

    df["decoded_id"] = df.apply(_decoded_id, axis=1)

    # Derive a concise display_label
    def _display_label(r):
        base = r.get("label", "")
        base = decode_tokens(base)
        if not base or str(base).lower() in {"nan","none",""}:
            base = r.get("cluster_l5", "") if "cluster_l5" in df.columns else "Item"
        # keep short
        base = _strip_special_chars(base)
        suffix = str(r.get("item_id"))[-8:]
        return f"{base} · {suffix}"

    df["display_label"] = df.apply(_display_label, axis=1)

    # Title: cluster name if available, else cluster; ensure uniqueness by appending suffix
    def _mk_title(r):
        cname = r.get("cluster_name") or r.get("cluster") or "Cluster"
        cname = _strip_special_chars(cname)
        return f"{cname} | {r.get('display_label','')}"

    df["title"] = df.apply(_mk_title, axis=1)

    # Body: semantic_text (clean) + cluster definition + id line (forces uniqueness)
    def _mk_body(r):
        sem = _clean_text(r.get("semantic_text", ""))
        if not sem:
            sem = _clean_text(r.get("text", ""))
        if not sem:
            sem = _clean_text(r.get("title", ""))
        sem = _strip_special_chars(sem)
        cdef = _strip_special_chars(r.get("cluster_definition", ""))
        if cdef:
            sem = f"{cdef}\n\n{sem}" if sem else cdef
        # add a deterministic uniqueness anchor
        return f"{sem}\n\nItem: {r.get('item_id','')}"

    df["body"] = df.apply(_mk_body, axis=1)

    # Labels: stable tags (cluster + ontology + triage-like category)
    def _triage(r):
        txt = " ".join(str(r.get(c,"")) for c in ["cluster_l4","label","cluster_l5","cluster_name","cluster"] if c in df.columns)
        t = txt.lower()
        if any(k in t for k in ["carcinoma","malignant","adenocarcinoma","tumor","cancer","invasive"]):
            return "critical-bug-urgent"
        if any(k in t for k in ["artifact","debris","noise","background"]):
            return "wontfix-noise"
        return "feature-routine"

    def _mk_labels(r):
        tags = []
        tags.append(_triage(r))
        # cluster tag
        if r.get("cluster_name"):
            tags.append("cluster-" + re.sub(r"\s+", "-", _clean_text(r.get("cluster_name"))).lower()[:40])
        # organ tag
        if "cluster_l3" in df.columns and r.get("cluster_l3"):
            tags.append("organ-" + re.sub(r"\s+", "-", _clean_text(r.get("cluster_l3"))).lower())
        # pathology state
        if "cluster_l4" in df.columns and r.get("cluster_l4"):
            tags.append("state-" + re.sub(r"\s+", "-", _clean_text(r.get("cluster_l4"))).lower())
        # dataset source
        if r.get("source"):
            tags.append("src-" + re.sub(r"\s+", "-", _clean_text(r.get("source"))).lower())
        # de-duplicate
        seen=set(); out=[]
        for t in tags:
            t=_strip_special_chars(t, keep=r'A-Za-z0-9_\-\.')
            if not t:
                continue
            if t not in seen:
                seen.add(t); out.append(t)
        return ",".join(out)

    df["labels"] = df.apply(_mk_labels, axis=1)

    # Tooltip for inspector
    def _tooltip(r):
        bits=[]
        if r.get("cluster_name"):
            bits.append(str(r.get("cluster_name")))
        if r.get("label"):
            bits.append("Label: " + str(r.get("label")))
        if r.get("source"):
            bits.append("Source: " + str(r.get("source")))
        if r.get("cluster_keywords"):
            bits.append("Keywords: " + str(r.get("cluster_keywords")))
        return _strip_special_chars(" | ".join(bits))

    df["tooltip_summary"] = df.apply(_tooltip, axis=1)

    return df

# ---------------------------
# Tool: critic validation
# ---------------------------

def tool_validate(df: pd.DataFrame, min_rows: int = 100) -> None:
    # Row count
    assert len(df) >= min_rows, f"Dataset too small: {len(df)} rows (min {min_rows})."

    # Required columns
    req = ["id","title","body","labels","metadata","vector","x","y","cluster_id"]
    for c in req:
        assert c in df.columns, f"Missing required column: {c}"

    # Finite coords
    assert np.isfinite(pd.to_numeric(df["x"], errors="coerce")).all(), "Non-finite x"
    assert np.isfinite(pd.to_numeric(df["y"], errors="coerce")).all(), "Non-finite y"

    # JSON fields
    for c in ["metadata","neighbors_json","cluster_xref"]:
        if c in df.columns:
            bad=0
            for s in df[c].astype(str).head(2000):
                try:
                    json.loads(s)
                except Exception:
                    bad += 1
            assert bad==0, f"Invalid JSON detected in {c} (in first 2000 rows)."

    # Vector dimension constant
    X,d,bad = vectors_to_matrix(df, vector_col="vector")
    assert d>0, "Invalid vector dim"

    # Titles/bodies not empty
    assert df["title"].astype(str).str.len().gt(0).all(), "Empty title exists"
    assert df["body"].astype(str).str.len().gt(0).all(), "Empty body exists"
    assert df["labels"].astype(str).str.len().gt(0).all(), "Empty labels exists"

    # cluster_xref exists and no self links (spot check)
    if "cluster_xref" in df.columns:
        # Build per-cluster one representative
        rep = df.drop_duplicates("cluster_id")[["cluster_id","cluster_xref"]].head(500)
        for cid, s in zip(rep["cluster_id"], rep["cluster_xref"]):
            arr = json.loads(s)
            for e in arr:
                assert int(e.get("target_cluster_id")) != int(cid), "Self-link in cluster_xref"

    print("✅ Critic approved: validation passed")

In [21]:

#@title 7C) Semantic Cartographer Agent (Demo2 + Demo3 memory) — cluster_name / definition / keywords

import os, json, re, time
import numpy as np
import pandas as pd

# Optional Colab userdata
try:
    from google.colab import userdata
    _key = userdata.get('histopathology') or userdata.get('OPENAI_API_KEY')
    if _key and not os.environ.get('OPENAI_API_KEY'):
        os.environ['OPENAI_API_KEY'] = _key
except Exception:
    pass

API_KEY = os.environ.get('OPENAI_API_KEY')

# OpenAI client (only used if API key present)
client = None
if API_KEY:
    try:
        from openai import OpenAI
        client = OpenAI(api_key=API_KEY)
    except Exception as e:
        print('⚠️ OpenAI client not available:', e)
        client = None

# Choose working dataframe
if 'df' not in globals():
    if 'atlas' in globals():
        df = atlas.copy()
    else:
        df = unified_atlas.copy()

# Ensure cluster_id exists
if 'cluster_id' not in df.columns:
    if 'ontology_path' in df.columns and df['ontology_path'].astype(str).str.len().gt(0).any():
        df['cluster_key'] = df['ontology_path'].astype(str)
    elif 'cluster' in df.columns:
        df['cluster_key'] = df['cluster'].astype(str)
    else:
        df['cluster_key'] = 'cluster_unknown'
    df['cluster_id'] = pd.factorize(df['cluster_key'], sort=True)[0].astype(int)

# --- FIX: Handle NaNs (e.g. from Anchors) before strict int casting ---
if df['cluster_id'].isnull().any():
    print(f"⚠️ Filling {df['cluster_id'].isnull().sum()} missing cluster_ids with -1 (likely anchors).")
    df['cluster_id'] = df['cluster_id'].fillna(-1)
df['cluster_id'] = df['cluster_id'].astype(int)
# ----------------------------------------------------------------------

# Memory store
exports_dir = globals().get('exports_dir', None)
MEM_PATH = None
if exports_dir is not None:
    try:
        from pathlib import Path
        MEM_PATH = Path(exports_dir) / 'cluster_naming_memory_histopathology.json'
    except Exception:
        MEM_PATH = None

memory = {}
if MEM_PATH is not None and MEM_PATH.exists():
    try:
        memory = json.loads(MEM_PATH.read_text())
    except Exception:
        memory = {}

used_names = set(memory.get('used_names', []))
cluster_mem = memory.get('clusters', {})

def _clean(s: str) -> str:
    s = '' if s is None else str(s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def _deterministic_name(group: pd.DataFrame) -> str:
    # Use ontology levels if available
    parts=[]
    for c in ['cluster_l3','cluster_l5','cluster_l4']:
        if c in group.columns:
            v = _clean(group[c].mode().iloc[0]) if not group[c].mode().empty else ''
            if v:
                parts.append(v)
    if not parts:
        parts.append('Cluster')
    return ' — '.join(parts)[:80]

def _deterministic_keywords(group: pd.DataFrame) -> list:
    # Simple keyword set from mode of label/source and ontology tokens
    kws=[]
    for c in ['label','source','cluster_l1','cluster_l2','cluster_l3','cluster_l4','cluster_l5','cluster_l6','cluster_l7']:
        if c in group.columns:
            try:
                v = _clean(group[c].mode().iloc[0])
                if v and v.lower()!='nan':
                    kws.append(v)
            except Exception:
                pass
    # de-dupe
    out=[]
    seen=set()
    for k in kws:
        k=k.strip()
        if k and k not in seen:
            seen.add(k)
            out.append(k)
    return out[:12]

def _ensure_unique(name: str, suffix: str) -> str:
    base = name
    if base not in used_names:
        used_names.add(base)
        return base
    # Stable disambiguation
    cand = f"{base} — {suffix}"
    if cand not in used_names:
        used_names.add(cand)
        return cand
    # Final fallback
    i=2
    while True:
        cand2 = f"{cand} v{i}"
        if cand2 not in used_names:
            used_names.add(cand2)
            return cand2
        i += 1

# Iterate clusters
out_rows = []
for cid, g in df.groupby('cluster_id', sort=True):
    cid_str = str(int(cid))

    # If already in memory, reuse
    if cid_str in cluster_mem:
        info = cluster_mem[cid_str]
        cname = info.get('cluster_name')
        cdef  = info.get('cluster_definition')
        ckws  = info.get('cluster_keywords', [])
        out_rows.append((cid, cname, cdef, ckws))
        continue

    # Evidence samples
    samples=[]
    for col in ['title','semantic_text','text']:
        if col in g.columns:
            s = g[col].astype(str)
            s = s[s.str.len()>0].head(10)
            samples.extend(s.tolist())
    samples = [ _clean(s) for s in samples if _clean(s) ]
    samples = samples[:10]

    # Default deterministic
    cname = _deterministic_name(g)
    ckws = _deterministic_keywords(g)
    cdef = ''

    # Optional LLM-assisted naming
    if client is not None:
        prompt = {
            "role": "user",
            "content": (
                "You are naming a histopathology cluster for an interactive cognitive map.\n"
                "Return STRICT JSON with keys: cluster_name, cluster_definition, cluster_keywords.\n"
                "Rules: unique short name; 1-2 sentence definition grounded ONLY in evidence; 5-12 keywords.\n"
                "Do NOT invent biology.\n\n"
                f"Existing deterministic name: {cname}\n"
                f"Evidence samples:\n- " + "\n- ".join(samples)
            )
        }
        try:
            resp = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[prompt],
                temperature=0.2,
                max_tokens=350,
            )
            txt = resp.choices[0].message.content.strip()
            # Extract JSON
            m = re.search(r"\{.*\}", txt, flags=re.S)
            if m:
                obj = json.loads(m.group(0))
                cname = _clean(obj.get('cluster_name') or cname)
                cdef = _clean(obj.get('cluster_definition') or '')
                ckws = obj.get('cluster_keywords') or ckws
                if isinstance(ckws, str):
                    # split comma list
                    ckws = [k.strip() for k in ckws.split(',') if k.strip()]
        except Exception as e:
            print('LLM naming failed for cluster', cid, ':', e)

    # Uniqueness enforcement
    suffix = _clean(g['cluster_l5'].mode().iloc[0]) if 'cluster_l5' in g.columns and not g['cluster_l5'].mode().empty else f"cid{cid_str}"
    cname = _ensure_unique(cname, suffix)

    if not cdef:
        # deterministic definition
        bits=[]
        if 'cluster_l3' in g.columns and not g['cluster_l3'].mode().empty:
            bits.append(_clean(g['cluster_l3'].mode().iloc[0]))
        if 'cluster_l5' in g.columns and not g['cluster_l5'].mode().empty:
            bits.append(_clean(g['cluster_l5'].mode().iloc[0]))
        if 'cluster_l4' in g.columns and not g['cluster_l4'].mode().empty:
            bits.append(_clean(g['cluster_l4'].mode().iloc[0]))
        cdef = "Cluster characterized by: " + ", ".join([b for b in bits if b]) + "."

    out_rows.append((cid, cname, cdef, ckws))
    cluster_mem[cid_str] = {
        'cluster_name': cname,
        'cluster_definition': cdef,
        'cluster_keywords': ckws,
    }

# Apply to df
name_map = {int(cid): cname for cid, cname, _, _ in out_rows}
def_map  = {int(cid): cdef  for cid, _, cdef, _ in out_rows}
kw_map   = {int(cid): ckws  for cid, _, _, ckws in out_rows}

df['cluster_name'] = df['cluster_id'].astype(int).map(name_map)
df['cluster_definition'] = df['cluster_id'].astype(int).map(def_map)
df['cluster_keywords'] = df['cluster_id'].astype(int).map(lambda c: json.dumps(kw_map[int(c)], ensure_ascii=False))

# For Mantis UI: cluster column should be the human-readable name
if 'cluster' in df.columns:
    df['cluster_original'] = df['cluster']
df['cluster'] = df['cluster_name']

# Persist memory
memory = {
    'used_names': sorted(list(used_names)),
    'clusters': cluster_mem,
}
if MEM_PATH is not None:
    try:
        MEM_PATH.parent.mkdir(parents=True, exist_ok=True)
        MEM_PATH.write_text(json.dumps(memory, ensure_ascii=False, indent=2))
        print('✅ Memory written:', MEM_PATH)
    except Exception as e:
        print('⚠️ Could not write memory:', e)

print('✅ Semantic Cartographer complete. Example clusters:')
display(df[['cluster_id','cluster_name','cluster_definition','cluster_keywords']].drop_duplicates('cluster_id').head(8))


⚠️ Filling 15 missing cluster_ids with -1 (likely anchors).
✅ Semantic Cartographer complete. Example clusters:


Unnamed: 0,cluster_id,cluster_name,cluster_definition,cluster_keywords
0,25,Colorectal — Adipose Tissue — Normal/Benign,"Cluster characterized by: Colorectal, Adipose ...","[""adipose"", ""MEDMNIST_PATHMNIST"", ""Biological ..."
57,28,Colorectal — Background / Empty Slide — Not Ap...,"Cluster characterized by: Colorectal, Backgrou...","[""background"", ""MEDMNIST_PATHMNIST"", ""Non-Tiss..."
113,22,Colorectal — Necrotic Debris — Inflammatory/Re...,"Cluster characterized by: Colorectal, Necrotic...","[""debris"", ""MEDMNIST_PATHMNIST"", ""Biological T..."
170,24,Colorectal — Lymphocytes — Inflammatory/Reactive,"Cluster characterized by: Colorectal, Lymphocy...","[""lymphocytes"", ""MEDMNIST_PATHMNIST"", ""Biologi..."
227,23,Colorectal — Mucus — Structural,"Cluster characterized by: Colorectal, Mucus, S...","[""mucus"", ""MEDMNIST_PATHMNIST"", ""Biological Ti..."
284,27,Colorectal — Smooth Muscle — Structural,"Cluster characterized by: Colorectal, Smooth M...","[""smooth muscle"", ""MEDMNIST_PATHMNIST"", ""Biolo..."
341,14,Colorectal — Normal Mucosa — Normal/Benign,"Cluster characterized by: Colorectal, Normal M...","[""normal colon mucosa"", ""MEDMNIST_PATHMNIST"", ..."
398,26,Colorectal — Cancer-Associated Stroma — Normal...,"Cluster characterized by: Colorectal, Cancer-A...","[""cancer-associated stroma"", ""MEDMNIST_PATHMNI..."


In [22]:

#@title 7D) Ontology & Relationship Builder (Demo4 hypothesis loop) — cluster_xref JSON

import json, numpy as np, pandas as pd
from sklearn.preprocessing import normalize

# Ensure df exists
if 'df' not in globals():
    raise RuntimeError('df not found. Run previous cells first.')

# Ensure metadata + vectors
if 'metadata' not in df.columns:
    df = tool_promote_metadata(df)

if 'vector' not in df.columns:
    raise RuntimeError("Missing 'vector' column. Ensure embedding fusion ran.")

TOPK_XREF = 8
SIM_STRICT = 0.85
SIM_LOOSE  = 0.75

# Parse vectors -> matrix
X, d, bad = vectors_to_matrix(df, vector_col='vector')
Xn = normalize(X, norm='l2', axis=1)

# Cluster centroids
cluster_ids = sorted(df['cluster_id'].astype(int).unique().tolist())
idx_map = {cid: df.index[df['cluster_id'].astype(int) == cid].to_numpy() for cid in cluster_ids}

centroids = []
for cid in cluster_ids:
    v = Xn[idx_map[cid]].mean(axis=0)
    v = v / (np.linalg.norm(v) + 1e-12)
    centroids.append(v)
C = np.stack(centroids).astype(np.float32)

# Cosine similarity (clusters x clusters)
S = C @ C.T

# Keyword sets per cluster
kw_sets = {}
rep = df.drop_duplicates('cluster_id')[['cluster_id','cluster_keywords','cluster_l4','cluster_l3']]
for _, r in rep.iterrows():
    try:
        kws = set([str(x).lower() for x in json.loads(r['cluster_keywords'])])
    except Exception:
        kws = set()
    kw_sets[int(r['cluster_id'])] = kws

def _jaccard(a: set, b: set) -> float:
    if not a and not b:
        return 0.0
    return float(len(a & b) / max(1, len(a | b)))

# Hypothesis loop: propose -> test -> keep
xref_map = {}
for i, cid in enumerate(cluster_ids):
    sims = S[i].copy()
    sims[i] = -999.0

    cand = np.argsort(-sims)[:max(TOPK_XREF*2, 12)]
    out = []
    for j in cand:
        if len(out) >= TOPK_XREF:
            break
        cid2 = cluster_ids[j]
        cos = float(sims[j])
        jac = _jaccard(kw_sets.get(cid, set()), kw_sets.get(cid2, set()))

        # weight in [0,1]
        sim_w = max(0.0, min(1.0, (cos + 1.0) / 2.0))
        w = 0.80 * sim_w + 0.20 * jac
        w = float(max(0.0, min(1.0, w)))

        rel = None
        if cos >= SIM_STRICT:
            rel = 'SIMILAR_TO'
        elif cos >= SIM_LOOSE or jac >= 0.25:
            rel = 'ASSOCIATED_WITH'
        else:
            continue

        # Contrast rule: close but different pathology state
        try:
            st1 = str(df.loc[df['cluster_id']==cid, 'cluster_l4'].iloc[0])
            st2 = str(df.loc[df['cluster_id']==cid2, 'cluster_l4'].iloc[0])
            if cos >= SIM_LOOSE and st1 != st2:
                rel = 'CONTRASTS_WITH'
        except Exception:
            pass

        out.append({
            'target_cluster_id': int(cid2),
            'target_cluster_name': str(df.loc[df['cluster_id']==cid2, 'cluster_name'].iloc[0]) if 'cluster_name' in df.columns else str(cid2),
            'relation_type': rel,
            'weight': round(w, 6),
            'rationale': f'Centroid cosine={cos:.3f}; keyword overlap={jac:.2f}.'
        })

    # Hard fallback: top cosine neighbors
    if len(out) < min(TOPK_XREF, len(cluster_ids)-1):
        fallback = np.argsort(-sims)[:TOPK_XREF]
        for j in fallback:
            if len(out) >= TOPK_XREF:
                break
            cid2 = cluster_ids[j]
            if cid2 == cid:
                continue
            if any(e['target_cluster_id']==cid2 for e in out):
                continue
            cos = float(sims[j])
            sim_w = max(0.0, min(1.0, (cos + 1.0)/2.0))
            out.append({
                'target_cluster_id': int(cid2),
                'target_cluster_name': str(df.loc[df['cluster_id']==cid2, 'cluster_name'].iloc[0]) if 'cluster_name' in df.columns else str(cid2),
                'relation_type': 'ASSOCIATED_WITH',
                'weight': round(float(sim_w), 6),
                'rationale': f'Fallback neighbor by centroid cosine={cos:.3f}.'
            })

    xref_map[int(cid)] = out

# Inject into df

df['cluster_xref'] = df['cluster_id'].astype(int).map(lambda c: json.dumps(xref_map[int(c)], ensure_ascii=False))

# Also inject into metadata so inspector always has it
meta_objs = df['metadata'].astype(str).map(_safe_json_load)
new_meta = []
for m, cid in zip(meta_objs, df['cluster_id'].astype(int).tolist()):
    m = dict(m)
    m['cluster_xref'] = xref_map[int(cid)]
    new_meta.append(_safe_json_dump(m))

df['metadata'] = new_meta

print('✅ cluster_xref built. Example:')
display(df[['cluster_id','cluster_name','cluster_xref']].drop_duplicates('cluster_id').head(5))



✅ cluster_xref built. Example:


Unnamed: 0,cluster_id,cluster_name,cluster_xref
0,25,Colorectal — Adipose Tissue — Normal/Benign,"[{""target_cluster_id"": 23, ""target_cluster_nam..."
57,28,Colorectal — Background / Empty Slide — Not Ap...,"[{""target_cluster_id"": -1, ""target_cluster_nam..."
113,22,Colorectal — Necrotic Debris — Inflammatory/Re...,"[{""target_cluster_id"": 26, ""target_cluster_nam..."
170,24,Colorectal — Lymphocytes — Inflammatory/Reactive,"[{""target_cluster_id"": 25, ""target_cluster_nam..."
227,23,Colorectal — Mucus — Structural,"[{""target_cluster_id"": 25, ""target_cluster_nam..."


In [23]:

#@title 7E) Critic Validation + Final Export (Mantis-ready) — keeps 100+ rows

import os, json, pandas as pd

# Ensure df exists
if 'df' not in globals():
    raise RuntimeError('df not found. Run previous cells first.')

# 1) Promote metadata fields

df = tool_promote_metadata(df)

# 2) Ensure body/title/labels + display fields

df = tool_make_title_body_labels(df)

# 3) Compute x,y if missing

df = tool_compute_layout(df, vector_col='vector', seed=42)

# 4) Build neighbors_json and reference-style nn*_item_id/dist/weight

df = tool_build_neighbors_json(df, topn=5)

# 5) Purity/entropy per cluster

df = tool_purity_entropy(df, label_col='label', cluster_col='cluster_id')

# 6) Final schema tweaks
# Ensure id/item_id are strings

df['id'] = df['id'].astype(str)
df['item_id'] = df['item_id'].astype(str)

# Ensure cluster_name present
if 'cluster_name' not in df.columns:
    df['cluster_name'] = df['cluster']

# Ensure cluster_definition/keywords present
if 'cluster_definition' not in df.columns:
    df['cluster_definition'] = ''
if 'cluster_keywords' not in df.columns:
    df['cluster_keywords'] = '[]'

# 7) Critic validation

tool_validate(df, min_rows=100)

# 8) Export ONE file

OUT_PATH = '04_mantis_histopathology_agentic_revamped.csv'

df.to_csv(OUT_PATH, index=False)
print('✅ Wrote:', OUT_PATH, 'rows:', len(df), 'cols:', len(df.columns))

# Preview

display(df[['id','title','labels','cluster_id','cluster_name','x','y']].head(3))



✅ Critic approved: validation passed
✅ Wrote: 04_mantis_histopathology_agentic_revamped.csv rows: 3727 cols: 85


Unnamed: 0,id,title,labels,cluster_id,cluster_name,x,y
0,0,Colorectal — Adipose Tissue — Normal/Benign | ...,"feature-routine,cluster-colorectal- -adipose-t...",25,Colorectal — Adipose Tissue — Normal/Benign,4.125274,-1.633909
1,1,Colorectal — Adipose Tissue — Normal/Benign | ...,"feature-routine,cluster-colorectal- -adipose-t...",25,Colorectal — Adipose Tissue — Normal/Benign,4.087461,-1.669118
2,2,Colorectal — Adipose Tissue — Normal/Benign | ...,"feature-routine,cluster-colorectal- -adipose-t...",25,Colorectal — Adipose Tissue — Normal/Benign,4.135255,-1.530786


## 8) Build Mantis CSV (strict + backward compatible)

We export both:

- **New strict fields** (commonly expected in Mantis‑style spaces):  
  `id`, `text`, `vector_str`, `cluster_l1..cluster_l7`, `metadata`

- **Backward‑compatible fields** used by the legacy notebook / UI inspector:  
  `title`, `semantic_text`, `x`, `y`, `raw_label`, `dataset_key`, `split`, `mpp`, `width`, `height`, `image_path`

This makes the CSV resilient to schema differences across Mantis builds.


## 8) Export (OBSOLETE in agentic patch)

This notebook patch adds a new **Step 7E** export cell (Critic + single CSV).

**Skip this original Step 8** to avoid writing multiple CSVs / dropping columns.

## 9) Quick health checks (hierarchy depth, no unlabeled clusters)

These checks catch the failure modes you saw:

- unlabeled clusters (`""` / `NaN`)
- insufficient semantic depth (only 1–2 levels)
- dataset dominating by raw label only


## 9) Validate (OBSOLETE in agentic patch)

Validation is handled inside **Step 7E** (Critic gate).

Skip this cell.

## 10) Optional: Upload to Mantis via API

Because the dev docs require login, the exact endpoint may differ.  
The cell below is a **template**: set the correct URL and paste your token **at runtime**.

✅ Recommendation: store token in a Colab secret / environment variable rather than hard‑coding.


In [None]:
#@title 10) Upload template (fill in endpoint + token)

import os, requests

MANTIS_UPLOAD_URL = "https://mantisdev.csail.mit.edu/api/v1/upload"  # <-- update if needed

# Set in Colab: %env MANTIS_TOKEN=...
MANTIS_TOKEN = os.environ.get("MANTIS_TOKEN", "")

if not MANTIS_TOKEN:
    print("No token found in env var MANTIS_TOKEN. Set it before uploading.")
else:
    with open(EXPORT_CSV, "rb") as f:
        files = {"file": f}
        headers = {"Authorization": f"Bearer {MANTIS_TOKEN}"}
        resp = requests.post(MANTIS_UPLOAD_URL, headers=headers, files=files, timeout=120)
    print("Status:", resp.status_code)
    try:
        print(resp.json())
    except Exception:
        print(resp.text[:1000])


In [None]:
#@title 11) Global KG Healthcheck (PyVis) — cluster-colored + searchable + bridges
# This is a LIGHTWEIGHT, local visualization to sanity-check:
#  - cluster naming
#  - node naming (uses id)
#  - connectivity (bridges between islands)

from pathlib import Path
import base64, re, math, colorsys, json
import numpy as np
import pandas as pd
from IPython.display import HTML, display

try:
    from pyvis.network import Network
except ImportError:
    import sys, subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", "pyvis"])
    from pyvis.network import Network

# -----------------
# Load export
# -----------------
export_path = Path(EXPORT_DIR) / f"{EXPORT_BASENAME}.csv"
df_viz = pd.read_csv(export_path)

# Parse dataset_key from metadata for optional coloring/debug

def _dk(md):
    try:
        return json.loads(md).get("dataset_key","?")
    except Exception:
        return "?"

df_viz["dataset_key"] = df_viz["metadata"].map(_dk)

# -----------------
# Edge extraction from nn columns
# -----------------

NN_K = 8
edge_rows = []
for j in range(1, NN_K+1):
    nn = f"nn{j}"
    dd = f"nn_dist{j}"
    if nn not in df_viz.columns:
        continue
    src = df_viz["id"].astype(str)
    dst = df_viz[nn].astype(str)
    dist = pd.to_numeric(df_viz[dd], errors="coerce").fillna(1.0)
    # keep only strong edges
    m = dist < 0.35
    edge_rows.append(pd.DataFrame({"src": src[m], "dst": dst[m], "dist": dist[m]}))

edges_df = pd.concat(edge_rows, ignore_index=True) if edge_rows else pd.DataFrame(columns=["src","dst","dist"])
# Drop self-edges
edges_df = edges_df[edges_df["src"] != edges_df["dst"]]

# -----------------
# Bridge selection
# -----------------
# We'll treat leaf cluster as cluster_l5 for now
leaf = "cluster_l5"

# compute cross-cluster edges
join = df_viz.set_index("id")[[leaf]]
edges_df = edges_df.join(join, on="src").rename(columns={leaf:"src_leaf"})
edges_df = edges_df.join(join, on="dst").rename(columns={leaf:"dst_leaf"})
edges_df["cross"] = edges_df["src_leaf"] != edges_df["dst_leaf"]

# Pick top bridge edges (lowest dist)
N_BRIDGES = 80
bridge_edges = edges_df[edges_df["cross"]].sort_values("dist").head(N_BRIDGES)
forced_nodes = set(bridge_edges["src"]).union(set(bridge_edges["dst"]))

# Also keep some representatives per leaf cluster
ITEMS_PER_CLUSTER = 20
rep_nodes = set()
for c, g in df_viz.groupby(leaf):
    # prefer forced nodes
    forced_in = [n for n in g["id"].tolist() if n in forced_nodes]
    keep = forced_in[:ITEMS_PER_CLUSTER]
    if len(keep) < ITEMS_PER_CLUSTER:
        keep += g["id"].tolist()[: (ITEMS_PER_CLUSTER - len(keep))]
    rep_nodes.update(keep)

keep_nodes = rep_nodes.union(forced_nodes)

df_sub = df_viz[df_viz["id"].isin(keep_nodes)].copy()
edges_sub = edges_df[edges_df["src"].isin(keep_nodes) & edges_df["dst"].isin(keep_nodes)].copy()

# -----------------
# Coloring
# -----------------

def _cluster_color(key: str) -> str:
    # stable hash → HSV
    h = (abs(hash(key)) % 10_000) / 10_000.0
    r, g, b = colorsys.hsv_to_rgb(h, 0.60, 0.92)
    return f"#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}"

def _rgba(hex_c, alpha):
    hex_c = hex_c.lstrip("#")
    r, g, b = int(hex_c[0:2], 16), int(hex_c[2:4], 16), int(hex_c[4:6], 16)
    return {"color": f"rgba({r},{g},{b},{alpha:.3f})"}

# -----------------
# Build PyVis
# -----------------
net = Network(height="760px", width="100%", directed=False, notebook=False, bgcolor="#ffffff")
net.barnes_hut(gravity=-25000, central_gravity=0.25, spring_length=160, damping=0.09)

# Cluster nodes (leaf clusters)
leaf_sizes = df_viz[leaf].value_counts().to_dict()
leaf_to_cid = {c: f"CLUSTER::{c}" for c in df_viz[leaf].unique()}

for c in sorted(leaf_to_cid.keys()):
    cid = leaf_to_cid[c]
    col = _cluster_color(c)
    n = int(leaf_sizes.get(c, 0))
    label = f"{c} (n={n})"
    net.add_node(cid, label=label, title=label, shape="diamond", size=28,
                 color={"background": col, "border": col}, hc_type="CLUSTER")

# Item nodes
for r in df_sub.itertuples(index=False):
    c = getattr(r, leaf)
    col = _cluster_color(c)
    # show id as node label (strict identifier)
    node_label = str(r.id)
    title = f"<b>{r.title}</b><br>{r.semantic_text}<br><br><b>Ontology:</b> {r.ontology_path}"
    net.add_node(str(r.id), label=node_label, title=title, shape="dot", size=10,
                 color={"background": col, "border": col}, hc_type="ITEM", hc_leaf=c)
    # connect to its leaf cluster
    net.add_edge(str(r.id), leaf_to_cid[c], color=_rgba(col, 0.35), value=1.2, hc_group="META")

# Similarity edges
for e in edges_sub.itertuples(index=False):
    src = str(e.src)
    dst = str(e.dst)
    if src not in keep_nodes or dst not in keep_nodes:
        continue
    cross = bool(e.cross)
    w = float(1.0 - float(e.dist))
    # thicker + darker for bridges
    net.add_edge(src, dst,
                 color={"color": "rgba(60,60,60,0.75)"} if cross else {"color": "rgba(0,0,0,0.10)"},
                 value=3.0 if cross else 0.6,
                 title=f"cosine_dist={float(e.dist):.3f}",
                 hc_group="SIM")

# -----------------
# Embed with search UI
# -----------------

def _as_iframe(html: str, height_px: int = 780) -> HTML:
    b64 = base64.b64encode(html.encode("utf-8")).decode("ascii")
    return HTML(f'<iframe src="data:text/html;base64,{b64}" width="100%" height="{height_px}" frameborder="0"></iframe>')

# UI injection (reuse style from your D4.2 cell)

def _inject_ui(pyvis_html: str) -> str:
    ui = """
    <div style="padding:10px 12px; font-family: Arial, sans-serif; border-bottom: 1px solid #eee;">
      <div style="display:flex; gap:10px; align-items:center; flex-wrap:wrap;">
        <div style="flex:1; min-width: 320px;">
          <input id="hc_search" list="hc_node_list" type="text"
                 placeholder="Search node… (id, title, ontology)"
                 style="width:100%; padding:7px 9px; border:1px solid #ccc; border-radius:8px;">
          <datalist id="hc_node_list"></datalist>
        </div>
        <button onclick="hcSearch()" style="padding:7px 12px; border-radius:8px; border:1px solid #999; background:#fff;">Find</button>
        <label style="display:flex; gap:6px; align-items:center;">
          <input id="hc_show_sim" type="checkbox" checked onchange="hcToggleEdges('SIM')">
          <span style="font-size:13px;">SIM edges</span>
        </label>
        <label style="display:flex; gap:6px; align-items:center;">
          <input id="hc_show_meta" type="checkbox" checked onchange="hcToggleEdges('META')">
          <span style="font-size:13px;">Meta edges</span>
        </label>
        <span id="hc_status" style="margin-left:auto; font-size:12px; color:#555;"></span>
      </div>
      <div id="hc_legend" style="margin-top:8px; font-size:12px; color:#333;"></div>
    </div>
    """
    pyvis_html = pyvis_html.replace('<div id="mynetwork"', ui + '\n<div id="mynetwork"', 1)

    js = r"""
    function hcBuildAutocomplete() {
      try {
        var dl = document.getElementById('hc_node_list');
        dl.innerHTML = '';
        var arr = nodes.get();
        var seen = {};
        arr.forEach(function(n){
          var l = (n.id || n.label || '').toString().trim();
          if(!l) return;
          if(seen[l]) return;
          seen[l] = 1;
          var opt = document.createElement('option');
          opt.value = l;
          dl.appendChild(opt);
        });
      } catch(e) {}
    }
    function hcSearch() {
      var q = (document.getElementById('hc_search').value || '').toLowerCase().trim();
      if(!q) return;
      var found = null;
      var arr = nodes.get();
      for (var i=0; i<arr.length; i++){
        var n = arr[i];
        var id = (n.id || '').toLowerCase();
        var label = (n.label || '').toLowerCase();
        var title = (n.title || '').toLowerCase();
        if(id.includes(q) || label.includes(q) || title.includes(q)){
          found = n.id; break;
        }
      }
      var status = document.getElementById('hc_status');
      if(found){
        network.selectNodes([found]);
        network.focus(found, {scale: 1.8, animation: {duration: 600}});
        status.textContent = 'Found node: ' + found;
      } else {
        status.textContent = 'Not found';
      }
    }
    function hcToggleEdges(group){
      var show = (group === 'SIM') ? document.getElementById('hc_show_sim').checked
                                   : document.getElementById('hc_show_meta').checked;
      var arr = edges.get();
      arr.forEach(function(e){
        if(e.hc_group === group){
          e.hidden = !show;
          edges.update(e);
        }
      });
    }
    document.getElementById('hc_search').addEventListener('keydown', function(e){
      if(e.key === 'Enter'){ hcSearch(); }
    });
    """

    pyvis_html = re.sub(
        r"(network = new vis\.Network\(container, data, options\);)",
        r"\1" + "\n" + js + "\n hcBuildAutocomplete();\n",
        pyvis_html, count=1
    )
    return pyvis_html

html = net.generate_html()
html = _inject_ui(html)

legend_html = f"<b>Legend:</b> Items (dots) + leaf clusters (diamonds). Dark edges are cross-cluster bridges. Nodes shown: {len(df_sub)}"
html = html.replace('<div id="hc_legend" style="margin-top:8px; font-size:12px; color:#333;"></div>',
                    f'<div id="hc_legend" style="margin-top:8px; font-size:12px; color:#333;">{legend_html}</div>')

out_html = Path(EXPORT_DIR) / f"{EXPORT_BASENAME}__global_kg_healthcheck.html"
out_html.write_text(html, encoding="utf-8")
print("✅ Wrote:", out_html)

display(_as_iframe(html))
