# 00 — Setup & Path-Safe Runtime (Histopathology Cartography)

This notebook fixes the most common Colab failure mode: **Google Drive path confusion**.

What it does:
- mounts Google Drive
- reliably resolves `PROJECT_ROOT` (your project folder) even if Colab CWD is `/content`
- loads `pipeline_config.yaml`
- initializes structured logging + crash-safe state (`checkpoints/_STATE.json`)
- installs only **missing** dependencies (no numpy/pandas/torch upgrades)

> If your folder is `My Drive > mit > histopathology_202502...`, Colab path is:
> `/content/drive/MyDrive/mit/histopathology_202502...`


<a id="A0.0"></a>
### Cell A0.0 — Mount Drive & Resolve PROJECT_ROOT

- **Purpose:** Mount Google Drive and auto-detect the folder containing pipeline_config.yaml.
- **Inputs:** Env var HISTO_PROJECT_ROOT (optional); /content/drive/MyDrive/mit (default search base)
- **Outputs:** PROJECT_ROOT (Path), sys.path updated
- **Depends on:** None
- **Writes checkpoints:** None (but prints resolved path)


In [2]:
import os, sys
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules

def _mount_drive(mountpoint: str = "/content/drive", max_tries: int = 3, timeout_ms: int = 300000) -> bool:
    """Robust Google Drive mount with retries (Colab).

    Returns True if /content/drive/MyDrive becomes available.
    """
    if not IN_COLAB:
        return True
    try:
        from google.colab import drive  # type: ignore
    except Exception as e:
        print("⚠️ google.colab not available:", repr(e))
        return False

    import time

    mp = Path(mountpoint)
    if (mp / "MyDrive").exists():
        return True

    last = None
    for t in range(max_tries):
        try:
            kwargs = {}
            if t > 0:
                kwargs["force_remount"] = True
            # Some Colab versions accept timeout_ms; ignore if not.
            kwargs["timeout_ms"] = timeout_ms
            try:
                drive.mount(mountpoint, **kwargs)
            except TypeError:
                kwargs.pop("timeout_ms", None)
                if kwargs:
                    drive.mount(mountpoint, **kwargs)
                else:
                    drive.mount(mountpoint)

            if (mp / "MyDrive").exists():
                return True
        except Exception as e:
            last = e
            time.sleep(2)

    print("❌ Google Drive mount failed.")
    print("Fixes to try:")
    print("  1) Runtime ▸ Restart runtime, then re-run this cell")
    print("  2) Run: from google.colab import drive; drive.flush_and_unmount(); drive.mount('/content/drive', force_remount=True)")
    print("  3) In your browser, allow third‑party cookies for colab.research.google.com")
    if last is not None:
        print("Last error:", repr(last))
    return False

if IN_COLAB and not _mount_drive():
    raise RuntimeError("Cannot continue without Google Drive mounted. Fix Drive mount and re-run this cell.")

# Optional hard-set:
os.environ["HISTO_PROJECT_ROOT"] = "/content/drive/MyDrive/mit/histopathology_202601012"

def resolve_project_root() -> Path:
    """Find the folder that contains pipeline_config.yaml + label_taxonomy.yaml."""
    ev = os.environ.get("HISTO_PROJECT_ROOT")
    if ev:
        p = Path(ev).expanduser()
        if (p / "pipeline_config.yaml").exists() and (p / "label_taxonomy.yaml").exists():
            return p
        raise FileNotFoundError(f"HISTO_PROJECT_ROOT is set but required files not found in: {p}")

    bases = [
        Path("/content/drive/MyDrive/mit"),
        Path("/content/drive/MyDrive"),
    ]
    required = ["pipeline_config.yaml", "label_taxonomy.yaml"]
    candidates = []
    for base in bases:
        if not base.exists():
            continue
        for p in base.glob("**/pipeline_config.yaml"):
            root = p.parent
            if all((root / rf).exists() for rf in required):
                candidates.append(root.resolve())
        if candidates:
            break

    candidates = sorted(set(candidates), key=lambda p: p.stat().st_mtime, reverse=True)
    if not candidates:
        raise FileNotFoundError(
            "Could not locate project root containing pipeline_config.yaml + label_taxonomy.yaml.\n"
            "Expected it somewhere under /content/drive/MyDrive/mit/.\n"
            "Fix: copy the project folder into Drive, OR set os.environ['HISTO_PROJECT_ROOT'] explicitly."
        )

    if len(candidates) > 1:
        print("⚠️ Multiple candidate project roots found; using newest. To force, set HISTO_PROJECT_ROOT.")
        for c in candidates[:5]:
            print("  -", c)

    return candidates[0]

PROJECT_ROOT = resolve_project_root().resolve()
os.environ["HISTO_PROJECT_ROOT"] = str(PROJECT_ROOT)
print("✅ PROJECT_ROOT =", PROJECT_ROOT)

if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

print("sys.path[0] =", sys.path[0])


✅ PROJECT_ROOT = /content/drive/MyDrive/mit/histopathology_202601012
sys.path[0] = /content/drive/MyDrive/mit/histopathology_202601012


<a id="A0.1"></a>
### Cell A0.1 — Load config & init runtime

- **Purpose:** Load YAML config and initialize logging, checkpoints, and run STATE.
- **Inputs:** PROJECT_ROOT, pipeline_config.yaml
- **Outputs:** CFG dict, logger, checkpoints directories, STATE initialized
- **Depends on:** A0.0
- **Writes checkpoints:** checkpoints/_STATE.json, logs/run.jsonl


In [3]:

import json
from pathlib import Path

import yaml  # PyYAML (installed on Colab by default, installed if missing in A1.0)

from histo_cartography.runtime import init_runtime, set_seed, health_check, cell_context
from histo_cartography.paths import ensure_dirs

CONFIG_PATH = PROJECT_ROOT / "pipeline_config.yaml"
assert CONFIG_PATH.exists(), f"Missing {CONFIG_PATH}"

with cell_context(
    "A0.1",
    purpose="Load pipeline config and initialize runtime state/logging.",
    stage="A",
    checkpoint_paths=[str(PROJECT_ROOT / "checkpoints" / "_STATE.json")],
):
    cfg = yaml.safe_load(CONFIG_PATH.read_text())
    SAFE_MODE = bool(cfg["project"]["safe_mode"])
    DEBUG_LEVEL = int(cfg["project"]["debug_level"])
    SEED = int(cfg["project"]["seed"])

    # Create standard directories on Drive
    ensure_dirs(PROJECT_ROOT, [
        cfg["paths"]["log_dir"],
        cfg["paths"]["checkpoints_dir"],
        cfg["paths"]["data_raw_dir"],
        cfg["paths"]["data_staging_dir"],
        cfg["paths"]["exports_dir"],
        "exports/eda",
        "exports/cartography",
        "exports/kg",
    ])

    init_runtime(PROJECT_ROOT, safe_mode=SAFE_MODE, debug_level=DEBUG_LEVEL,
                 log_dir_rel=cfg["paths"]["log_dir"],
                 checkpoint_dir_rel=cfg["paths"]["checkpoints_dir"])

    set_seed(SEED)

print("SAFE_MODE =", SAFE_MODE, "| DEBUG_LEVEL =", DEBUG_LEVEL)


INFO:histo_cartography:Logging to: /content/drive/MyDrive/mit/histopathology_202601012/logs/run.jsonl
INFO:histo_cartography:Loaded existing STATE for resume mode
INFO:histo_cartography:Seeds set to 1337
INFO:histo_cartography:✅ A0.1 finished in 0.66s


SAFE_MODE = True | DEBUG_LEVEL = 1


<a id="A1.0"></a>
### Cell A1.0 — Dependency resolver (conservative)

- **Purpose:** Install only missing packages. Avoid upgrading numpy/pandas/torch. Optional risky installs behind flag.
- **Inputs:** cfg.project.allow_risky_installs
- **Outputs:** Installed packages, checkpoints/requirements.lock.txt
- **Depends on:** A0.1
- **Writes checkpoints:** checkpoints/requirements.lock.txt


In [4]:

import importlib.util
import subprocess, sys
from pathlib import Path
import yaml

from histo_cartography.runtime import cell_context

cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text())
ALLOW_RISKY = bool(cfg["project"].get("allow_risky_installs", False))

# Conservative required deps
REQUIRED = [
    "pyarrow",   # parquet
    "rdflib",    # RDF export (pure python)
    "tqdm",      # progress bars
]

# Optional / riskier deps (compiled extensions). Only install when ALLOW_RISKY=True.
RISKY = [
    "umap-learn",  # may pull numba/llvmlite
    "hdbscan",     # compiled
]

def spec_exists(pkg: str) -> bool:
    # pkg names like "umap-learn" map to module "umap"; we'll check common transforms.
    candidates = [pkg, pkg.replace("-", "_")]
    if pkg == "umap-learn":
        candidates = ["umap"]
    for m in candidates:
        if importlib.util.find_spec(m) is not None:
            return True
    return False

def pip_install(pkgs):
    if not pkgs:
        return
    cmd = [sys.executable, "-m", "pip", "install", "-q"] + pkgs
    print("Installing:", pkgs)
    subprocess.check_call(cmd)

with cell_context("A1.0", purpose="Install missing dependencies conservatively", stage="A"):
    missing = [p for p in REQUIRED if not spec_exists(p)]
    pip_install(missing)

    if ALLOW_RISKY:
        missing_risky = [p for p in RISKY if not spec_exists(p)]
        pip_install(missing_risky)
    else:
        print("⚠️  Risky deps disabled. UMAP/HDBSCAN will fallback to PCA/KMeans unless you set allow_risky_installs=true.")

    # pip check (non-fatal)
    try:
        out = subprocess.check_output([sys.executable, "-m", "pip", "check"], stderr=subprocess.STDOUT).decode()
        print(out[:2000])
    except Exception as e:
        print("pip check failed (non-fatal):", e)

    # Lock file
    lock_path = PROJECT_ROOT / cfg["paths"]["checkpoints_dir"] / "requirements.lock.txt"
    freeze = subprocess.check_output([sys.executable, "-m", "pip", "freeze"]).decode("utf-8")
    lock_path.write_text(freeze)

print("✅ requirements.lock.txt written to:", lock_path)


INFO:histo_cartography:▶️  A1.0: Install missing dependencies conservatively


Installing: ['rdflib']
⚠️  Risky deps disabled. UMAP/HDBSCAN will fallback to PCA/KMeans unless you set allow_risky_installs=true.
pip check failed (non-fatal): Command '['/usr/bin/python3', '-m', 'pip', 'check']' returned non-zero exit status 1.


INFO:histo_cartography:✅ A1.0 finished in 8.00s


✅ requirements.lock.txt written to: /content/drive/MyDrive/mit/histopathology_202601012/checkpoints/requirements.lock.txt


<a id="A2.0"></a>
### Cell A2.0 — Binary compatibility guard

- **Purpose:** Heuristic checks for common Colab binary incompatibilities (numpy/pandas/torch/CUDA).
- **Inputs:** Current environment
- **Outputs:** Guard report dict
- **Depends on:** A1.0
- **Writes checkpoints:** logs/run.jsonl


In [5]:

from histo_cartography.runtime import cell_context, binary_compatibility_guard

with cell_context("A2.0", purpose="Run binary compatibility guard", stage="A"):
    report = binary_compatibility_guard()

print("Warnings:", report.get("warnings", []))


INFO:histo_cartography:▶️  A2.0: Run binary compatibility guard
INFO:histo_cartography:Binary compatibility guard report
INFO:histo_cartography:✅ A2.0 finished in 2.55s




<a id="A3.0"></a>
### Cell A3.0 — Crash mitigation utilities

- **Purpose:** Seed control, torch CUDA memory knobs, and a lightweight watchdog you can call before heavy steps.
- **Inputs:** SAFE_MODE, DEBUG_LEVEL, SEED
- **Outputs:** watchdog_report() utility
- **Depends on:** A0.1
- **Writes checkpoints:** None


In [6]:

import os, sys
from histo_cartography.runtime import cell_context, env_fingerprint

with cell_context("A3.0", purpose="Configure conservative runtime settings", stage="A"):
    os.environ.setdefault("PYTHONHASHSEED", "0")
    # Torch GPU memory fragmentation mitigations (no effect if CPU-only)
    os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")

    try:
        import torch
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    except Exception:
        pass

    fp = env_fingerprint()

def watchdog_report():
    from histo_cartography.runtime import disk_free_gb
    from pathlib import Path
    rep = {
        "project_root": str(PROJECT_ROOT),
        "disk_free_gb": float(disk_free_gb(Path(PROJECT_ROOT))),
    }
    try:
        import torch
        rep["cuda_available"] = bool(torch.cuda.is_available())
        if torch.cuda.is_available():
            rep["gpu_name"] = torch.cuda.get_device_name(0)
            rep["gpu_mem_allocated_mb"] = int(torch.cuda.memory_allocated() / (1024**2))
    except Exception:
        pass
    return rep

print("Runtime fingerprint:", fp)
print("Watchdog report:", watchdog_report())


INFO:histo_cartography:▶️  A3.0: Configure conservative runtime settings
INFO:histo_cartography:✅ A3.0 finished in 0.05s


Runtime fingerprint: {'python': '3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]', 'platform': 'Linux-6.6.105+-x86_64-with-glibc2.35', 'executable': '/usr/bin/python3', 'time_utc': '2026-01-13T06:52:43Z', 'torch': '2.9.0+cpu', 'cuda_available': False, 'cuda_version': None, 'cudnn': None, 'numpy': '2.0.2', 'pandas': '2.2.2', 'sklearn': '1.6.1'}
Watchdog report: {'project_root': '/content/drive/MyDrive/mit/histopathology_202601012', 'disk_free_gb': 194.31491088867188, 'cuda_available': False}
