# Stage 00 — Download & stage raw data (inventory only)

This notebook prepares a **raw inventory index** (parquet-first) for downstream stages.

Outputs:
- `exports/stage_00_download/raw_index.parquet`

Notes:
- Stage 00 is **inventory-only**: it does *not* create stable `item_id`s or text prompts.
- Stage 01 builds the canonical `items.parquet` used by the rest of the pipeline.


In [None]:
# --- Colab-first setup ---
import os, sys, time
from pathlib import Path

# When True, the build cell regenerates raw_index.parquet even if a previous
# run already wrote it; when False an existing file is simply reloaded.
FORCE_REBUILD = False
# NOTE(review): FAST_MODE and EDA_LEVEL are not read by any cell in this
# notebook; presumably kept for parity with other stage notebooks — confirm.
FAST_MODE = True
EDA_LEVEL = "core"

# NOTE(review): the plot cells call save_and_display() unconditionally and never
# consult these two flags directly — confirm whether the helper reads them.
SHOW_PLOTS = True
SAVE_PLOTS = True

# Directory searched recursively for pipeline_config.yaml by
# _resolve_project_root() when the HISTO_PROJECT_ROOT override is not usable.
DRIVE_SEARCH_BASE = "/content/drive/MyDrive"

def _is_colab() -> bool:
    """Report whether the ``google.colab`` module is already loaded in this interpreter."""
    loaded_modules = sys.modules
    return "google.colab" in loaded_modules

# On Colab, mount Google Drive at /content/drive before the project root is
# resolved, because the root search looks under DRIVE_SEARCH_BASE on that mount.
if _is_colab():
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive")

def _resolve_project_root() -> Path:
    """Locate the project directory that holds ``pipeline_config.yaml``.

    Resolution order:

    1. The ``HISTO_PROJECT_ROOT`` environment variable, if it names an
       existing directory.
    2. The most recently modified directory under ``DRIVE_SEARCH_BASE`` that
       contains both ``pipeline_config.yaml`` and ``label_taxonomy.yaml``.
    3. The current working directory or one of its nearest ancestors (ten
       levels are inspected in total) that contains ``pipeline_config.yaml``.

    Returns:
        The resolved project root.

    Raises:
        FileNotFoundError: If none of the three strategies finds a root.
    """
    override = os.environ.get("HISTO_PROJECT_ROOT")
    if override:
        override_path = Path(override)
        # The root is later joined with file names, so an override pointing at
        # a regular file is unusable; fall through to the searches instead.
        if override_path.is_dir():
            return override_path

    drive_base = Path(DRIVE_SEARCH_BASE)
    if drive_base.exists():
        drive_roots = [
            config_file.parent
            for config_file in drive_base.glob("**/pipeline_config.yaml")
            if (config_file.parent / "label_taxonomy.yaml").exists()
        ]
        if drive_roots:
            # Several copies may exist under the search base; prefer the one
            # whose directory was modified most recently.
            return max(drive_roots, key=lambda d: d.stat().st_mtime)

    cwd = Path.cwd()
    # Working directory first, then up to nine ancestors, nearest first.
    for candidate in [cwd, *cwd.parents][:10]:
        if (candidate / "pipeline_config.yaml").exists():
            return candidate
    raise FileNotFoundError("Could not resolve PROJECT_ROOT. Set HISTO_PROJECT_ROOT env var.")

# Resolve the project directory and make its packages importable from here on.
PROJECT_ROOT = _resolve_project_root()
sys.path.insert(0, str(PROJECT_ROOT))
print("PROJECT_ROOT:", PROJECT_ROOT)

# Install deps
import subprocess
subprocess.check_call([sys.executable, "-m", "pip", "-q", "install", "-r", str(PROJECT_ROOT / "requirements.txt")])

# yaml is imported only after the requirements install above has run.
import yaml
# Decode explicitly as UTF-8 instead of the locale default, and treat an empty
# config file (which safe_load parses to None) as an empty mapping.
cfg = yaml.safe_load((PROJECT_ROOT / "pipeline_config.yaml").read_text(encoding="utf-8")) or {}

# A section that is present but left empty in YAML loads as None, so fall back
# to an empty mapping before reading individual settings.
paths_cfg = cfg.get("paths") or {}
project_cfg = cfg.get("project") or {}

# Working directories are resolved relative to the project root.
EXPORTS_DIR = PROJECT_ROOT / str(paths_cfg.get("exports_dir", "exports"))
RAW_DIR = PROJECT_ROOT / str(paths_cfg.get("raw_dir", "data/raw"))
STAGING_DIR = PROJECT_ROOT / str(paths_cfg.get("staging_dir", "data/staging"))
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
RAW_DIR.mkdir(parents=True, exist_ok=True)
STAGING_DIR.mkdir(parents=True, exist_ok=True)

# SAFE_MODE selects the smaller item cap and lower row threshold in later cells.
SAFE_MODE = bool(project_cfg.get("safe_mode", True))
SEED = int(project_cfg.get("seed", 1337))

print("SAFE_MODE:", SAFE_MODE)


In [None]:
# --- Stage paths + registries ---
from pathlib import Path
import pandas as pd

from histo_cartography.viz import ensure_dir, save_and_display, register_plot
from histo_cartography.artifact_registry import register_artifact, append_stage_manifest
from histo_cartography.critic import run_critic, write_critic_report, critic_result_table, critic_issues_table

# All Stage 00 outputs are written beneath <exports>/stage_00_download.
stage_dir = EXPORTS_DIR / "stage_00_download"
# NOTE(review): ensure_dir presumably creates the directory (and parents) and
# returns its path — confirm in histo_cartography.viz.
plots_dir = ensure_dir(stage_dir / "plots")
qa_dir = ensure_dir(stage_dir / "qa")

# Target file for the inventory table built (or reloaded) by the build cell.
raw_index_path = stage_dir / "raw_index.parquet"
# Accumulator passed to register_plot() by each plot cell and handed to
# write_viz_index() in the final code cell.
viz_records = []

print("stage_dir:", stage_dir)


## PEEP — Config sanity

In [None]:
# PEEP — show data config
# Display-only preview: the build cell re-derives these same values from cfg
# before using them, so nothing below is required for the stage logic itself.
data_cfg = cfg.get("data", {})
# Prefer the list-valued "dataset_keys"; otherwise wrap the single
# "dataset_key" setting (default "CRC_VAL_HE_7K") into a one-element list.
dataset_keys = data_cfg.get("dataset_keys") or [data_cfg.get("dataset_key", "CRC_VAL_HE_7K")]
split = str(data_cfg.get("split", "val"))
# Item cap depends on SAFE_MODE: "max_items_safe" (default 512) when it is on,
# "max_items_full" (default None) when it is off.
max_items = data_cfg.get("max_items_safe", 512) if SAFE_MODE else data_cfg.get("max_items_full", None)

print("dataset_keys:", dataset_keys)
print("split:", split)
print("max_items:", max_items)
print("RAW_DIR:", RAW_DIR)
print("STAGING_DIR:", STAGING_DIR)


## Stage logic — Download / stage + build raw_index (idempotent)

In [None]:
# --- Build raw_index.parquet ---
from pathlib import Path
import pandas as pd

from histo_cartography import datasets
from histo_cartography.exports import save_parquet

# Wall-clock start; the elapsed time is printed once raw_index is available.
t0 = time.time()

# A section that is present but empty in YAML loads as None; treat it as {}.
data_cfg = cfg.get("data") or {}
# Prefer the list-valued "dataset_keys"; otherwise wrap the single
# "dataset_key" setting (default "CRC_VAL_HE_7K") into a one-element list.
dataset_keys = data_cfg.get("dataset_keys") or [data_cfg.get("dataset_key", "CRC_VAL_HE_7K")]
# A scalar `dataset_keys: SOME_KEY` entry loads as a plain string; iterating it
# in the build loop would yield single characters, so wrap it into a list.
if isinstance(dataset_keys, str):
    dataset_keys = [dataset_keys]
split = str(data_cfg.get("split", "val"))

# Download options live in the nested "download" mapping.
download_cfg = data_cfg.get("download") or {}
verify_md5 = bool(download_cfg.get("verify_md5", True))
allow_large = bool(download_cfg.get("allow_large", False))
# Item cap depends on SAFE_MODE: "max_items_safe" (default 512) when it is on,
# "max_items_full" (default None) when it is off.
max_items = data_cfg.get("max_items_safe", 512) if SAFE_MODE else data_cfg.get("max_items_full", None)
# Passed as `overwrite=` to prepare_dataset_to_staging in the build loop.
overwrite = bool(data_cfg.get("force_reextract", False))

def _provider_from_key(k: str) -> str:
    """Map a dataset key to its provider tag based on the key's prefix.

    Keys starting with ``HF_`` map to ``"hf"`` and keys starting with
    ``MEDMNIST_`` map to ``"medmnist"``; every other key maps to
    ``"zenodo_or_local"``. The key is stringified before matching.
    """
    key_text = str(k)
    prefix_to_provider = (("HF_", "hf"), ("MEDMNIST_", "medmnist"))
    for prefix, provider in prefix_to_provider:
        if key_text.startswith(prefix):
            return provider
    return "zenodo_or_local"

if raw_index_path.exists() and not FORCE_REBUILD:
    # Idempotent path: reuse the inventory written by an earlier run.
    raw_index = pd.read_parquet(raw_index_path)
    print(f"✅ Loaded existing raw_index.parquet: {raw_index.shape}")
else:
    # Columns retained in the inventory, in output order.
    keep = ["dataset_key", "provider", "source_dataset", "split", "label", "image_path", "width", "height", "mpp"]
    # Same value for every dataset, so read it once outside the loop.
    mpp_value = float(data_cfg.get("mpp", 0.5))

    parts = []
    for dk in dataset_keys:
        # Only the items table is used here; the second return value is ignored.
        items_df, _images_dir = datasets.prepare_dataset_to_staging(
            dk,
            raw_dir=RAW_DIR,
            staging_dir=STAGING_DIR,
            split=split,
            safe_mode=SAFE_MODE,
            max_items=max_items,
            seed=SEED,
            overwrite=overwrite,
            verify_md5=verify_md5,
            allow_large=allow_large,
            mpp=mpp_value,
            use_text_modality=False,
            text_template_version="v2_no_label",
        )
        # Work on a copy so the frame returned by the helper is left untouched.
        df = items_df.copy()
        df["dataset_key"] = str(dk)
        df["provider"] = _provider_from_key(str(dk))
        df = df.rename(columns={"source": "source_dataset"})
        parts.append(df[keep])

    raw_index = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()
    # Explicit raises instead of `assert`, which is stripped when Python runs
    # with -O and would let an empty or incomplete index be written to disk.
    if len(raw_index) == 0:
        raise ValueError("raw_index is empty. Check dataset_key and download settings.")
    if raw_index["image_path"].isna().any():
        raise ValueError("raw_index has missing image_path values.")

    save_parquet(raw_index, raw_index_path)

# Covers either branch: a cache reload or a full download/stage/build.
runtime_sec = time.time() - t0
print("runtime_sec:", round(runtime_sec, 2))
raw_index.head()


## CHECKPOINT — Raw index health gates

In [None]:
# CHECKPOINT: critic on raw_index
from IPython.display import display

# Run the critic over the inventory table. The required columns match the
# column set selected when raw_index is built; no id column is checked here
# because Stage 00 does not create item ids.
crit_raw = run_critic(
    df=raw_index,
    stage="stage_00_download",
    gate="checkpoint_raw_index",
    required_cols=["dataset_key","provider","source_dataset","split","label","image_path","width","height","mpp"],
    id_col=None,
    # Lower row threshold when SAFE_MODE is on.
    min_rows=10 if SAFE_MODE else 100,
    key_nonnull_cols=["dataset_key","image_path"],
)

# Persist the report under qa/ and show the summary plus the first 50 issues.
# NOTE(review): nothing in this cell inspects crit_raw.fails_count, so the
# notebook continues to the registration cell regardless of the outcome —
# unless run_critic itself raises; confirm the intended gating behaviour.
write_critic_report(crit_raw, qa_dir / "critic_checkpoint_raw_index.json")
display(critic_result_table(crit_raw))
display(critic_issues_table(crit_raw).head(50))


In [None]:
# Register artifact + stage manifest
# Both calls carry the critic's warning/fail counts and the runtime measured by
# the build cell, so this cell must run after the build and checkpoint cells.
schema_version = str(cfg.get("project", {}).get("schema_version", "0.1.0"))

# Record the raw index file; Stage 00 declares no upstream inputs.
register_artifact(
    project_root=PROJECT_ROOT,
    stage="stage_00_download",
    artifact="raw_index",
    path=raw_index_path,
    schema_version=schema_version,
    inputs=[],
    df=raw_index,
    warnings_count=int(crit_raw.warnings_count),
    fails_count=int(crit_raw.fails_count),
    runtime_sec=float(runtime_sec),
    notes="raw inventory parquet index",
)

# Append a run summary for the stage, listing raw_index.parquet as its output.
append_stage_manifest(
    project_root=PROJECT_ROOT,
    stage="stage_00_download",
    inputs=[],
    outputs=[raw_index_path],
    schema_version=schema_version,
    warnings_count=int(crit_raw.warnings_count),
    fails_count=int(crit_raw.fails_count),
    runtime_sec=float(runtime_sec),
    notes="stage 00 run summary",
)


## POST — Inventory EDA (one plot per cell)

In [None]:
# POST plot 1 — Dataset key distribution
import matplotlib.pyplot as plt

# Items per dataset key, with keys compared as strings.
vc = raw_index["dataset_key"].astype(str).value_counts()

# Draw through explicit Figure/Axes handles rather than pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(vc.index.astype(str), vc.values)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_title("Dataset key distribution")
ax.set_ylabel("n items")
fig.tight_layout()

# Save/show the figure, then record it for the viz index.
out_path = plots_dir / "dataset_key_distribution.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_00_download",
    plot_id="dataset_key_distribution",
    title="Dataset key distribution",
    path=out_path,
    tags=["post","distribution"],
    is_core=True,
)


In [None]:
# POST plot 2 — Label distribution (top 30)
import matplotlib.pyplot as plt

# The 30 most frequent labels, with labels compared as strings.
label_counts = raw_index["label"].astype(str).value_counts()
vc = label_counts.head(30)

# Draw through explicit Figure/Axes handles rather than pyplot's implicit state.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(vc.index.astype(str), vc.values)
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
ax.set_title("Label distribution (top 30)")
ax.set_ylabel("n items")
fig.tight_layout()

# Save/show the figure, then record it for the viz index.
out_path = plots_dir / "label_distribution_top30.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_00_download",
    plot_id="label_distribution_top30",
    title="Label distribution (top 30)",
    path=out_path,
    tags=["post","distribution","label"],
    is_core=True,
)


In [None]:
# POST plot 3 — Image resolution scatter (w × h)
import matplotlib.pyplot as plt

# One point per inventory row: image width against image height.
widths = raw_index["width"].astype(float)
heights = raw_index["height"].astype(float)

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(widths, heights, s=10, alpha=0.5)
ax.set_xlabel("width")
ax.set_ylabel("height")
ax.set_title("Image resolution scatter")
fig.tight_layout()

# Save/show the figure, then record it for the viz index.
out_path = plots_dir / "image_resolution_scatter.png"
save_and_display(fig, out_path)
register_plot(
    viz_records,
    stage="stage_00_download",
    plot_id="image_resolution_scatter",
    title="Image resolution scatter",
    path=out_path,
    tags=["post","images"],
    is_core=True,
)


In [None]:
# Write viz index (parquet + csv)
from IPython.display import display
from histo_cartography.viz import write_viz_index, viz_records_to_df

# Write the plot records collected by the plot cells as parquet and CSV files
# in the stage directory, then preview up to 50 of them as a table.
viz_index_path = stage_dir / "viz_index.parquet"
write_viz_index(viz_records, out_parquet=viz_index_path, out_csv=stage_dir / "viz_index.csv")
display(viz_records_to_df(viz_records).head(50))
print("✅ wrote viz_index:", viz_index_path)


## Next actions
- Run Stage 01 to build the canonical `items.parquet` with stable `item_id`s and, optionally, text prompts.
- If downloads fail: check dataset keys, Drive permissions, and storage limits.