# Run full epistasis pipeline

1. **Process sources** (null first, then the others): for each source and each model, compute embeddings and write `{output_base}/{source}/{model_key}.db`. Once the **null** source is done, the null covariance is computed from `null/*.db` and saved to `{output_base}/null_cov/{model_key}_pack.npz`. The resulting **null `cov_inv` is passed into `add_epistasis_metrics`** when processing every non-null source, so that the epistasis metrics include Mahalanobis terms (`epi_mahal`, `mahal_obs`, `mahal_add`, etc.) relative to the null background. SpliceAI runs only for the splicing sources (`fas_analysis`, `mst1r_analysis`, `kras`). Metrics are computed in batches via `add_epistasis_metrics(batch_size=..., cov_inv=null_cov_inv)`.

2. **Optional combined covariance**: Step 2 computes the covariance from all source DBs combined and saves it as `{model_key}_pack.npz` directly under `OUTPUT_BASE` (separate from the null packs in `null_cov/`) for other uses.

Set paths and options below, then run all cells.

**Environment control:** Different models need different conda environments (e.g. AlphaGenome needs JAX; Evo2 needs its own stack). Set **ENV_PROFILE** in the config cell and run this notebook in the matching environment:
- `"alphagenome"` — run only AlphaGenome (use AlphaGenome env)
- `"evo2"` — run only Evo2 (use Evo2 env)
- `"main"` — run all other models (shared env; excludes alphagenome & evo2)
- `"all"` — run every model (only if all deps in one env)

In [None]:
# ---------------------------------------------------------------------------
# Config: paths and options
# ---------------------------------------------------------------------------
# Run with project root as current working directory (so "from notebooks.process_epistasis" works).
# All data (embeddings, CSVs) lives under EPISTASIS_PAPER_ROOT. Override with env EPISTASIS_PAPER_ROOT.
from pathlib import Path
from notebooks.process_epistasis import get_model_keys_for_env
from notebooks.paper_data_config import EPISTASIS_PAPER_ROOT, data_dir, embeddings_dir

# Select the model set for the environment this notebook is running in.
# ENV_PROFILE is the single switch to change per environment:
#   "alphagenome" -> AlphaGenome only (run inside the AlphaGenome conda env)
#   "evo2"        -> Evo2 only (run inside the Evo2 conda env)
#   "main"        -> every model except alphagenome & evo2 (shared env)
#   "all"         -> every model (only if all deps live in one env)
ENV_PROFILE = "main"
MODEL_KEYS = get_model_keys_for_env(ENV_PROFILE)
print(f"ENV_PROFILE={ENV_PROFILE!r} -> MODEL_KEYS={MODEL_KEYS}")
print(f"Data root: {EPISTASIS_PAPER_ROOT}")

# Input tables as (source_name, path_to_csv) pairs. Every CSV is resolved
# against data_dir(). "null" is listed first because Step 1 processes the null
# source before the others. Uncomment an entry to include that source.
SOURCES = [
    (source_name, data_dir() / csv_name)
    for source_name, csv_name in (
        ("null", "null_epistasis.csv"),
        ("fas_analysis", "fas_subset.csv"),
        ("mst1r_analysis", "mst1r_subset.csv"),
        ("kras", "kras_subset.csv"),
        # ("tcga_analysis", "tcga_subset_doubles.csv"),
        # ("okgp_analysis", "okgp_subset_clean.csv"),
    )
]

# Where the per-source / per-model outputs are written.
OUTPUT_BASE = embeddings_dir()

# Name of the identifier column, forwarded to run_sources as id_col.
ID_COL = "epistasis_id"

# batch_size forwarded to add_epistasis_metrics (e.g. 8 => 32 sequences per batch).
BATCH_SIZE = 8

# OpenSpliceAI checkpoint directory used for SpliceAI, e.g.
# "/path/to/openspliceai-mane/10000nt". Leave as None and set the
# OPENSPLICEAI_MODEL_DIR environment variable instead if preferred.
SPLICEAI_MODEL_DIR = None

In [None]:
# ---------------------------------------------------------------------------
# Step 1: Process all sources (null first); one .db per model per source
# ---------------------------------------------------------------------------
# Requires the config cell to have been run first (SOURCES, OUTPUT_BASE,
# MODEL_KEYS, SPLICEAI_MODEL_DIR, ID_COL and BATCH_SIZE are defined there).
import logging
from notebooks.process_epistasis import run_sources

# basicConfig() is a no-op when the root logger already has handlers, which is
# common inside a notebook kernel or after importing libraries that configure
# logging themselves. Setting the root level explicitly as well guarantees that
# INFO-level progress messages are not silently dropped in that case.
logging.basicConfig(level=logging.INFO)
logging.getLogger().setLevel(logging.INFO)

run_sources(
    SOURCES,
    output_base=OUTPUT_BASE,
    model_keys=MODEL_KEYS,
    spliceai_model_dir=SPLICEAI_MODEL_DIR,
    id_col=ID_COL,
    show_progress=True,
    # NOTE(review): presumably force=False keeps outputs that already exist
    # instead of recomputing them -- confirm against run_sources.
    force=False,
    batch_size=BATCH_SIZE,
)

In [None]:
# ---------------------------------------------------------------------------
# Step 2: Compute covariance from combined DBs (optional) and save .npz packs
# ---------------------------------------------------------------------------
# The null covariance needs no work here: Step 1 already computed it, saved it
# in OUTPUT_BASE / "null_cov", and passed it into add_epistasis_metrics for the
# non-null sources. This cell optionally builds one more covariance from all
# sources together (null + others) for other uses.
from notebooks.process_epistasis import run_covariance_and_save

# Only the source names are needed here; the CSV paths in SOURCES are ignored.
source_names = [src_name for src_name, _csv_path in SOURCES]

saved_npz = run_covariance_and_save(
    OUTPUT_BASE,
    source_names,
    model_keys=MODEL_KEYS,
    out_npz_dir=OUTPUT_BASE,  # combined packs are written directly under OUTPUT_BASE
    method="ledoit_wolf",
    show_progress=True,
)
print("Saved combined cov packs:", saved_npz)

In [None]:
# ---------------------------------------------------------------------------
# Optional: inspect null covariance pack (this cov_inv is passed to add_epistasis_metrics)
# ---------------------------------------------------------------------------
# Requires OUTPUT_BASE from the config cell. Prints the shapes of the stored
# cov / cov_inv arrays for one model, or a message if the pack is missing.
import numpy as np

null_cov_dir = OUTPUT_BASE / "null_cov"
tool_name = "nt500_multi"
null_pack_path = null_cov_dir / f"{tool_name}_pack.npz"
if null_pack_path.exists():
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # the with-block closes it once the arrays have been read into memory.
    # allow_pickle=True is kept from the original; the pack is written by this
    # pipeline in Step 1 -- do not point this at an untrusted file.
    with np.load(null_pack_path, allow_pickle=True) as data:
        cov = data["cov"]
        cov_inv = data["cov_inv"]
    print(f"Null cov ({tool_name}): cov {cov.shape}, cov_inv {cov_inv.shape}")
    print("Used as cov_inv in add_epistasis_metrics for non-null sources (epi_mahal, etc.)")
else:
    print(f"No null pack at {null_pack_path}")

In [None]:
# ---------------------------------------------------------------------------
# Optional: inspect combined covariance pack (from Step 2)
# ---------------------------------------------------------------------------
# Null packs live in OUTPUT_BASE / "null_cov"; combined packs in OUTPUT_BASE.
# Requires OUTPUT_BASE from the config cell.
# numpy is imported here so this optional cell also works when the (equally
# optional) null-pack inspection cell has not been run.
import numpy as np

tool_name = "nt500_multi"  # or alphagenome, spliceai, etc.
pack_path = OUTPUT_BASE / f"{tool_name}_pack.npz"
if pack_path.exists():
    # np.load on an .npz returns a lazy NpzFile that keeps the archive open;
    # read everything needed inside the with-block so the file gets closed.
    # allow_pickle=True is kept from the original; the pack is written by
    # Step 2 of this notebook -- do not point this at an untrusted file.
    with np.load(pack_path, allow_pickle=True) as data:
        cov = data["cov"]
        cov_inv = data["cov_inv"]
        print(f"{tool_name}: cov {cov.shape}, cov_inv {cov_inv.shape}")
        print("model:", data["model"])
        print("pool:", data["pool"])
else:
    print(f"No pack at {pack_path}")