# Run full epistasis pipeline

**Step 1:** Run the whole dataframe. All double-variant IDs are processed; embeddings are stored under `{output_base}/{source}/{model_key}.db` (split by the `source` column). Null is processed first so null covariance is saved and used for Mahalanobis metrics on non-null sources.

**Step 2:** Pass a part of that dataframe (e.g. null only, or null + one other source). `compute_cov_inv` uses the embeddings in the corresponding source DBs and **returns** `(cov, cov_inv)` directly—no separate save step.

Set paths and options in the config cell, then run Step 1 and Step 2.

**Environment control:** Different models need different conda environments (e.g. AlphaGenome needs JAX; Evo2 needs its own stack). Set **ENV_PROFILE** in the config cell and run this notebook in the matching environment:
- `"alphagenome"` — run only AlphaGenome (use AlphaGenome env)
- `"evo2"` — run only Evo2 (use Evo2 env)
- `"main"` — run all other models (shared env; excludes alphagenome & evo2)
- `"all"` — run every model (only if all deps in one env)

In [None]:
# ---------------------------------------------------------------------------
# Config: paths and options
# ---------------------------------------------------------------------------
# Run with project root as current working directory (so "from notebooks.process_epistasis" works).
# All data (embeddings, CSVs) lives under EPISTASIS_PAPER_ROOT. Override with env EPISTASIS_PAPER_ROOT.
import sys
import warnings
from pathlib import Path


def find_repo_root(start, marker=Path("notebooks") / "paper_data_config.py", max_up=4):
    """Return the nearest directory at or above *start* that contains *marker*.

    Args:
        start: Directory where the search begins.
        marker: Path, relative to a candidate directory, whose existence
            identifies the repo root.
        max_up: Maximum number of parent levels to inspect above *start*.

    Returns:
        The first matching directory as a ``Path``, or ``None`` when neither
        *start* nor any of its first *max_up* parents contains *marker*.
    """
    start = Path(start)
    # list(...) before slicing: Path.parents only supports slices on newer Pythons.
    for candidate in (start, *list(start.parents)[:max_up]):
        if (candidate / marker).exists():
            return candidate
    return None


# Ensure repo root is on path so "notebooks" can be imported
ROOT = find_repo_root(Path.cwd())
if ROOT is None:
    # Do not push an unrelated ancestor directory onto sys.path; stay on the
    # working directory and let the imports that follow report the real problem.
    warnings.warn(
        "Could not locate notebooks/paper_data_config.py at or above "
        f"{Path.cwd()}; using the current working directory as ROOT."
    )
    ROOT = Path.cwd()
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from notebooks.process_epistasis import get_model_keys_for_env
from notebooks.paper_data_config import EPISTASIS_PAPER_ROOT, data_dir, embeddings_dir

# Choose the set of models that *this* kernel/environment is able to run:
#   "alphagenome" -> only AlphaGenome (use in AlphaGenome conda env)
#   "evo2"        -> only Evo2 (use in Evo2 conda env)
#   "main"        -> all models except alphagenome & evo2 (shared env)
#   "all"         -> every model (only if all deps in one env)
ENV_PROFILE = "main"  # <-- single point of control: change per environment
MODEL_KEYS = get_model_keys_for_env(ENV_PROFILE)

# Echo the resolved settings so the active profile is visible in the cell output.
print(
    f"ENV_PROFILE={ENV_PROFILE!r} -> MODEL_KEYS={MODEL_KEYS}",
    f"Data root: {EPISTASIS_PAPER_ROOT}",
    sep="\n",
)

# Input mode. Two ways of feeding Step 1:
#   * USE_SINGLE_DATAFRAME = True  -> read one table (e.g. epistasis_aggregated.csv) that has
#     the columns epistasis_id and SOURCE_COL; storage is split by the value found in
#     SOURCE_COL, i.e. {output_base}/{source}/{model}.db.
#   * USE_SINGLE_DATAFRAME = False -> use the SOURCES list defined below (one CSV per source).
USE_SINGLE_DATAFRAME = False
# Only read when USE_SINGLE_DATAFRAME is True.
SINGLE_DATAFRAME_PATH = data_dir() / "epistasis_aggregated.csv"
# Column of the single dataframe that names the storage subdirectory for each row.
SOURCE_COL = "source"

# (source_name, csv file name) pairs, used when USE_SINGLE_DATAFRAME is False.
# File names are resolved against the paper data directory when SOURCES is built.
_SOURCE_FILES = (
    ("null", "null_epistasis.csv"),
    ("fas_analysis", "fas_subset.csv"),
    ("mst1r_analysis", "mst1r_subset.csv"),
    ("kras", "kras_subset.csv"),
    # ("tcga_analysis", "tcga_subset_doubles.csv"),
    # ("okgp_analysis", "okgp_subset_clean.csv"),
)
SOURCES = [(_name, data_dir() / _filename) for _name, _filename in _SOURCE_FILES]

# Base directory handed to the pipeline as output_base.
OUTPUT_BASE = embeddings_dir()
# Name of the identifier column in the input tables.
ID_COL = "epistasis_id"
# batch_size passed to add_epistasis_metrics (e.g. 8 => 32 sequences per batch).
BATCH_SIZE = 8

# OpenSpliceAI checkpoint dir for SpliceAI (or set env OPENSPLICEAI_MODEL_DIR),
# e.g. "/path/to/openspliceai-mane/10000nt"
SPLICEAI_MODEL_DIR = None

In [None]:
# ---------------------------------------------------------------------------
# Step 1: Process all sources (null first); one .db per model per source
# ---------------------------------------------------------------------------
# Either run from a single dataframe (storage split by SOURCE_COL) or from SOURCES list.
# Outputs of this cell used later: df_all (dataframe or None) and SOURCE_NAMES.
import logging
import pandas as pd
from notebooks.process_epistasis import run_sources, run_from_single_dataframe

logging.basicConfig(level=logging.INFO)

# Keyword arguments shared by both runners, kept in one place so the two
# input modes cannot drift apart.
_common_kwargs = dict(
    output_base=OUTPUT_BASE,
    model_keys=MODEL_KEYS,
    spliceai_model_dir=SPLICEAI_MODEL_DIR,
    id_col=ID_COL,
    show_progress=True,
    force=False,
    batch_size=BATCH_SIZE,
)

_use_single = (
    USE_SINGLE_DATAFRAME
    and SINGLE_DATAFRAME_PATH is not None
    and Path(SINGLE_DATAFRAME_PATH).exists()
)
if USE_SINGLE_DATAFRAME and not _use_single:
    # Single-dataframe mode was requested but cannot be honoured; say so
    # instead of switching to the per-source CSVs without any notice.
    logging.warning(
        "USE_SINGLE_DATAFRAME is True but SINGLE_DATAFRAME_PATH (%s) is unset or "
        "does not exist; falling back to the SOURCES list.",
        SINGLE_DATAFRAME_PATH,
    )

if _use_single:
    df_all = pd.read_csv(SINGLE_DATAFRAME_PATH)
    run_from_single_dataframe(df_all, source_col=SOURCE_COL, **_common_kwargs)
    # For Step 2: source names in same order (null first)
    _unique = df_all[SOURCE_COL].dropna().astype(str).unique().tolist()
    SOURCE_NAMES = [s for s in _unique if s == "null"] + [s for s in _unique if s != "null"]
else:
    df_all = None  # so Step 2 can check and use a small df for source subset
    run_sources(SOURCES, **_common_kwargs)
    SOURCE_NAMES = [name for name, _ in SOURCES]

In [None]:
# ---------------------------------------------------------------------------
# Step 2: Compute cov_inv from a part of the dataframe — returns (cov, cov_inv) directly
# ---------------------------------------------------------------------------
# Pass a subset of your data (by source). You get back a dict: {model_key: (cov, cov_inv)}.
import pandas as pd
from notebooks.process_epistasis import compute_cov_inv

# Define which sources to use (e.g. ["null"], or ["null", "fas_analysis"]).
# Single point of control: both branches below read this list.
COV_SOURCES = ["null"]

# df_all only exists if the Step 1 cell ran in this kernel; treat "not defined" like None.
_df = globals().get("df_all")
if _df is not None:
    # Step 1 ran from a single dataframe: keep only the rows of the chosen sources.
    df_subset = _df[_df[SOURCE_COL].isin(COV_SOURCES)]
else:
    # Otherwise pass a small dataframe that carries just the source names.
    df_subset = pd.DataFrame({SOURCE_COL: COV_SOURCES})

cov_inv_by_model = compute_cov_inv(
    OUTPUT_BASE,
    source_df=df_subset,
    source_col=SOURCE_COL,
    model_keys=MODEL_KEYS,
    method="ledoit_wolf",
    show_progress=True,
)
# Example: get arrays for one model
# cov, cov_inv = cov_inv_by_model["nt500_multi"]
# cov.shape, cov_inv.shape
print("Returned cov_inv for models:", list(cov_inv_by_model.keys()))
for model_key, (cov, cov_inv) in cov_inv_by_model.items():
    print(f"  {model_key}: cov {cov.shape}, cov_inv {cov_inv.shape}")

In [None]:
# Unpack for one model (e.g. for downstream use or add_epistasis_metrics with cov_inv=...)
# cov, cov_inv = cov_inv_by_model["nt500_multi"]

In [None]:
# Optional: save cov_inv to .npz if you need it on disk later
# from notebooks.process_epistasis import run_covariance_and_save
# run_covariance_and_save(OUTPUT_BASE, SOURCE_NAMES, model_keys=MODEL_KEYS, out_npz_dir=OUTPUT_BASE)