In [3]:
# durg target enrichment analysis for : 

# a. whole set of causal genes
# b. larget module of causal genes
# c. partioned modules of causal genes
# b. and c. causal genes 

# 

# All membership   : /mnt/f/10_osteo_MR/results_network/largest_causal_subnet_A2_a6_g0.001832981/all_membership.tsv

# ----------------------------
# Paths
# ----------------------------
DIR_DM = "/mnt/f/0.datasets/dgidb/"
PATH_DM_DRUGS = os.path.join(DIR_DM, "1.General Information of Drug.tsv")
PATH_DM_TARGETS = os.path.join(DIR_DM, "5.General Information of Drug Therapeutic Target (DTT).tsv")
PATH_DM_MAP = os.path.join(DIR_DM, "10.Drug to DTT Mapping Information.tsv")

OUTDIR = "/mnt/f/10_osteo_MR/result_drug_target"


MOA_KEEP = {"Agonist", "Antagonist", "Inhibitor", "Modulator" } # , "Activator" }
STATUS_KEEP = [
    "Investigative", "Patented", "Approved", "Preclinical", "Clinical Trial",
    "Phase 1", "Phase 1/2", "Phase 2", "Phase 2/3", "Phase 3", "Phase 4"
]

STATUS_OTHER = "Other"

# ----------------------------
# Helpers
# ----------------------------
def clean_symbol(x):
    if pd.isna(x):
        return None
    s = str(x).strip()
    if s == "" or s.lower() in {"nan","null","none","."}:
        return None
    return s.upper()

def coerce_str(x):
    if pd.isna(x):
        return ""
    return str(x).strip()

def coerce_float(x):
    try:
        return float(x)
    except Exception:
        return np.nan

def bucket_status(s):
    s = coerce_str(s)
    # exact match first
    if s in STATUS_KEEP:
        return s
    # quick normalization hints
    low = s.lower()
    if low.startswith("phase 1/2"): return "Phase 1/2"
    if low.startswith("phase 2/3"): return "Phase 2/3"
    if low.startswith("phase 1"): return "Phase 1"
    if low.startswith("phase 2"): return "Phase 2"
    if low.startswith("phase 3"): return "Phase 3"
    if low.startswith("phase 4"): return "Phase 4"
    if "approved" in low: return "Approved"
    if "investigative" in low: return "Investigative"
    if "preclinical" in low: return "Preclinical"
    if "patented" in low: return "Patented"
    if "clinical trial" in low: return "Clinical Trial"
    return STATUS_OTHER

# Drugmap: drugs
dm_drugs = pd.read_csv(PATH_DM_DRUGS, sep="\t", dtype=str, low_memory=False)
if "DrugID" not in dm_drugs.columns:
    raise ValueError("Drug file must have 'DrugID' column.")
# Normalize
dm_drugs["DrugID"] = dm_drugs["DrugID"].astype(str).str.strip()
if "Drug_Name" not in dm_drugs.columns:
    # some releases use 'Drug_Name' or 'Drug_Name '—be forgiving
    cand = [c for c in dm_drugs.columns if c.strip().lower() == "drug_name"]
    dm_drugs["Drug_Name"] = dm_drugs[cand[0]] if cand else dm_drugs["DrugID"]
dm_drugs["Drug_Name"] = dm_drugs["Drug_Name"].astype(str).str.strip()

if "Highest_status" not in dm_drugs.columns:
    # be forgiving—search a close match
    cand = [c for c in dm_drugs.columns if c.strip().lower() == "highest_status"]
    if cand:
        dm_drugs["Highest_status"] = dm_drugs[cand[0]]
    else:
        dm_drugs["Highest_status"] = ""

dm_drugs["Status_bucket"] = dm_drugs["Highest_status"].apply(bucket_status)

# Drugmap: targets (TargetID -> Gene)
dm_targets = pd.read_csv(PATH_DM_TARGETS, sep="\t", dtype=str, low_memory=False)
need_t = {"TargetID", "Gene_Name"}
if not need_t.issubset(dm_targets.columns):
    raise ValueError("Target file must contain TargetID and Gene_Name.")
dm_targets["TargetID"] = dm_targets["TargetID"].astype(str).str.strip()
dm_targets["Gene_clean"] = dm_targets["Gene_Name"].map(clean_symbol)

# Drugmap: mapping DrugID–TargetID–MOA
dm_map = pd.read_csv(PATH_DM_MAP, sep="\t", dtype=str, low_memory=False)
need_m = {"DrugID", "TargetID", "MOA"}
if not need_m.issubset(dm_map.columns):
    raise ValueError("Mapping file must contain DrugID, TargetID, MOA.")
dm_map["DrugID"] = dm_map["DrugID"].astype(str).str.strip()
dm_map["TargetID"] = dm_map["TargetID"].astype(str).str.strip()
dm_map["MOA"] = dm_map["MOA"].astype(str).str.strip()

# Keep only desired MOA classes
dm_map = dm_map[dm_map["MOA"].isin(MOA_KEEP)].copy()

# ----------------------------
# Build Drug–Gene (causal) table with MOA
# ----------------------------
# Join TargetID->Gene
map_ = dm_map.merge(dm_targets[["TargetID", "Gene_clean"]], on="TargetID", how="left")
map_ = map_[~map_["Gene_clean"].isna()].copy()

# Attach Drug_Name and Status_bucket
drug_meta = dm_drugs[["DrugID", "Drug_Name", "Status_bucket"]].drop_duplicates()
map_ = map_.merge(drug_meta, on="DrugID", how="left")

# Deduplicate basic rows
map_ = map_[["DrugID","Drug_Name","MOA","Gene_clean","Status_bucket"]].drop_duplicates()

out_csv = os.path.join(OUTDIR, "drugmap_drug_gene_by_moa_status.csv")

map_.to_csv( out_csv, index=False) 

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import json
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# ======== CONFIG =========
# =========================

PATHS = {
    "rb_membership": "/mnt/f/10_osteo_MR/results_network/largest_causal_subnet_A2_a6_g0.001832981/all_membership.tsv",  # cols: gene, community
    "drugmap": "/mnt/f/10_osteo_MR/result_drug_target/drugmap_drug_gene_by_moa_status.csv",  # cols: DrugID, Drug_Name, MOA, Gene_clean, Status_bucket
}

OUTDIR_BASE = "/mnt/f/10_osteo_MR/result_drug_target/"

SUBDIR = "module_sig_drug"

# Include ALL statuses by default; set to {"Approved"} if you want only approved drugs.
FILTER_STATUS = None  # e.g., {"Approved"} or None

# Visualization limits
TOP_N_DRUGS_HEATMAP = 30
TOP_N_PAIRS_BUBBLE = 200
SEED = 13




#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Drug target × RB subnetwork/module enrichment (hypergeometric)
MR-PTRS-Osteo — Step 0 (minimal, two inputs only)
FDR per-drug across modules (includes 'ALL' + each module)

Inputs
------
PATHS = {
  "rb_membership": "/mnt/f/10_osteo_MR/results_network/largest_causal_subnet_A2_a6_g0.001832981/all_membership.tsv",
  "drugmap": "/mnt/f/10_osteo_MR/result_drug_target/drugmap_drug_gene_by_moa_status.csv",
}

Outputs
-------
- drug_module_enrichment.tsv     (includes fdr_per_drug and fdr_global)
- per_drug_minFDR.tsv            (based on fdr_per_drug)
- per_module_summary.tsv         (based on fdr_per_drug < 0.05)
- plots/
    - heatmap_top_drugs.png
    - bubble_top_pairs.png
- params_interaction_simple.json
"""

import os
import json
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# ======== CONFIG =========
# =========================

PATHS = {
    "rb_membership": "/mnt/f/10_osteo_MR/results_network/largest_causal_subnet_A2_a6_g0.001832981/all_membership.tsv",  # cols: gene, community
    "drugmap": "/mnt/f/10_osteo_MR/result_drug_target/drugmap_drug_gene_by_moa_status.csv",  # cols: DrugID, Drug_Name, MOA, Gene_clean, Status_bucket
}



# Include ALL statuses by default; set to {"Approved"} if you want only approved drugs.
FILTER_STATUS = None  # e.g., {"Approved"} or None

# Visualization limits
TOP_N_DRUGS_HEATMAP = 30
TOP_N_PAIRS_BUBBLE = 200
SEED = 13


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Drug target × RB module enrichment (hypergeometric)
Universe = PPI nodes ∪ all drug targets
MR-PTRS-Osteo — Step 0 (two inputs + PPI)

Key choices
-----------
- Universe U := nodes(G_ppi) ∪ {all Gene_clean in Drugmap (after any status filter)}.
- Module enrichment: for each (drug d, RB module m)
    N = |U|
    M = |m|                                      (RB module size; m ⊂ RB)
    K = |targets(d)|                             (all targets, not restricted to RB)
    k = |targets(d) ∩ m|
    p = P[X ≥ k | N, K, M]  (hypergeom.sf)
- FDR: Benjamini–Hochberg **per drug** over modules.
- Optional global table "RB vs union" (one test per drug):
    N = |U|, M = |RB|, K = |targets(d)|, k = |targets(d) ∩ RB|, FDR across drugs.

Outputs
-------
/mnt/f/10_osteo_MR/results_network/drs_v1/interaction_test_simple/
- drug_module_enrichment.tsv     (module-level; uses fdr_per_drug; includes fdr_global for reference)
- per_drug_minFDR.tsv            (based on fdr_per_drug)
- per_module_summary.tsv         (based on fdr_per_drug < 0.05)
- drug_RB_vs_union.tsv           (optional global enrichment per drug; FDR across drugs)
- plots/
    - heatmap_top_drugs.png
    - bubble_top_pairs.png
- params_interaction_simple.json
"""

import os
import json
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from scipy.stats import hypergeom
import matplotlib.pyplot as plt
import seaborn as sns

# =========================
# ======== CONFIG =========
# =========================

PATHS = {
    "rb_membership": "/mnt/f/10_osteo_MR/results_network/largest_causal_subnet_A2_a6_g0.001832981/all_membership.tsv",  # cols: gene, community
    "drugmap": "/mnt/f/10_osteo_MR/result_drug_target/drugmap_drug_gene_by_moa_status.csv",                              # cols: DrugID, Drug_Name, MOA, Gene_clean, Status_bucket
    "ppi": "/mnt/f/10_osteo_MR/datasets/ppi/ppi_all_nonduplicate.tsv",                                                   # cols: inx, gene_u, gene_v
}

#OUTDIR_BASE = "/mnt/f/10_osteo_MR/results_network/drs_v1/"
#SUBDIR = "interaction_test_simple"

# Include ALL statuses by default; set to {"Approved"} if you want only approved drugs.
FILTER_STATUS = None  # e.g., {"Approved"} or None

# Visualization limits
TOP_N_DRUGS_HEATMAP = 30
TOP_N_PAIRS_BUBBLE = 200
SEED = 13


# =========================
# ===== UTILITIES =========
# =========================

def ensure_outdirs(base, sub):
    outdir = Path(base, sub)
    plots = outdir / "plots"
    outdir.mkdir(parents=True, exist_ok=True)
    plots.mkdir(parents=True, exist_ok=True)
    return str(outdir), str(plots)

def bh_fdr_series(pvals: pd.Series) -> pd.Series:
    """
    Benjamini–Hochberg FDR for a 1-D pandas Series (ignores NaN).
    Correct implementation with reverse cumulative min.
    """
    p = pd.Series(pvals, dtype=float)
    mask = p.notna()
    if mask.sum() == 0:
        return p * np.nan
    p_non = p[mask].values.astype(float)
    n = p_non.size
    order = np.argsort(p_non, kind="mergesort")
    ranks = np.arange(1, n + 1, dtype=float)
    bh = np.empty(n, dtype=float)
    bh[order] = p_non[order] * n / ranks
    bh_sorted = bh[order]
    bh_sorted = np.minimum.accumulate(bh_sorted[::-1])[::-1]
    bh[order] = np.minimum(bh_sorted, 1.0)
    out = pd.Series(np.nan, index=p.index, dtype=float)
    out.loc[mask] = bh
    return out


# =========================
# ===== LOADERS ===========
# =========================

def load_rb_membership(path):
    rb = pd.read_csv(path, sep="\t")
    assert {"gene", "community"}.issubset(rb.columns), "RB membership must have 'gene','community'."
    rb["gene"] = rb["gene"].astype(str)
    return rb

def load_drugmap(path, filter_status=None):
    dm = pd.read_csv(path)
    need = {"DrugID", "Drug_Name", "MOA", "Gene_clean", "Status_bucket"}
    miss = need - set(dm.columns)
    assert not miss, f"Drugmap file missing columns: {miss}"
    dm["Drug_Name"] = dm["Drug_Name"].astype(str)
    dm["Gene_clean"] = dm["Gene_clean"].astype(str)
    dm["Status_bucket"] = dm["Status_bucket"].astype(str)
    if filter_status is not None:
        dm = dm[dm["Status_bucket"].isin(filter_status)].copy()
    # Deduplicate drug–gene pairs
    dm = dm.drop_duplicates(subset=["Drug_Name", "Gene_clean"])
    return dm

def load_ppi_nodes(path):
    ppi = pd.read_csv(path)
    # Expect columns: inx, gene_u, gene_v
    if ppi.shape[1] >= 3:
        ppi.columns = ["inx", "gene_u", "gene_v"]
    else:
        raise ValueError("PPI file must have at least 3 columns: inx, gene_u, gene_v")
    nodes = set(ppi["gene_u"].astype(str)) | set(ppi["gene_v"].astype(str))
    return nodes


# =========================
# === UNIVERSE & SETS =====
# =========================

def build_universe_and_modules(rb, dm, ppi_nodes):
    """
    Universe U = PPI nodes ∪ all Drugmap target genes (after status filtering).
    Modules are taken from RB membership (unchanged).
    Returns:
      U (set), modules (list), mod2genes (dict m->set), rb_genes (set), all_targets (set)
    """
    modules = sorted(rb["community"].unique())
    mod2genes = {m: set(rb.loc[rb["community"] == m, "gene"].astype(str)) for m in modules}
    rb_genes = set(rb["gene"].astype(str))
    all_targets = set(dm["Gene_clean"].astype(str))
    U = set(ppi_nodes) | set(all_targets)
    return U, modules, mod2genes, rb_genes, all_targets

def build_drug_targets(dm):
    drug2genes = defaultdict(set)
    meta = {}
    for drug, sub in dm.groupby("Drug_Name"):
        genes = set(sub["Gene_clean"])
        drug2genes[drug] = genes
        ids = sorted(set(sub["DrugID"].astype(str)))
        moa_counts = Counter(sub["MOA"].astype(str))
        status_counts = Counter(sub["Status_bucket"].astype(str))
        meta[drug] = {
            "drug_ids": ";".join(ids),
            "best_moa_hint": ",".join([m for m, _ in moa_counts.most_common(3)]),
            "status_bucket": status_counts.most_common(1)[0][0] if status_counts else "NA"
        }
    return drug2genes, meta


# =========================
# ====== TESTS ============
# =========================

def run_hypergeom_modules_union_universe(drug2genes, mod2genes, U):
    """
    Module enrichment with union-universe U (PPI nodes ∪ all targets).
    For each (drug, module):
      N = |U|
      M = |module|
      K = |targets(drug)|
      k = |targets(drug) ∩ module|
    Returns dataframe with pval, fdr_global (reference), fdr_per_drug (used).
    """
    N = len(U)
    rows = []
    for drug, tg in drug2genes.items():
        K = len(tg)
        for mod_name, mg in mod2genes.items():
            M = len(mg)
            if K == 0 or M == 0:
                pval = 1.0; k = 0
            else:
                k = len(tg & mg)
                pval = hypergeom.sf(k-1, N, K, M) if k > 0 else 1.0
            rows.append((drug, mod_name, K, M, N, k, pval))
    res = pd.DataFrame(rows, columns=["drug","module","K","M","N","k","pval"])
    res["fdr_global"] = bh_fdr_series(res["pval"])
    res["fdr_per_drug"] = np.nan
    for drug, idx in res.groupby("drug").indices.items():
        idx = list(idx)
        res.loc[idx, "fdr_per_drug"] = bh_fdr_series(res.loc[idx, "pval"])
    return res

def run_rb_vs_union(drug2genes, rb_genes, U):
    """
    Global RB enrichment per drug with union-universe U.
      N = |U|
      M = |RB|
      K = |targets(drug)|
      k = |targets(drug) ∩ RB|
    Returns dataframe with FDR across drugs.
    """
    N = len(U); M = len(rb_genes)
    rows = []
    for drug, tg in drug2genes.items():
        K = len(tg)
        k = len(tg & rb_genes)
        pval = hypergeom.sf(k-1, N, K, M) if (K>0 and M>0 and k>0) else 1.0
        rows.append((drug, K, M, N, k, pval))
    res = pd.DataFrame(rows, columns=["drug","K","M","N","k","pval"])
    res["fdr_across_drugs"] = bh_fdr_series(res["pval"])
    return res


# =========================
# ==== SUMMARIES/plots ====
# =========================

def add_overlap_genes(enrich_df, drug2genes, mod2genes):
    glist = []
    for r in enrich_df.itertuples(index=False):
        tg = drug2genes.get(r.drug, set())
        mg = mod2genes.get(r.module, set())
        ol = sorted(tg & mg)
        glist.append(";".join(ol))
    out = enrich_df.copy()
    out["overlap_genes"] = glist
    return out

def summarize_per_drug(enrich_df, drug_meta, alpha=0.05):
    idx = enrich_df.groupby("drug")["fdr_per_drug"].idxmin()
    best = enrich_df.loc[idx, ["drug","module","fdr_per_drug","pval","K","M","k"]].copy()
    best = best.rename(columns={
        "module":"best_module","fdr_per_drug":"min_fdr_per_drug","pval":"pval_at_best",
        "K":"K_at_best","M":"M_at_best","k":"k_at_best"
    })
    sig_counts = enrich_df[enrich_df["fdr_per_drug"] < alpha].groupby("drug").size().rename("n_sig_modules")
    out = best.merge(sig_counts, on="drug", how="left").fillna({"n_sig_modules":0})
    meta_df = pd.DataFrame.from_dict(drug_meta, orient="index").reset_index().rename(columns={"index":"drug"})
    out = out.merge(meta_df, on="drug", how="left")
    out = out[["drug","drug_ids","best_moa_hint","status_bucket",
               "min_fdr_per_drug","best_module","K_at_best","M_at_best","k_at_best","pval_at_best","n_sig_modules"]]
    out = out.rename(columns={"drug_ids":"drug_id"})
    return out.sort_values(["min_fdr_per_drug","pval_at_best","k_at_best"], ascending=[True,True,False])

def summarize_per_module(enrich_df, alpha=0.05, top_k=5):
    sig = enrich_df[enrich_df["fdr_per_drug"] < alpha].copy()
    if sig.empty:
        return pd.DataFrame(columns=["module","n_enriched_drugs","top_drugs_by_fdr","module_size_M"])
    top_per_mod = (sig.sort_values(["module","fdr_per_drug","pval","k"], ascending=[True,True,True,False])
                     .groupby("module")
                     .apply(lambda df: ";".join(df["drug"].head(top_k)))
                     .rename("top_drugs_by_fdr"))
    mod_size = enrich_df.groupby("module")["M"].max().rename("module_size_M")
    counts = sig.groupby("module").size().rename("n_enriched_drugs")
    out = pd.concat([counts, top_per_mod, mod_size], axis=1).reset_index()
    return out.sort_values("n_enriched_drugs", ascending=False)

def plot_heatmap(enrich_df, per_drug_df, plots_dir, top_n=30):
    if enrich_df.empty or per_drug_df.empty:
        return None
    top_drugs = per_drug_df.head(top_n)["drug"].tolist()
    sub = enrich_df[enrich_df["drug"].isin(top_drugs)].copy()
    sub["mlog10_fdr"] = -np.log10(sub["fdr_per_drug"].replace(0, np.nextafter(0,1))).clip(upper=20)
    mat = sub.pivot(index="module", columns="drug", values="mlog10_fdr").fillna(0.0)
    mat = mat.loc[(mat > 0).any(axis=1)]
    if mat.empty:
        return None
    plt.figure(figsize=(max(8, 0.35*mat.shape[1]+6), max(6, 0.2*mat.shape[0]+3)))
    sns.heatmap(mat, cmap="mako", linewidths=0.1, linecolor="gray")
    plt.title(f"-log10 (FDR per-drug) for (module × top-{top_n} drugs)")
    plt.xlabel("Drug")
    plt.ylabel("RB Module")
    out = os.path.join(plots_dir, "heatmap_top_drugs.png")
    plt.tight_layout()
    plt.savefig(out, dpi=200)
    plt.close()
    return out

def plot_bubble_top_pairs(enrich_df, plots_dir, top_pairs=200):
    if enrich_df.empty:
        return None
    df = enrich_df.copy()
    df["mlog10_fdr"] = -np.log10(df["fdr_per_drug"].replace(0, np.nextafter(0,1)))
    df = df.sort_values(["fdr_per_drug","pval","k"], ascending=[True,True,False]).head(top_pairs)
    if df.empty:
        return None
    plt.figure(figsize=(10, 7))
    ax = sns.scatterplot(
        data=df,
        x="drug", y="module",
        size="k", sizes=(40, 800),
        hue="mlog10_fdr", palette="viridis", legend=False, alpha=0.85
    )
    # numeric colorbar
    norm = plt.Normalize(df["mlog10_fdr"].min(), df["mlog10_fdr"].max())
    sm = plt.cm.ScalarMappable(cmap="viridis", norm=norm); sm.set_array([])
    # cbar = plt.colorbar(sm); cbar.set_label("-log10 FDR (per-drug)")
    plt.xticks(rotation=45, ha="right")
    plt.title(f"Top {top_pairs} drug × RB-module pairs by per-drug enrichment")
    plt.xlabel("Drug"); plt.ylabel("RB Module")
    out = os.path.join(plots_dir, "bubble_top_pairs.png")
    plt.tight_layout(); plt.savefig(out, dpi=200); plt.close()
    return out


# =========================
# ========= MAIN ==========
# =========================

def main():
    np.random.seed(SEED)
    outdir, plots_dir = ensure_outdirs(OUTDIR_BASE, SUBDIR)

    # Save params for traceability
    params = {
        "paths": PATHS,
        "filter_status": None if FILTER_STATUS is None else sorted(FILTER_STATUS),
        "universe": "U = PPI nodes ∪ all Drugmap targets",
        "test_modules": "hypergeometric (P[X>=k]) over modules; BH-FDR per drug",
        "test_RB_union": "optional RB vs union per drug; BH-FDR across drugs",
        "outputs_dir": outdir
    }
    with open(os.path.join(outdir, "params_interaction_simple.json"), "w") as f:
        json.dump(params, f, indent=2)

    print("[INFO] Loading RB membership...")
    rb = load_rb_membership(PATHS["rb_membership"])
    print(f"[INFO] RB nodes: {rb['gene'].nunique()}, modules: {rb['community'].nunique()}")

    print("[INFO] Loading Drugmap 2.0...")
    dm = load_drugmap(PATHS["drugmap"], filter_status=FILTER_STATUS)
    print(f"[INFO] Drugmap entries (post-filter & dedup): {len(dm)}")

    print("[INFO] Loading PPI and building universe...")
    ppi_nodes = load_ppi_nodes(PATHS["ppi"])
    U, modules, mod2genes, rb_genes, all_targets = build_universe_and_modules(rb, dm, ppi_nodes)

    print( mod2genes.keys(), len( mod2genes[ 0] )  ) 
    print(f"[INFO] Universe size |U| = {len(U)} (|PPI nodes|={len(ppi_nodes)}, |all targets|={len(all_targets)})")

    drug2genes, drug_meta = build_drug_targets(dm)

    print("[INFO] Running module enrichment with union-universe (per-drug FDR over modules)...")
    enrich_mod = run_hypergeom_modules_union_universe(drug2genes, mod2genes, U)
    if enrich_mod.empty:
        print("[WARN] No module pairs were tested; check inputs."); return

    # overlap gene lists + metadata
    enrich_mod = add_overlap_genes(enrich_mod, drug2genes, mod2genes)
    meta_df = pd.DataFrame.from_dict(drug_meta, orient="index").reset_index().rename(columns={"index":"drug"})
    enrich_mod = enrich_mod.merge(meta_df, on="drug", how="left")
    enrich_mod = enrich_mod[["drug","drug_ids","best_moa_hint","status_bucket","module","K","M","N","k","pval","fdr_per_drug","fdr_global","overlap_genes"]]
    pair_path = os.path.join(outdir, "drug_module_enrichment.tsv")
    enrich_mod.to_csv(pair_path, sep="\t", index=False)
    print(f"[OK] Saved module enrichment table: {pair_path}")

    # per-drug and per-module summaries (use per-drug FDR)
    per_drug = summarize_per_drug(enrich_mod, drug_meta, alpha=0.05)
    per_drug_path = os.path.join(outdir, "per_drug_minFDR.tsv")
    per_drug.to_csv(per_drug_path, sep="\t", index=False)
    print(f"[OK] Saved per-drug summary (modules): {per_drug_path}")

    per_module = summarize_per_module(enrich_mod, alpha=0.05, top_k=5)
    per_module_path = os.path.join(outdir, "per_module_summary.tsv")
    per_module.to_csv(per_module_path, sep="\t", index=False)
    print(f"[OK] Saved per-module summary: {per_module_path}")

    # Plots (modules only)
    hm_path = plot_heatmap(enrich_mod, per_drug, plots_dir, top_n=TOP_N_DRUGS_HEATMAP)
    if hm_path: print(f"[OK] Heatmap saved: {hm_path}")
    #bb_path = plot_bubble_top_pairs(enrich_mod, plots_dir, top_pairs=TOP_N_PAIRS_BUBBLE)
    #if bb_path: print(f"[OK] Bubble plot saved: {bb_path}")

    # Optional: RB vs union (one test per drug)
    print("[INFO] Running RB vs union enrichment per drug...")
    rb_vs_union = run_rb_vs_union(drug2genes, rb_genes, U)
    rb_vs_union = rb_vs_union.merge(meta_df, on="drug", how="left")
    rb_vs_union = rb_vs_union[["drug","drug_ids","best_moa_hint","status_bucket","K","M","N","k","pval","fdr_across_drugs"]]
    rb_union_path = os.path.join(outdir, "drug_RB_vs_union.tsv")
    rb_vs_union.to_csv(rb_union_path, sep="\t", index=False)
    print(f"[OK] Saved RB vs union enrichment per-drug: {rb_union_path}")

    # Console peek
    print("\n[TOP 20 module pairs by per-drug FDR]")
    print(enrich_mod.sort_values(["fdr_per_drug","pval","k"], ascending=[True,True,False]).head(20).to_string(index=False, max_colwidth=60))

    print("\n=== DONE (module enrichment with union-universe) ===")
    print(f"All files under: {outdir}")
    print("Key files:")
    print(f"  - Module pairs:       {pair_path}")
    print(f"  - Per-drug summary:   {per_drug_path}")
    print(f"  - Per-module summary: {per_module_path}")
    if hm_path: print(f"  - Heatmap:            {hm_path}")
    #if bb_path: print(f"  - Bubble:             {bb_path}")
    print(f"  - RB vs union:        {rb_union_path}")

if __name__ == "__main__":
    main()






[INFO] Loading RB membership...
[INFO] RB nodes: 2470, modules: 12
[INFO] Loading Drugmap 2.0...
[INFO] Drugmap entries (post-filter & dedup): 40562
[INFO] Loading PPI and building universe...
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]) 450
[INFO] Universe size |U| = 16898 (|PPI nodes|=16201, |all targets|=2574)
[INFO] Running module enrichment with union-universe (per-drug FDR over modules)...
[OK] Saved module enrichment table: /mnt/f/10_osteo_MR/result_drug_target/module_sig_drug/drug_module_enrichment.tsv
[OK] Saved per-drug summary (modules): /mnt/f/10_osteo_MR/result_drug_target/module_sig_drug/per_drug_minFDR.tsv
[OK] Saved per-module summary: /mnt/f/10_osteo_MR/result_drug_target/module_sig_drug/per_module_summary.tsv


  .apply(lambda df: ";".join(df["drug"].head(top_k)))


[OK] Heatmap saved: /mnt/f/10_osteo_MR/result_drug_target/module_sig_drug/plots/heatmap_top_drugs.png
[INFO] Running RB vs union enrichment per drug...
[OK] Saved RB vs union enrichment per-drug: /mnt/f/10_osteo_MR/result_drug_target/module_sig_drug/drug_RB_vs_union.tsv

[TOP 20 module pairs by per-drug FDR]
                                         drug drug_ids        best_moa_hint status_bucket  module  K   M     N  k         pval  fdr_per_drug   fdr_global                                                overlap_genes
                              PMID23489211C20  DMDYC4J            Inhibitor Investigative       6 13 227 16898 13 3.284251e-25  3.941101e-24 1.130071e-19 EPHA1;EPHA2;EPHA3;EPHA4;EPHA5;EPHA6;EPHA7;EPHA8;EPHB1;EPH...
                                      MK-2461  DM21WBH            Inhibitor     Phase 1/2       6 13 227 16898 12 3.313855e-22  3.976626e-21 5.701289e-17 FGFR1;FGFR2;FGFR3;FLT1;FLT3;FLT4;KDR;MET;MST1R;NTRK2;NTRK...
                              PMID19788238C66