# 12 Beta meta-integration cross-omics

**Origin:** `1_2_beta_merged_meta.ipynb`  
**This annotated version was generated on:** 2025-10-13 06:41

**What this notebook does (high level):**  
- Merge MR-derived effect sizes (β) across modalities and meta-analyze to produce unified causal weights per gene.

**How to use:**  
1. Review the markdown notes before each code cell.  
2. Adjust input/output paths as needed for your environment.  
3. Run cell-by-cell to reproduce artifacts for downstream steps.

---


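**Step 1:** Define the meta-analysis helpers (fixed/random-effects IVW pooling, Deming calibration), load the four per-study weighted-β tables, and write the cross-modal meta-analysis result.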

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import chi2, norm

# ---------- helpers ----------
def ivw_fixed(b, se):
    """Fixed-effect inverse-variance-weighted (IVW) pooling.

    Parameters
    ----------
    b, se
        Per-study estimates and their standard errors. Both must support
        element-wise arithmetic (e.g. NumPy arrays of equal length).

    Returns
    -------
    tuple
        ``(beta, se_pooled, z, p)``: the pooled estimate, its standard error,
        the z statistic ``beta / se_pooled`` and the two-sided normal p-value.
    """
    # Each study is weighted by the inverse of its sampling variance.
    weights = 1.0 / np.square(se)
    weight_total = np.sum(weights)

    beta = np.sum(weights * b) / weight_total
    se_pooled = np.sqrt(1.0 / weight_total)

    z = beta / se_pooled
    return beta, se_pooled, z, 2 * norm.sf(abs(z))

def heterogeneity(b, se, beta_fe):
    """Cochran's Q heterogeneity statistics around a fixed-effect estimate.

    Parameters
    ----------
    b, se
        Per-study estimates and standard errors (NumPy arrays of equal length).
    beta_fe
        The fixed-effect pooled estimate the studies are compared against.

    Returns
    -------
    tuple
        ``(q, df, p_q, i2, tau2)`` where ``q`` is the inverse-variance-weighted
        sum of squared deviations from ``beta_fe``, ``df = len(b) - 1``,
        ``p_q`` is the upper-tail chi-square p-value of ``q``, ``i2`` is
        ``(q - df) / q`` truncated at 0 (a fraction, not a percentage) and
        ``tau2`` is the method-of-moments between-study variance
        ``(q - df) / c`` truncated at 0.
        With a single study (``df <= 0``) ``p_q`` is NaN and ``i2`` / ``tau2``
        are 0.0.
    """
    w = 1.0 / (se**2)
    q = np.sum(w * (b - beta_fe)**2)
    df = len(b) - 1

    if df <= 0:
        # Heterogeneity is undefined with fewer than two studies.
        return q, df, np.nan, 0.0, 0.0

    # Survival function instead of ``1 - cdf``: the subtraction cancels to
    # exactly 0.0 once the tail probability drops below ~1e-16.
    p_q = chi2.sf(q, df)

    excess = q - df
    i2 = max(0.0, excess / q) if q > 0 else 0.0

    # Scaling constant of the moment estimator for tau^2.
    c = np.sum(w) - (np.sum(w**2) / np.sum(w))
    tau2 = max(0.0, excess / c) if c > 0 else 0.0
    return q, df, p_q, i2, tau2

def ivw_random_dl(b, se):
    """Random-effects IVW pooling with a moment-based between-study variance.

    The fixed-effect estimate from ``ivw_fixed`` is fed to ``heterogeneity``
    to obtain ``tau2``; each study is then re-weighted by
    ``1 / (se**2 + tau2)``.

    Parameters
    ----------
    b, se
        Per-study estimates and standard errors (NumPy arrays of equal length).

    Returns
    -------
    tuple
        ``(beta, se_pooled, z, p, q, df, tau2)``: the random-effects pooled
        estimate, its standard error, z statistic and two-sided p-value,
        followed by Cochran's Q, its degrees of freedom and ``tau2``.
    """
    # Only the fixed-effect point estimate is needed to compute Q.
    fixed_beta = ivw_fixed(b, se)[0]
    q, df, _p_q, _i2, tau2 = heterogeneity(b, se, fixed_beta)

    # Inflate every study's variance by the between-study component.
    re_weights = 1.0 / (se**2 + tau2)
    weight_total = np.sum(re_weights)

    beta = np.sum(re_weights * b) / weight_total
    se_pooled = np.sqrt(1.0 / weight_total)
    z = beta / se_pooled
    p = 2 * norm.sf(abs(z))
    return beta, se_pooled, z, p, q, df, tau2

def meta_within_platform(df_list, labels, use_random=False, min_studies=1,
                         platforms=None):
    """Pool per-gene estimates across the studies of each platform family.

    Parameters
    ----------
    df_list : sequence of pandas.DataFrame
        One table per study; each must contain the columns ``gene``,
        ``weighted_beta`` and ``weighted_se``.
    labels : sequence of str
        Study label for each table, in the same order as ``df_list``.
    use_random : bool, default False
        If True, groups with two or more estimates are pooled with
        ``ivw_random_dl``; otherwise ``ivw_fixed`` is used.
    min_studies : int, default 1
        Groups with fewer estimates than this are not pooled; their first
        estimate is carried through unchanged.
    platforms : sequence of str, optional
        Platform family of each study, aligned with ``labels``. Defaults to
        the historical mapping ``("eqtl", "eqtl", "pqtl", "pqtl")`` truncated
        to ``len(labels)``.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns ``gene`` and ``{stat}_{platform}`` for
        ``stat`` in ``beta``, ``se``, ``k`` (number of estimates in the group).
        If no usable row exists, an empty frame with only ``gene`` is returned.

    Raises
    ------
    ValueError
        If a table lacks a required column, or if ``df_list``, ``labels`` and
        ``platforms`` differ in length (previously extra studies were silently
        dropped).
    """
    df_list = list(df_list)
    labels = list(labels)
    if len(df_list) != len(labels):
        raise ValueError(
            f"df_list has {len(df_list)} tables but labels has {len(labels)} entries"
        )
    if platforms is None:
        # Historical default for the four-input layout; shorter label lists
        # keep using the leading entries, as they did before.
        platforms = ["eqtl", "eqtl", "pqtl", "pqtl"][:len(labels)]
    platforms = list(platforms)
    if len(platforms) != len(labels):
        raise ValueError(
            f"Need one platform per study: got {len(platforms)} platforms "
            f"for {len(labels)} labels"
        )

    # Stack all studies into one long table of (gene, beta, se, study).
    frames = []
    for lab, study_df in zip(labels, df_list):
        req = {"gene", "weighted_beta", "weighted_se"}
        if not req.issubset(study_df.columns):
            raise ValueError(f"Missing columns in {lab}: need {req}")
        d = study_df[["gene", "weighted_beta", "weighted_se"]].rename(
            columns={"weighted_beta": "beta", "weighted_se": "se"}
        ).dropna()
        d = d.astype({"beta": float, "se": float})
        # Inverse-variance weights are undefined for non-finite or
        # non-positive standard errors, so such rows cannot be pooled.
        usable = np.isfinite(d["beta"]) & np.isfinite(d["se"]) & (d["se"] > 0)
        d = d[usable].copy()
        if d.empty:
            continue
        d["study"] = lab
        frames.append(d)

    if not frames:
        return pd.DataFrame(columns=["gene"])

    long = pd.concat(frames, ignore_index=True)
    long["platform"] = long["study"].map(dict(zip(labels, platforms)))

    pooled = []
    for (g, fam), sub in long.groupby(["gene", "platform"]):
        b = sub["beta"].to_numpy(float)
        s = sub["se"].to_numpy(float)
        k = len(b)
        if k < min_studies:
            # Too few estimates to pool: keep the first one as-is.
            # NOTE(review): for 1 < k < min_studies this discards the other
            # estimates while still reporting k; confirm this is intended.
            beta, se = b[0], s[0]
        elif use_random and k >= 2:
            beta, se = ivw_random_dl(b, s)[:2]
        else:
            beta, se = ivw_fixed(b, s)[:2]
        pooled.append({"gene": g, "platform": fam,
                       "beta": beta, "se": se, "k": k})

    if not pooled:
        return pd.DataFrame(columns=["gene"])

    # One row per gene, one column per (statistic, platform) pair.
    wide = pd.DataFrame(pooled).pivot(
        index="gene", columns="platform", values=["beta", "se", "k"]
    )
    wide.columns = [f"{stat}_{fam}" for stat, fam in wide.columns]
    return wide.reset_index()

# ---- Deming regression to calibrate pqtl→eqtl (errors-in-variables) ----
def deming_fit(x, y, se_x=None, se_y=None):
    """
    Fit y = a + b x by Deming regression (errors in both x and y).

    The slope formula used here takes the error-variance ratio
    lambda = Var(e_y) / Var(e_x), approximated by
    mean(se_y^2) / mean(se_x^2) (NaNs ignored). As lambda grows (x measured
    almost without error) the slope tends to the ordinary least-squares
    slope Sxy / Sxx; as lambda shrinks it tends to Syy / Sxy.

    Parameters
    ----------
    x, y : array-like
        Paired observations of equal length.
    se_x, se_y : array-like, optional
        Standard errors of x and y. If either is missing, or the resulting
        ratio is not a finite positive number, lambda = 1 is used.

    Returns
    -------
    (a, b) : intercept and slope. Uncertainty of a, b is not estimated.
    Both are NaN when the slope is undefined (fewer than two points, or zero
    sample covariance between x and y).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    n = len(x)
    if n < 2:
        # Sample (co)variances need at least two points.
        return np.nan, np.nan

    xbar, ybar = x.mean(), y.mean()
    X = x - xbar
    Y = y - ybar
    Sxx = np.sum(X * X) / (n - 1)
    Syy = np.sum(Y * Y) / (n - 1)
    Sxy = np.sum(X * Y) / (n - 1)

    lam = 1.0
    if se_x is not None and se_y is not None:
        var_ex = np.nanmean(np.square(np.asarray(se_x, dtype=float)))
        var_ey = np.nanmean(np.square(np.asarray(se_y, dtype=float)))
        with np.errstate(divide="ignore", invalid="ignore"):
            # y-error over x-error: the orientation the slope formula expects.
            ratio = var_ey / var_ex
        if np.isfinite(ratio) and ratio > 0:
            lam = ratio

    if Sxy == 0:
        # The slope formula divides by Sxy; no linear relation to calibrate.
        return np.nan, np.nan

    # Deming slope
    delta = Syy - lam * Sxx
    b = (delta + np.sqrt(delta**2 + 4 * lam * Sxy**2)) / (2 * Sxy)
    a = ybar - b * xbar
    return a, b

def apply_linear_transform(beta, se, a, b):
    """
    Re-express an estimate and its standard error on the scale y = a + b*x.

    The calibration coefficients (a, b) are treated as exact, so only the
    input uncertainty is propagated: Var(y) ≈ b^2 Var(x), i.e. the standard
    error is scaled by |b| and is unaffected by the intercept.

    Returns the pair (transformed beta, transformed se).
    """
    scale = np.abs(b)
    return a + b * beta, scale * se

# ---------- main wrapper ----------
def cross_modal_meta(
    eqtlgen_path, gtex_path, decode_path, ukbppp_path,
    use_random_within=False,
    require_overlap_for_calibration=True
):
    """Combine eQTL- and pQTL-based per-gene estimates on the eQTL scale.

    Steps:
      1. Read the four tab-separated tables and pool them within platform
         (eqtlgen + gtex -> eqtl, decode + ukbppp -> pqtl) via
         ``meta_within_platform``.
      2. Using genes that have both pooled estimates, fit a Deming
         regression eqtl = a + b * pqtl. With fewer than 10 such genes, or
         when the fit is not usable, the identity mapping (a=0, b=1) is used.
      3. Per gene, map the pQTL estimate onto the eQTL scale and combine it
         with the eQTL estimate by fixed-effect IVW; a gene with a single
         estimate keeps that estimate unchanged.

    Parameters
    ----------
    eqtlgen_path, gtex_path, decode_path, ukbppp_path
        Paths of the per-study tables, passed to ``pd.read_csv(sep="\\t")``.
    use_random_within : bool, default False
        Forwarded to ``meta_within_platform`` as ``use_random``.
    require_overlap_for_calibration : bool, default True
        Currently unused; accepted for backward compatibility.

    Returns
    -------
    pandas.DataFrame
        Sorted by gene, with columns ``gene``, ``meta_beta_common``,
        ``meta_se_common``, ``k_final`` (1 or 2), ``used`` (";"-joined source
        labels), ``a_calib`` and ``b_calib``. Genes with no estimate are
        omitted; the frame is empty (same columns) if no gene qualifies.
    """
    out_columns = [
        "gene", "meta_beta_common", "meta_se_common",
        "k_final", "used", "a_calib", "b_calib",
    ]
    min_overlap = 10  # genes with both platforms needed to fit a calibration

    labels = ["eqtlgen", "gtex", "decode", "ukbppp"]
    paths = [eqtlgen_path, gtex_path, decode_path, ukbppp_path]
    dfs = [pd.read_csv(p, sep="\t") for p in paths]

    # 1) within-modality pooling
    wide = meta_within_platform(dfs, labels, use_random=use_random_within, min_studies=1)
    # A platform with no usable rows produces no columns; add them as NaN so
    # the steps below can treat "missing platform" and "missing gene" alike.
    for col in ("beta_eqtl", "se_eqtl", "beta_pqtl", "se_pqtl"):
        if col not in wide.columns:
            wide[col] = np.nan

    # 2) calibration using genes with both
    both = wide.dropna(subset=["beta_eqtl", "se_eqtl", "beta_pqtl", "se_pqtl"])
    a, b = 0.0, 1.0  # identity fallback
    if len(both) >= min_overlap:
        a_fit, b_fit = deming_fit(
            x=both["beta_pqtl"].values,
            y=both["beta_eqtl"].values,
            se_x=both["se_pqtl"].values,
            se_y=both["se_eqtl"].values
        )
        # A non-finite or zero slope would turn every transformed pQTL
        # estimate into NaN or give it a zero SE, so keep the identity then.
        if np.isfinite(a_fit) and np.isfinite(b_fit) and b_fit != 0:
            a, b = a_fit, b_fit

    # 3) final per-gene meta on the eQTL scale (use FE here; switch to RE if you like)
    meta_rows = []
    per_gene = zip(wide["gene"], wide["beta_eqtl"], wide["se_eqtl"],
                   wide["beta_pqtl"], wide["se_pqtl"])
    for g, b_e, s_e, b_p, s_p in per_gene:
        estimates = []
        ses = []
        labels_used = []
        if pd.notna(b_e) and pd.notna(s_e):
            estimates.append(b_e)
            ses.append(s_e)
            labels_used.append("eQTL")
        if pd.notna(b_p) and pd.notna(s_p):
            # pQTL estimates always enter on the eQTL scale.
            b_t, s_t = apply_linear_transform(b_p, s_p, a, b)
            estimates.append(b_t)
            ses.append(s_t)
            labels_used.append("pQTL→eQTLscale")

        if not estimates:
            continue  # no info

        if len(estimates) == 1:
            # keep singleton
            meta_beta, meta_se = float(estimates[0]), float(ses[0])
        else:
            meta_beta, meta_se, _, _ = ivw_fixed(
                np.array(estimates, float), np.array(ses, float)
            )

        meta_rows.append({
            "gene": g,
            "meta_beta_common": meta_beta,
            "meta_se_common": meta_se,
            "k_final": len(estimates),
            "used": ";".join(labels_used),
            "a_calib": a,
            "b_calib": b
        })

    # Fixed columns keep sort_values("gene") valid even with no rows.
    meta_final = pd.DataFrame(meta_rows, columns=out_columns)
    return meta_final.sort_values("gene").reset_index(drop=True)

# ----------- usage -----------

# Per-study weighted-beta tables (beta estimates from instruments within DHS).
eqtlgen_path = '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_eqtlgen_weighted_beta_table.tsv'
gtex_path    = '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_gtex_weighted_beta_table.tsv'
decode_path  = '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_pqtl_decode_weighted_beta_table.tsv'
ukbppp_path  = '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_pqtl_ukbppp_weighted_beta_table.tsv'

# Run the cross-modal meta-analysis (fixed-effect pooling within platform)
# and save the full per-gene table.
meta_final = cross_modal_meta(
    eqtlgen_path=eqtlgen_path,
    gtex_path=gtex_path,
    decode_path=decode_path,
    ukbppp_path=ukbppp_path,
    use_random_within=False,
)
meta_final.to_csv(
    '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_crossmodal_meta_beta.tsv',
    sep='\t',
    index=False,
)

# For PTRS weights: keep only gene and pooled beta, renamed to the
# `weighted_beta` column name used by the input tables.
meta_beta = meta_final[["gene", "meta_beta_common"]].rename(
    columns={"meta_beta_common": "weighted_beta"}
)

