# 31 ci-PTRS bulk expression scoring

**Origin:** `3_1_transcriptome_risk_score_bulk_MR.ipynb`  
**This annotated version was generated on:** 2025-10-13 06:41

**What this notebook does (high level):**  
- Compute causality-informed PTRS (ci-PTRS) by multiplying normalized expression with MR β weights; evaluate discrimination.

**How to use:**  
1. Review the markdown notes before each code cell.  
2. Adjust input/output paths as needed for your environment.  
3. Run cell-by-cell to reproduce artifacts for downstream steps.

---


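**Step 1:** Define the helper functions, load the MR result tables and the GSE123568 expression data, then compute and plot the bulk ci-PTRS for each MR source.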

In [3]:
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import gzip
import re
import os
from statannotations.Annotator import Annotator

# ========== SETTINGS ==========
# Maps a short MR-source key to the tab-separated MR result table for that source.
# The keys are the ones iterated over via `bulk_keys` further down.
# NOTE(review): absolute, machine-specific paths — adjust for your environment.
mr_files = {
    'eqtlgen': '/mnt/f/10_osteo_MR/results_mr_ptrs/MR_result_eqtlgen_osteo.tsv',
    'gtex': '/mnt/f/10_osteo_MR/results_mr_ptrs/MR_result_gtex_osteo.tsv',
    'pqtl_decode': '/mnt/f/10_osteo_MR/results_mr_ptrs/MR_result_pqtl_decode_osteo.tsv',
    'pqtl_ukbppp': '/mnt/f/10_osteo_MR/results_mr_ptrs/MR_result_pqtl_ukb_ppp_osteo.tsv' ,
}





# Output directory for tables and figures; used as a raw string prefix in
# f-strings below, so the trailing slash is required.
outdir = '/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/'
# Created at cell execution time; no error if it already exists.
os.makedirs(outdir, exist_ok=True)

def safe_name(name):
    """Return ``str(name)`` with every slash and whitespace character replaced by '_'."""
    text = str(name)
    return re.sub(r'[\/\s]', '_', text)

# 
def process_mr_table(mr_df, celltype=None):
    """Split an MR result table by method and keep the FDR-significant rows.

    For each of 'MR Egger', 'Inverse variance weighted' and 'Weighted median',
    rows with a missing `pval`, `b` or `se` are dropped, Benjamini-Hochberg
    q-values are computed over the remaining p-values of that method, and only
    rows with q < 0.05 are kept.

    Parameters
    ----------
    mr_df : pandas.DataFrame
        Must provide the columns `method`, `pval`, `b`, `se` (and `celltype`
        when `celltype` is given).
    celltype : optional
        When not None, only rows whose `celltype` equals this value are used.

    Returns
    -------
    dict
        Method name -> filtered DataFrame with an added `qval` column. A method
        with no usable rows before the FDR step is absent from the dict.
    """
    # The cell-type restriction does not depend on the method, so apply it once.
    table = mr_df if celltype is None else mr_df[mr_df['celltype'] == celltype]

    significant = {}
    for method in ['MR Egger', 'Inverse variance weighted', 'Weighted median']:
        rows = table[table['method'] == method].copy()
        rows = rows.dropna(subset=['pval', 'b', 'se'])
        if len(rows) == 0:
            continue
        # multipletests returns (reject, pvals_corrected, ...); only the q-values are needed.
        rows['qval'] = multipletests(rows['pval'], alpha=0.05, method='fdr_bh')[1]
        significant[method] = rows[rows['qval'] < 0.05]

    return significant


def compute_weighted_beta_full(mr_dict, gene_filter=None):
    """Combine per-method MR estimates into one inverse-variance weighted beta per gene.

    Parameters
    ----------
    mr_dict : dict
        Method name ('MR Egger', 'Inverse variance weighted', 'Weighted median')
        -> DataFrame with columns `gene`, `b`, `se`.
    gene_filter : iterable, optional
        When given, only genes contained in it are kept.

    Returns
    -------
    pandas.DataFrame
        One row per gene with columns `gene`, `<label>_beta`, `<label>_se` for
        label in (MREgger, IVW, WM), `weighted_beta` and `weighted_se`.
        Rows are ordered by gene name. An empty, column-less DataFrame is
        returned when no gene qualifies.

    Notes
    -----
    A gene qualifies when it is present in every method table that is
    non-empty. A method that is missing from `mr_dict` or has an empty table
    is skipped in that intersection and contributes NaN columns (previously a
    missing method raised ``KeyError: 'gene'``).
    """
    method_labels = (
        ('MREgger', 'MR Egger'),
        ('IVW', 'Inverse variance weighted'),
        ('WM', 'Weighted median'),
    )

    # 1. Index every non-empty method table by gene (first row per gene wins,
    #    as with the former `.iloc[0]` lookup) so per-gene access is O(1).
    per_method = {}
    for label, method in method_labels:
        table = mr_dict.get(method)
        if table is None or len(table) == 0:
            continue
        per_method[label] = (
            table.dropna(subset=['gene'])
                 .drop_duplicates(subset='gene', keep='first')
                 .set_index('gene')
        )
    if not per_method:
        return pd.DataFrame()

    combined_genes = set.intersection(*(set(t.index) for t in per_method.values()))
    if gene_filter is not None:
        combined_genes &= set(gene_filter)
    if not combined_genes:
        return pd.DataFrame()

    # 2. Collect beta/se per method; sorted() makes the row order reproducible.
    rows = []
    for gene in sorted(combined_genes, key=str):
        row = {'gene': gene}
        betas, ses = [], []
        for label, _ in method_labels:
            table = per_method.get(label)
            if table is not None and gene in table.index:
                beta, se = table.at[gene, 'b'], table.at[gene, 'se']
            else:
                beta, se = np.nan, np.nan
            row[f'{label}_beta'] = beta
            row[f'{label}_se'] = se
            # Only use estimates where both beta and se are present, so the
            # two arrays can never get out of step.
            if not (pd.isna(beta) or pd.isna(se)):
                betas.append(beta)
                ses.append(se)
        if not ses:
            continue

        # 3. Inverse-variance weighting: w = 1 / se^2.
        weights = 1.0 / np.square(np.asarray(ses, dtype=float))
        row['weighted_beta'] = np.sum(np.asarray(betas, dtype=float) * weights) / np.sum(weights)
        row['weighted_se'] = np.sqrt(1.0 / np.sum(weights))
        rows.append(row)

    return pd.DataFrame(rows)


def compute_weighted_beta(mr_dict, gene_filter=None):
    """Inverse-variance weighted beta per gene, pooled over all tables in `mr_dict`.

    Only genes present in all three of 'MR Egger', 'Inverse variance weighted'
    and 'Weighted median' are kept (a missing method therefore yields an empty
    result). For those genes every row of every non-empty table in `mr_dict`
    with non-missing `gene`, `b` and `se` is pooled with weight 1 / se^2.

    Parameters
    ----------
    mr_dict : dict
        Method name -> DataFrame with columns `gene`, `b`, `se`.
    gene_filter : iterable, optional
        When given, only genes contained in it are kept.

    Returns
    -------
    pandas.DataFrame
        Columns `gene`, `weighted_beta`, `n_methods` (number of pooled rows,
        stored as float as before), sorted by gene.
    """
    empty_result = pd.DataFrame(columns=['gene', 'weighted_beta', 'n_methods'])

    required = ('MR Egger', 'Inverse variance weighted', 'Weighted median')
    gene_sets = [set(mr_dict.get(method, pd.DataFrame()).get('gene', [])) for method in required]
    combined_genes = set.intersection(*gene_sets)

    if gene_filter is not None:
        combined_genes &= set(gene_filter)
    if not combined_genes:
        return empty_result

    pooled = pd.concat([df for df in mr_dict.values() if len(df) > 0], ignore_index=True)
    pooled = pooled[pooled['gene'].isin(combined_genes)].dropna(subset=['gene', 'b', 'se'])
    if pooled.empty:
        return empty_result

    # assign() builds a new frame instead of writing into a filtered slice.
    weight = 1.0 / (pooled['se'] ** 2)
    pooled = pooled.assign(weight=weight, weighted_b=pooled['b'] * weight)

    # Plain grouped sums replace groupby.apply, which is deprecated when the
    # applied frame still contains the grouping column.
    grouped = pooled.groupby('gene')
    sums = grouped[['weighted_b', 'weight']].sum()
    weighted_beta = pd.DataFrame({
        'weighted_beta': sums['weighted_b'] / sums['weight'],
        'n_methods': grouped.size().astype(float),
    }).reset_index()
    return weighted_beta
import pandas as pd
from statsmodels.stats.multitest import multipletests

def iut_fdr_sign_consistent(
    mr_df: pd.DataFrame,
    methods=('MR Egger', 'Inverse variance weighted', 'Weighted median'),
    celltype=None,
    fdr=0.05,
    gene_col='gene',
    method_col='method',
    pval_col='pval',
    beta_col='b',
    se_col='se'
) -> pd.DataFrame:
    """
    Intersection-Union Test (IUT) + BH-FDR across genes with strict sign-consistency.

    Steps
    -----
    1) (Optional) subset by `celltype` (ignored when `mr_df` has no `celltype` column).
    2) Keep rows in `methods`, coerce p/b/se to numeric, drop missing.
    3) Deduplicate to one row per (gene, method) by taking the row with the smallest p-value.
    4) Pivot to wide (per-gene columns for each method: p, beta, se).
    5) Keep genes with all methods present; compute p_IUT = max(p_m) across methods.
    6) Apply BH to p_IUT across genes -> q_IUT.
    7) Enforce strict sign-consistency (all betas >0 or all <0).
    8) Compute inverse-variance weighted beta and SE across methods.
    9) robust_hit = (q_IUT < fdr) & sign_consistent.

    Returns
    -------
    DataFrame with columns:
      gene, p_<short>, beta_<short>, se_<short>, p_IUT, q_IUT,
      sign_consistent, consensus_sign, weighted_beta, weighted_se, robust_hit
    sorted by q_IUT (ascending) and then |weighted_beta| (descending).
    The same columns are returned, with zero rows, when no gene has an
    estimate from every requested method (including the case where a method
    is absent from the table altogether).
    """
    methods = list(methods)

    # Helper for short labels in column names
    label_map_default = {
        'MR Egger': 'egger',
        'Inverse variance weighted': 'ivw',
        'Weighted median': 'wm'
    }
    def short_label(m):
        return label_map_default.get(m, m.lower().replace(' ', '_'))

    result_columns = [gene_col]
    for m in methods:
        result_columns += [f"p_{short_label(m)}", f"beta_{short_label(m)}", f"se_{short_label(m)}"]
    result_columns += ['p_IUT', 'q_IUT', 'sign_consistent', 'consensus_sign',
                       'weighted_beta', 'weighted_se', 'robust_hit']

    df = mr_df.copy()

    if celltype is not None and 'celltype' in df.columns:
        df = df[df['celltype'] == celltype]

    # Keep the MR methods of interest
    df = df[df[method_col].isin(methods)].copy()

    # Coerce types and drop missing essentials
    for c in (pval_col, beta_col, se_col):
        df[c] = pd.to_numeric(df[c], errors='coerce')
    df = df.dropna(subset=[gene_col, method_col, pval_col, beta_col, se_col])
    if df.empty:
        return pd.DataFrame(columns=result_columns)

    # One row per (gene, method): choose the row with the smallest p-value
    df = (df.sort_values([gene_col, method_col, pval_col, se_col])
            .groupby([gene_col, method_col], as_index=False)
            .first())

    # Wide pivots. Reindexing to `methods` guarantees one column per requested
    # method (all-NaN if the method never occurs), so a method missing from
    # the table yields "no complete gene" instead of a KeyError.
    def _wide(value_col):
        return (df.pivot(index=gene_col, columns=method_col, values=value_col)
                  .reindex(columns=methods))

    p_wide = _wide(pval_col)
    b_wide = _wide(beta_col)
    se_wide = _wide(se_col)

    # Keep genes with all required methods present
    keep = p_wide.notna().all(axis=1)
    if not keep.any():
        return pd.DataFrame(columns=result_columns)

    p_wide = p_wide.loc[keep]
    b_wide = b_wide.loc[keep]
    se_wide = se_wide.loc[keep]

    out = pd.DataFrame({gene_col: p_wide.index})

    # Attach per-method columns
    for m in methods:
        out[f"p_{short_label(m)}"]    = p_wide[m].values
        out[f"beta_{short_label(m)}"] = b_wide[m].values
        out[f"se_{short_label(m)}"]   = se_wide[m].values

    # IUT p-value = max of per-method p-values (AND rule)
    out["p_IUT"] = p_wide.values.max(axis=1)

    # BH-FDR across genes on p_IUT
    out["q_IUT"] = multipletests(out["p_IUT"].values, method='fdr_bh')[1]

    # Sign consistency: all betas positive OR all negative (strict; a beta of
    # exactly 0 makes the gene inconsistent)
    beta_mat = b_wide.values
    all_pos = (beta_mat > 0).all(axis=1)
    all_neg = (beta_mat < 0).all(axis=1)
    out["sign_consistent"] = all_pos | all_neg
    out["consensus_sign"] = np.where(all_pos, 1, np.where(all_neg, -1, 0))

    # Inverse-variance weighted beta/SE
    w = 1.0 / np.square(se_wide.values)
    wsum = w.sum(axis=1)
    out["weighted_beta"] = (beta_mat * w).sum(axis=1) / wsum
    out["weighted_se"] = np.sqrt(1.0 / wsum)

    # Final robust hit flag
    out["robust_hit"] = (out["q_IUT"] < fdr) & out["sign_consistent"]

    # Sort by q-value, then by |weighted_beta|. The previous code passed
    # `.abs().name` (== 'weighted_beta') and therefore sorted by the signed value.
    # NOTE(review): ties in q are broken largest-effect-first — confirm this is
    # the intended direction.
    out = (out.assign(_abs_weighted_beta=out["weighted_beta"].abs())
              .sort_values(["q_IUT", "_abs_weighted_beta"], ascending=[True, False])
              .drop(columns="_abs_weighted_beta")
              .reset_index(drop=True))
    return out


def load_probe2gene(probe2gene_path):
    """Read the probe -> gene symbol mapping from a gzipped SOFT-style platform table.

    The table is the block between the lines starting with
    '!platform_table_begin' and '!platform_table_end'; the first line after
    the begin marker is the tab-separated header and must contain the columns
    'ID' and 'Gene Symbol'.

    Parameters
    ----------
    probe2gene_path : str or path-like
        Path to the gzip-compressed text file.

    Returns
    -------
    dict
        Probe ID -> first gene symbol (for multi-symbol entries separated by
        ' /// ' only the first is kept). Probes whose symbol is empty or '---'
        and rows too short to contain both columns are skipped.

    Raises
    ------
    ValueError
        If the header lacks 'ID' or 'Gene Symbol' (from ``list.index``).
    """
    probe2gene = {}
    with gzip.open(probe2gene_path, 'rt') as f:
        inside_table = False
        for line in f:
            if not inside_table:
                if line.startswith('!platform_table_begin'):
                    # The header is the line right after the marker; '' if the file ends here.
                    header = [h.strip() for h in next(f, '').rstrip('\r\n').split('\t')]
                    idx_probe = header.index('ID')
                    idx_symbol = header.index('Gene Symbol')
                    min_cols = max(idx_probe, idx_symbol) + 1
                    inside_table = True
                continue
            if line.startswith('!platform_table_end'):
                break
            # Strip only the line ending: stripping all whitespace would drop
            # trailing empty fields and shift/shorten the column list.
            cols = line.rstrip('\r\n').split('\t')
            if len(cols) < min_cols:
                continue
            probe = cols[idx_probe].strip()
            symbol = cols[idx_symbol].strip()
            if symbol and symbol != '---':
                probe2gene[probe] = symbol.split(' /// ')[0]
    return probe2gene

def load_and_process_expression(expr_path, probe2gene_path):
    """Load a probe-level expression matrix and return gene-level, mean-centred values.

    Parameters
    ----------
    expr_path : str or path-like
        Tab-separated matrix read with ``pd.read_csv(..., comment='!', index_col=0)``;
        the first column is used as the probe ID index.
    probe2gene_path : str or path-like
        Annotation file passed to `load_probe2gene`.

    Returns
    -------
    pandas.DataFrame
        Index `gene`, one column per sample. Probes without a gene symbol are
        dropped, probes of the same gene are averaged, and each gene's mean
        across samples is subtracted (centring only, no variance scaling).
    """
    probe2gene = load_probe2gene(probe2gene_path)
    expr = pd.read_csv(expr_path, sep='\t', comment='!', index_col=0)
    expr.index = expr.index.astype(str)

    # Build the annotated frame with assign() rather than adding a column to a
    # boolean-filtered slice (which triggers SettingWithCopyWarning).
    annotated = expr.loc[expr.index.isin(probe2gene)]
    annotated = annotated.assign(gene=annotated.index.map(probe2gene))

    gene_expr = annotated.groupby('gene').mean()
    gene_expr_norm = gene_expr.subtract(gene_expr.mean(axis=1), axis=0)
    return gene_expr_norm

def extract_group_labels(matrix_file):
    """Derive a Control/Disease label per sample from the '!Sample_title' line.

    Parameters
    ----------
    matrix_file : str or path-like
        Gzip-compressed text file; the first line starting with
        '!Sample_title' is used, split on tabs, first field discarded.

    Returns
    -------
    list of str
        One label per sample title, in file order: 'Control' if the title
        contains 'control group', else 'Disease' if it contains
        'disease group' (both case-insensitive), else 'Unknown'.

    Raises
    ------
    ValueError
        If the file has no '!Sample_title' line (previously an
        UnboundLocalError).
    """
    sample_titles = None
    with gzip.open(matrix_file, 'rt') as f:
        for line in f:
            if line.startswith('!Sample_title'):
                sample_titles = line.strip().split('\t')[1:]
                break
    if sample_titles is None:
        raise ValueError(f"No '!Sample_title' line found in {matrix_file}")

    group_labels = []
    for title in sample_titles:
        if re.search(r'control group', title, re.IGNORECASE):
            group_labels.append('Control')
        elif re.search(r'disease group', title, re.IGNORECASE):
            group_labels.append('Disease')
        else:
            group_labels.append('Unknown')
    return group_labels

def compute_ptrs(gene_expr_norm, weighted_beta):
    """Compute one risk score per sample as the beta-weighted sum of expression.

    For sample s: PTRS_s = sum over shared genes g of
    weighted_beta[g] * gene_expr_norm[g, s].

    Parameters
    ----------
    gene_expr_norm : pandas.DataFrame
        Genes in the index, samples in the columns.
    weighted_beta : pandas.DataFrame
        Must contain the columns `gene` and `weighted_beta`. If a gene occurs
        more than once, its first row is used.

    Returns
    -------
    pandas.Series
        Indexed by sample (the columns of `gene_expr_norm`); an empty float
        Series when no gene is shared.
    """
    # One weight per gene, in weight-table order: iterating a set gave a
    # run-dependent summation order, and duplicated genes broke the dot product.
    betas = (
        weighted_beta.dropna(subset=['gene'])
        .drop_duplicates(subset='gene', keep='first')
        .set_index('gene')['weighted_beta']
    )
    betas = betas[betas.index.isin(gene_expr_norm.index)]
    if len(betas) == 0:
        return pd.Series(dtype=float)
    X = gene_expr_norm.loc[betas.index]
    PTRS = X.T.dot(betas)
    return PTRS


def plot_ptrs_bak(ptrs, group_labels, out_prefix):
    """Earlier variant of the PTRS box plot (t-test annotation, PNG output).

    Draws a box plot with overlaid points of `ptrs` per group, annotates the
    Control-vs-Disease comparison with the p-value of an independent two-sample
    t-test when both groups have more than one sample, and saves the figure to
    ``f"{out_prefix}_PTRS_boxplot.png"``.

    `group_labels` is matched to `ptrs` by position (truncated to len(ptrs)).
    """
    from scipy.stats import ttest_ind

    scores = pd.DataFrame({'TRS': ptrs.values, 'Group': group_labels[:len(ptrs)]})

    plt.figure(figsize=(6,5))
    ax = sns.boxplot(x='Group', y='TRS', data=scores, palette='Set2')
    sns.stripplot(x='Group', y='TRS', data=scores, color='k', alpha=0.6, jitter=True, ax=ax)

    control_scores = scores.loc[scores['Group'] == 'Control', 'TRS']
    disease_scores = scores.loc[scores['Group'] == 'Disease', 'TRS']
    enough_samples = len(control_scores) > 1 and len(disease_scores) > 1
    if enough_samples:
        # The p-value is computed here and handed to the annotator (test=None).
        pval = ttest_ind(control_scores, disease_scores).pvalue
        annotator = Annotator(ax, [('Control', 'Disease')], data=scores, x='Group', y='TRS')
        annotator.configure(test=None, text_format='star', loc='outside')
        annotator.set_pvalues([pval])
        annotator.annotate()

    plt.ylabel('PTRS')
    plt.xlabel('')
    plt.tight_layout()
    plt.savefig(f"{out_prefix}_PTRS_boxplot.png", dpi=120)
    plt.close()


# ---- Fix seaborn warning for boxplot ----
def plot_ptrs(ptrs, group_labels, out_prefix, ylim=(-23, 23)):
    """Box plot of PTRS by group with a Mann-Whitney U annotation, saved as PDF.

    Parameters
    ----------
    ptrs : pandas.Series
        One score per sample.
    group_labels : sequence of str
        Labels matched to `ptrs` by position (truncated to len(ptrs));
        'Control' and 'Disease' are the two groups compared.
    out_prefix : str
        The figure is written to ``f"{out_prefix}_PTRS_boxplot.pdf"``.
    ylim : tuple of (low, high) or None, optional
        Y-axis limits; defaults to the previously hard-coded (-23, 23).
        Pass None to let matplotlib choose.

    Side effects: prints the prefix and per-group mean / standard error,
    and writes the PDF.
    """
    # `scipy` itself is never imported in this notebook, so `scipy.stats.sem`
    # raised NameError on a fresh kernel; import the functions explicitly.
    from scipy.stats import mannwhitneyu, sem

    print( 'ptrs', out_prefix )
    trs_df = pd.DataFrame({'TRS': ptrs.values, 'Group': group_labels[:len(ptrs)]})
    plt.figure(figsize=(3,5))
    # hue='Group' + legend=False avoids seaborn's "palette without hue" warning.
    ax = sns.boxplot(x='Group', y='TRS', data=trs_df, palette='Set2', hue='Group', legend=False)
    sns.stripplot(x='Group', y='TRS', data=trs_df, alpha=0.6, jitter=True, ax=ax,
                  palette='dark:k', hue='Group', legend=False)
    # Zero reference line across the full axes width.
    ax.axhline(y=0, color='grey', linestyle='--', label='Reference', linewidth=0.5)

    if ylim is not None:
        ax.set_ylim(*ylim)

    pairs = [('Control', 'Disease')]
    control = trs_df.loc[trs_df['Group'] == 'Control', 'TRS']
    disease = trs_df.loc[trs_df['Group'] == 'Disease', 'TRS']

    print( 'control', 'mean, sem', np.mean( control), sem( control),
           'disease', 'mean, sem', np.mean( disease ), sem( disease ) )
    if len(control) > 1 and len(disease) > 1:
        # Only the Mann-Whitney U p-value was ever displayed (the t-test
        # result was overwritten), so it is the only test computed here.
        pval = mannwhitneyu(control, disease, alternative='two-sided').pvalue

        annotator = Annotator(ax, pairs, data=trs_df, x='Group', y='TRS')
        annotator.configure(test=None, text_format='star', loc='outside')
        annotator.set_pvalues([pval])
        annotator.annotate()

    plt.ylabel('PTRS')
    plt.xlabel('')
    plt.tight_layout()
    plt.savefig(f"{out_prefix}_PTRS_boxplot.pdf" )
    plt.close()




# ========== MAIN PIPELINE ==========

# 1. Load expression

# The expression matrix and the sample titles come from the same series-matrix file.
series_matrix_path = '/mnt/f/10_osteo_MR/datasets/gse123568/GSE123568_series_matrix.txt.gz'

gene_expr_norm = load_and_process_expression(
    series_matrix_path,
    '/mnt/f/10_osteo_MR/datasets/gse123568/GSE123568_family.soft.gz'
)

# 2. Get group labels (matched to the expression columns by position downstream)
group_labels = extract_group_labels(series_matrix_path)


bulk_keys = ['eqtlgen',  'gtex', 'pqtl_ukbppp' , 'pqtl_decode' ]
bulk_gene_sets = []

# 3. Per MR source: select robust genes, save their weights, score and plot.
for key in bulk_keys:
    path = mr_files[key]
    mr_df = pd.read_csv(path, sep='\t')
    mr_df['pval'] = pd.to_numeric(mr_df['pval'], errors='coerce')

    # Gene selection: IUT + BH-FDR with sign consistency across the three MR
    # methods (across all cell types, if any). The earlier per-method FDR
    # selection was computed here and then overwritten, so it has been dropped.
    # Per cell type instead:
    # res_ct = iut_fdr_sign_consistent(mr_df, celltype='Whole_Blood')
    res_all = iut_fdr_sign_consistent(mr_df)
    robust_hits = res_all[res_all['robust_hit']]
    weighted_beta_full = robust_hits[['gene', 'weighted_beta', 'weighted_se']].copy()

    if weighted_beta_full.shape[0] == 0:
        continue
    weighted_beta_full.to_csv(f"{outdir}bulk_{key}_weighted_beta_table.tsv", sep='\t', index=False)

    # Distribution of the selected weights.
    plt.figure(figsize=(5,3))
    sns.histplot(weighted_beta_full['weighted_beta'], color='k')
    plt.title(f'Weighted Beta distribution: {key}')
    plt.xlabel('Weighted Beta')
    plt.tight_layout()
    plt.savefig(f"{outdir}bulk_{key}_weighted_beta_hist.png", dpi=120)
    plt.close()

    ptrs = compute_ptrs(gene_expr_norm, weighted_beta_full[['gene', 'weighted_beta']])

    # Empty when none of the selected genes is present in the expression matrix.
    if len(ptrs) == 0:
        continue
    plot_ptrs(ptrs, group_labels, f"{outdir}bulk_{key}")
    


 




ptrs /mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_eqtlgen
control mean, sem -3.58437355483901 1.2731713667877569 disease mean, sem 1.1947911849463408 0.37182969562053036
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Control vs. Disease: Custom statistical test, P_val:1.884e-03
ptrs /mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_gtex
control mean, sem -0.49500893219760594 0.23421406761708546 disease mean, sem 0.16500297739920317 0.09525830811934556
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Control vs. Disease: Custom statistical test, P_val:1.091e-02
ptrs /mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_pqtl_ukbppp
control mean, sem -1.8672288520182811 0.804758670691447 disease mean, sem 0.622409617339

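**Step 2:** Test the Control-vs-Disease PTRS difference for each MR source (Welch t-test, Mann–Whitney U test, and a label-permutation test) and save the results as TSV.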

In [35]:
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind, mannwhitneyu

def permutation_test_ptrs(ptrs, group_labels, n_perm=10000, alternative='two-sided',
                          stat='mean', random_state=None):
    """
    Label-permutation test on PTRS differences between Control and Disease.

    Only samples labelled 'Control' or 'Disease' with a finite PTRS take part;
    samples with any other label (e.g. 'Unknown') are excluded from both the
    observed statistic and the permutation pool, so permuted groups keep the
    observed group sizes.

    Parameters
    ----------
    ptrs : array-like (Series OK)
        PTRS values per sample.
    group_labels : array-like
        Labels aligned to ptrs by position (truncated to len(ptrs)); expects
        'Control' and 'Disease'.
    n_perm : int
        Number of permutations.
    alternative : {'two-sided','greater','less'}
        Alternative hypothesis. For 'greater', tests Control > Disease.
    stat : {'mean','median'}
        Test statistic = difference (Control - Disease) in mean or median.
    random_state : int or None
        Seed for reproducibility.

    Returns
    -------
    result : dict
        {
          'obs_stat': float,
          'pvalue': float,
          'null_mean': float,
          'null_sd': float,
          'n_perm': int,
          'alternative': str,
          'stat': str,
          'n_control': int,
          'n_disease': int
        }
        obs_stat, pvalue, null_mean and null_sd are NaN when either group has
        fewer than two samples.

    Raises
    ------
    ValueError
        If `alternative` or `stat` is not one of the documented values
        (previously any other value silently meant 'less' / 'mean').
    """
    if alternative not in ('two-sided', 'greater', 'less'):
        raise ValueError(f"alternative must be 'two-sided', 'greater' or 'less', got {alternative!r}")
    if stat not in ('mean', 'median'):
        raise ValueError(f"stat must be 'mean' or 'median', got {stat!r}")

    rng = np.random.default_rng(random_state)

    # Align & clean
    ptrs = np.asarray(pd.Series(ptrs).values, dtype=float)
    groups = np.asarray(group_labels)[:len(ptrs)]
    mask = np.isfinite(ptrs) & pd.notna(groups)
    # Restrict the pool to the two groups under test; original sample order is
    # kept so results are unchanged when no other labels are present.
    mask = mask & ((groups == 'Control') | (groups == 'Disease'))
    x = ptrs[mask]
    g = groups[mask]

    a = x[g == 'Control']
    b = x[g == 'Disease']
    nA, nB = len(a), len(b)
    if nA < 2 or nB < 2:
        return {
            'obs_stat': np.nan, 'pvalue': np.nan, 'null_mean': np.nan, 'null_sd': np.nan,
            'n_perm': n_perm, 'alternative': alternative, 'stat': stat,
            'n_control': nA, 'n_disease': nB
        }

    def _stat(u, v):
        if stat == 'median':
            return np.median(u) - np.median(v)
        return u.mean() - v.mean()

    obs = _stat(a, b)

    # Build null by permuting labels (keep group sizes): the first nA permuted
    # samples play Control, the remaining nB play Disease.
    n = x.size
    stats = np.empty(n_perm, dtype=float)
    for i in range(n_perm):
        perm = rng.permutation(n)
        stats[i] = _stat(x[perm[:nA]], x[perm[nA:]])

    # P-value with +1 correction (the observed labelling counts as one permutation)
    if alternative == 'two-sided':
        pval = (np.sum(np.abs(stats) >= abs(obs)) + 1) / (n_perm + 1)
    elif alternative == 'greater':
        pval = (np.sum(stats >= obs) + 1) / (n_perm + 1)
    else:  # 'less'
        pval = (np.sum(stats <= obs) + 1) / (n_perm + 1)

    return {
        'obs_stat': float(obs),
        'pvalue': float(pval),
        'null_mean': float(stats.mean()),
        'null_sd': float(stats.std(ddof=1)),
        'n_perm': int(n_perm),
        'alternative': alternative,
        'stat': stat,
        'n_control': int(nA),
        'n_disease': int(nB)
    }


def compute_ptrs(gene_expr_norm, weighted_beta):
    """Compute one risk score per sample as the beta-weighted sum of expression.

    For sample s: PTRS_s = sum over shared genes g of
    weighted_beta[g] * gene_expr_norm[g, s].

    Parameters
    ----------
    gene_expr_norm : pandas.DataFrame
        Genes in the index, samples in the columns.
    weighted_beta : pandas.DataFrame
        Must contain the columns `gene` and `weighted_beta`. If a gene occurs
        more than once, its first row is used.

    Returns
    -------
    pandas.Series
        Indexed by sample (the columns of `gene_expr_norm`); an empty float
        Series when no gene is shared.
    """
    # One weight per gene, in weight-table order: iterating a set gave a
    # run-dependent summation order, and duplicated genes broke the dot product.
    betas = (
        weighted_beta.dropna(subset=['gene'])
        .drop_duplicates(subset='gene', keep='first')
        .set_index('gene')['weighted_beta']
    )
    betas = betas[betas.index.isin(gene_expr_norm.index)]
    if len(betas) == 0:
        return pd.Series(dtype=float)
    X = gene_expr_norm.loc[betas.index]
    PTRS = X.T.dot(betas)
    return PTRS


def test_ptrs(ptrs, group_labels, out_prefix, n_perm=10000, random_state=42):
    """Run three Control-vs-Disease tests on PTRS and save them as a one-row TSV.

    Computes group sizes and means, and — when both groups have more than one
    sample — a Welch t-test, a two-sided Mann-Whitney U test and a two-sided
    mean-difference permutation test. Otherwise the test fields stay NaN.

    Parameters
    ----------
    ptrs : array-like
        PTRS values per sample.
    group_labels : array-like
        Labels matched to `ptrs` by position (truncated to len(ptrs)).
    out_prefix : str
        Results are written to ``f"{out_prefix}_PTRS_tests.tsv"``.
    n_perm, random_state
        Forwarded to `permutation_test_ptrs`.

    Returns
    -------
    dict
        The same key/value pairs that are written to the TSV.
    """
    print('ptrs', out_prefix)

    scores = pd.DataFrame({
        'TRS': pd.Series(ptrs).values,
        'Group': np.asarray(group_labels)[:len(ptrs)],
    }).dropna(subset=['TRS', 'Group'])

    control = scores.loc[scores['Group'] == 'Control', 'TRS'].values
    disease = scores.loc[scores['Group'] == 'Disease', 'TRS'].values

    # Start from the "not testable" record; the key order fixes the TSV column order.
    results = {
        'n_control': len(control),
        'n_disease': len(disease),
        'mean_control': float(np.mean(control)) if len(control) else np.nan,
        'mean_disease': float(np.mean(disease)) if len(disease) else np.nan,
        't_stat': np.nan, 't_p': np.nan,
        'mwu_stat': np.nan, 'mwu_p': np.nan,
        'perm_stat(Control_minus_Disease)': np.nan,
        'perm_p': np.nan,
        'perm_null_mean': np.nan,
        'perm_null_sd': np.nan,
        'perm_n': n_perm,
    }

    testable = len(control) > 1 and len(disease) > 1
    if testable:
        # Welch t-test (safer if variances differ)
        welch = ttest_ind(control, disease, equal_var=False, nan_policy='omit')
        mwu = mannwhitneyu(control, disease, alternative='two-sided')
        perm = permutation_test_ptrs(
            ptrs=scores['TRS'].values,
            group_labels=scores['Group'].values,
            n_perm=n_perm,
            alternative='two-sided',
            stat='mean',
            random_state=random_state
        )
        results['t_stat'] = float(welch.statistic)
        results['t_p'] = float(welch.pvalue)
        results['mwu_stat'] = float(mwu.statistic)
        results['mwu_p'] = float(mwu.pvalue)
        results['perm_stat(Control_minus_Disease)'] = perm['obs_stat']
        results['perm_p'] = perm['pvalue']
        results['perm_null_mean'] = perm['null_mean']
        results['perm_null_sd'] = perm['null_sd']
        results['perm_n'] = perm['n_perm']

    # Save a tidy results table next to the plots
    pd.DataFrame([results]).to_csv(f"{out_prefix}_PTRS_tests.tsv", sep='\t', index=False)
    return results


bulk_keys = ['eqtlgen', 'gtex', 'pqtl_ukbppp', 'pqtl_decode']

# Relies on `mr_files`, `outdir`, `gene_expr_norm`, `group_labels`,
# `process_mr_table` and `compute_weighted_beta_full` from the Step 1 cell.
# NOTE(review): genes are selected here via per-method FDR
# (process_mr_table + compute_weighted_beta_full), whereas the Step 1 plots
# use iut_fdr_sign_consistent — confirm which selection is intended.
for key in bulk_keys:
    path = mr_files[key]
    mr_df = pd.read_csv(path, sep='\t')
    mr_df['pval'] = pd.to_numeric(mr_df['pval'], errors='coerce')

    mr_methods = process_mr_table(mr_df)
    weighted_beta_full = compute_weighted_beta_full(mr_methods)
    if weighted_beta_full.shape[0] == 0:
        continue

    ptrs = compute_ptrs(gene_expr_norm, weighted_beta_full[['gene', 'weighted_beta']])
    res = test_ptrs(ptrs, group_labels, f"{outdir}bulk_{key}", n_perm=10000, random_state=42)
    print(key, res['t_p'], res['mwu_p'], res['perm_p'])






ptrs /mnt/f/10_osteo_MR/MR_ready/results/bulk_eqtlgen
eqtlgen 0.060548606703024886 0.04394222286967825 0.006999300069993001
ptrs /mnt/f/10_osteo_MR/MR_ready/results/bulk_gtex
gtex 0.07762353093902911 0.021689933800690418 0.028897110288971104
ptrs /mnt/f/10_osteo_MR/MR_ready/results/bulk_pqtl_ukbppp
pqtl_ukbppp 0.01598241962239166 0.009970028375973411 0.0058994100589941
ptrs /mnt/f/10_osteo_MR/MR_ready/results/bulk_pqtl_decode
pqtl_decode 0.046237745500172074 0.0034952701128578537 0.009299070092990702


**Step 3:** Compute and plot the PTRS using the cross-modal meta-analysed betas (`meta_beta_common`) as gene weights.

In [2]:
# Cross-modal meta-beta: score the same expression data with the
# `meta_beta_common` column of the cross-modal table as gene weights.
# Relies on `gene_expr_norm`, `group_labels`, `outdir`, `compute_ptrs` and
# `plot_ptrs` from the Step 1 cell.
key = 'crossmodal'
path = "/mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_crossmodal_meta_beta.tsv"
mr_df = pd.read_csv(path, sep='\t')

# compute_ptrs expects the weight column to be called `weighted_beta`.
meta_beta = mr_df[["gene","meta_beta_common"]].rename(columns={"meta_beta_common":"weighted_beta"})
ptrs = compute_ptrs(gene_expr_norm, meta_beta[['gene','weighted_beta']])
plot_ptrs(ptrs, group_labels, f"{outdir}bulk_{key}")



ptrs /mnt/f/10_osteo_MR/results_mr_ptrs/PTRS/bulk_crossmodal
control mean, sem -7.87397818112648 2.635503552915691 disease mean, sem 2.624659393708831 0.6829777454416341
p-value annotation legend:
      ns: 5.00e-02 < p <= 1.00e+00
       *: 1.00e-02 < p <= 5.00e-02
      **: 1.00e-03 < p <= 1.00e-02
     ***: 1.00e-04 < p <= 1.00e-03
    ****: p <= 1.00e-04

Control vs. Disease: Custom statistical test, P_val:5.576e-04
