# Compile .gs file from the precomputed MAGMA z-stat file
- zstat_file: /n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gene_annotation/MAGMA-v108/MAGMA_v108_GENE_10_ZSTAT.txt
- Compute .gs files using different strategies for selecting genes and assigning gene weights.
- Use `submit.compute_score.sh` to calculate the scDRS scores.

In [1]:
%load_ext lab_black
import os, sys

In [2]:
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md
import submitit
import os
from os.path import join
import glob
import shutil
import yaml
import pandas as pd
import numpy as np
import itertools
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import subprocess


def zsc2pval(zsc):
    """Convert z-score(s) to one-sided (upper-tail) standard-normal p-value(s).

    Parameters
    ----------
    zsc : float or array-like
        Z-score(s) to convert.

    Returns
    -------
    float or numpy.ndarray
        P(Z >= zsc) for a standard normal Z, with the same shape as `zsc`.
    """
    # Import the submodule explicitly: on older SciPy versions a bare
    # `import scipy` does not make `scipy.stats` available as an attribute.
    from scipy import stats

    # The survival function evaluates the upper tail directly. `1 - cdf(z)`
    # cancels to exactly 0.0 once cdf(z) rounds to 1.0 (z above roughly 8.3),
    # which would turn very significant z-scores into p-values of zero.
    return stats.norm.sf(zsc)

In [3]:
# Download the supplementary-tables workbook, then key the resulting table
# by its "Trait_Identifier" column.
df_trait_info = pd.read_excel(
    "https://www.dropbox.com/s/k4tkz981t4x1km5/supp-tables.xlsx?dl=1",
)
df_trait_info = df_trait_info.set_index("Trait_Identifier")

# Compile gene set

In [5]:
# Gene-selection rules, each written as "<rule>-<gene count>".
method_list = [
    *(f"top-{n}" for n in [100, 500, 1000, 2000]),
    *(f"fdr1-{n}" for n in [2000]),
    *(f"fwer5-{n}" for n in [2000]),
]
# Cross every selection rule with every weighting scheme, giving names of the
# form "<rule>-<gene count>-<weighting>".
method_list = [
    "-".join(pair) for pair in itertools.product(method_list, ["zscore", "uniform"])
]

In [6]:
# Copy each precomputed MAGMA (10kb) gene-set file into ./geneset under its
# short method name, keeping only the traits listed in `df_trait_info`.
# `DataFrame.to_csv` does not create missing directories, so make sure the
# output directory exists before the first write.
os.makedirs("geneset", exist_ok=True)
for method in method_list:
    # `method` has the form "<rule>-<gene count>-<weighting>",
    # e.g. "fdr1-2000-zscore" -> ("fdr1", "2000", "zscore").
    gene_method, gene_n_cap, weight_method = method.split("-")
    # Map the short rule name onto the token used in the source file names:
    # "fdr1" -> "fdr01_cap100n", "fwer5" -> "fwer05_cap100n". "top" is used
    # as is. (The former `startswith("top-")` branch could never match,
    # because splitting on "-" leaves no hyphen in `gene_method`.)
    if gene_method.startswith("fdr"):
        gene_method = gene_method[0:3] + "0" + gene_method[3] + "_cap100n"
    elif gene_method.startswith("fwer"):
        gene_method = gene_method[0:4] + "0" + gene_method[4] + "_cap100n"
    fname = (
        f"magma_10kb_{gene_method}{gene_n_cap}_{weight_method}.all_traits.rv1.gs"
    )
    # `.loc` with the trait index both filters and reorders the rows; it
    # raises KeyError if any listed trait is missing from the source file.
    df_gs = pd.read_csv(
        join("/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gs_file", fname),
        sep="\t",
        index_col=0,
    ).loc[df_trait_info.index]
    df_gs.index.name = "TRAIT"
    df_gs.to_csv(join("geneset", method + ".gs"), sep="\t")

In [7]:
# For every compiled gene-set file, find the traits whose score files are
# still missing and write them out as small batch .gs files under
# geneset/<method>.gs.batch/.
batch_size = 1  # upper bound on the number of traits per batch file
for method in method_list:
    df = pd.read_csv(f"geneset/{method}.gs", sep="\t")
    # A trait counts as finished only when its score file exists in both the
    # "<method>.uniform" and "<method>.vs" score directories.
    is_done = np.array(
        [
            all(
                os.path.exists(
                    join(f"score_file/{method}.{suffix}", trait + ".score.gz")
                )
                for suffix in ["uniform", "vs"]
            )
            for trait in df["TRAIT"]
        ],
        dtype=bool,
    )
    # Boolean selection keeps column dtypes intact, unlike rebuilding the
    # frame from per-row Series with concat(...).T.
    todo_df = df.loc[~is_done]

    # Always drop batch files left over from a previous run, even when
    # nothing remains to be scored.
    batch_dir = join("geneset", f"{method}.gs.batch")
    if os.path.exists(batch_dir):
        shutil.rmtree(batch_dir)
    if todo_df.empty:
        continue

    os.makedirs(batch_dir)
    n_batches = int(np.ceil(todo_df.shape[0] / batch_size))
    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes. The
    # resulting partition is the same.
    row_groups = np.array_split(np.arange(todo_df.shape[0]), n_batches)
    for batch_i, rows in enumerate(row_groups):
        todo_df.iloc[rows].to_csv(
            join(batch_dir, f"batch{batch_i}.gs"), sep="\t", index=False
        )

# Varying window size (0kb, 10kb, 50kb) while fixing top 1,000 genes and z-score weights

In [8]:
# For each MAGMA window size, build a gene set with `scdrs munge-gs`
# (z-score weights, --n-max 1000), then write batch .gs files for the traits
# whose score files are still missing.
# Neither `to_csv` nor the --out-file path below is known to create missing
# directories, so create both output directories up front.
os.makedirs("tmp", exist_ok=True)
os.makedirs("geneset", exist_ok=True)
batch_size = 1  # upper bound on the number of traits per batch file
for window in [0, 10, 50]:
    method = f"top{window}kb-1000-zscore"
    zstat_file = (
        "/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gene_annotation/MAGMA-v108/"
        f"MAGMA_v108_GENE_{window}_ZSTAT_for_scDRS.txt"
    )
    # Keep only the columns of the traits listed in `df_trait_info`; missing
    # z-scores are written as the literal string "NA".
    df_zstat = pd.read_csv(zstat_file, sep="\t")[df_trait_info.index]
    df_zstat.to_csv(f"tmp/window_{window}.tsv", sep="\t", na_rep="NA")

    # Pass the arguments as a list (no shell), so the paths are never
    # re-parsed by a shell. The argv is the same as the former
    # whitespace-joined string. A non-zero exit raises CalledProcessError.
    cmd = [
        "scdrs",
        "munge-gs",
        "--zscore-file",
        f"tmp/window_{window}.tsv",
        "--out-file",
        f"geneset/{method}.gs",
        "--weight",
        "zscore",
        "--n-max",
        "1000",
    ]
    subprocess.check_output(cmd)

    df = pd.read_csv(f"geneset/{method}.gs", sep="\t")
    # A trait counts as finished only when its score file exists in both the
    # "<method>.uniform" and "<method>.vs" score directories.
    is_done = np.array(
        [
            all(
                os.path.exists(
                    join(f"score_file/{method}.{suffix}", trait + ".score.gz")
                )
                for suffix in ["uniform", "vs"]
            )
            for trait in df["TRAIT"]
        ],
        dtype=bool,
    )
    # Boolean selection keeps column dtypes intact, unlike rebuilding the
    # frame from per-row Series with concat(...).T.
    todo_df = df.loc[~is_done]

    # Always drop batch files left over from a previous run, even when
    # nothing remains to be scored.
    batch_dir = join("geneset", f"{method}.gs.batch")
    if os.path.exists(batch_dir):
        shutil.rmtree(batch_dir)
    if todo_df.empty:
        continue

    os.makedirs(batch_dir)
    n_batches = int(np.ceil(todo_df.shape[0] / batch_size))
    # Split row positions rather than the DataFrame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes. The
    # resulting partition is the same.
    row_groups = np.array_split(np.arange(todo_df.shape[0]), n_batches)
    for batch_i, rows in enumerate(row_groups):
        todo_df.iloc[rows].to_csv(
            join(batch_dir, f"batch{batch_i}.gs"), sep="\t", index=False
        )