# Calculate scDRS scores for different parameters

In [1]:
%load_ext lab_black
import os, sys

In [2]:
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md
import submitit
import os
from os.path import join
import glob
import shutil
import yaml
import pandas as pd

In [3]:
def process_magma(window_size):
    """Rank the rows of a MAGMA z-score table separately for every trait.

    Reads the tab-separated ``MAGMA_v108_GENE_{window_size}_ZSTAT.txt`` table
    and, for each of its columns, orders the row labels by decreasing value
    after discarding missing entries.

    Parameters
    ----------
    window_size : int
        Window size selecting the z-score file; must be 0, 10 or 50.

    Returns
    -------
    dict
        Maps each column name of the table (one per trait) to the list of
        index labels sorted from the highest to the lowest value.
        NOTE(review): the index labels are presumably gene identifiers taken
        from the file's row labels -- confirm against the file layout.
    """
    assert window_size in [0, 10, 50]
    zstat_file = f"/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/gene_annotation/MAGMA-v108/MAGMA_v108_GENE_{window_size}_ZSTAT.txt"
    zstat = pd.read_csv(zstat_file, sep="\t")

    def _ranked_labels(column):
        # Missing values are dropped first so they never enter the ranking.
        observed = zstat[column].dropna()
        return observed.sort_values(ascending=False).index.tolist()

    return {trait: _ranked_labels(trait) for trait in zstat.columns}

In [4]:
# Parse ../url_dict.yaml into `url_dict` (safe_load: no arbitrary object construction).
with open("../url_dict.yaml") as f:
    url_dict = yaml.safe_load(f)

In [5]:
# Trait identifiers: the "Trait_Identifier" column of the first sheet of the
# supplementary-tables workbook, fetched from Dropbox, as a NumPy array.
list_trait = pd.read_excel(
    "https://www.dropbox.com/s/qojbzu5zln33j7f/supp_tables.xlsx?dl=1", sheet_name=0
)["Trait_Identifier"].values

In [6]:
# Gene location table; the file is downloaded from the MAGMA website.
# It has no header row and is whitespace-delimited; only the columns at
# positions 1, 2, 3 and 5 are kept and named CHR, START, STOP and GENE.
gene_loc_path = "../data/gene_loc/NCBI37.3.gene.loc"
gene_loc = pd.read_csv(
    gene_loc_path,
    # `delim_whitespace=True` is deprecated since pandas 2.2; a whitespace
    # regex separator is the documented equivalent.
    sep=r"\s+",
    header=None,
    usecols=[1, 2, 3, 5],
    names=["CHR", "START", "STOP", "GENE"],
)

In [7]:
# For every MAGMA window size, write one text file per trait under
# ./geneset/{window_size}kb/ holding the ranked list returned by process_magma.
for window_size in [0, 10, 50]:
    out_dir = f"./geneset/{window_size}kb"
    # exist_ok makes reruns idempotent and avoids the check-then-create race.
    os.makedirs(out_dir, exist_ok=True)
    dic_ranked_gene_list = process_magma(window_size)
    for trait in list_trait:
        with open(join(out_dir, f"{trait}.txt"), "w") as f:
            # One entry per line, no trailing newline. `write` is used because
            # the payload is a single string; `writelines` would iterate it
            # character by character.
            f.write("\n".join(dic_ranked_gene_list[trait]))

In [8]:
# Gene-set configurations, named "{window_size}kb.{n_gene}".
# The 10kb window is evaluated at four gene-set sizes; the 0kb and 50kb
# windows only at 1000 genes. Dict insertion order fixes the order of the names.
_GS_GRID = {
    10: (100, 500, 1000, 2000),
    0: (1000,),
    50: (1000,),
}
list_gs_name = [
    f"{window_size}kb.{n_gene}"
    for window_size, gene_counts in _GS_GRID.items()
    for n_gene in gene_counts
]

In [9]:
# Build one tab-separated .gs table per configuration: a TRAIT column and a
# GENESET column holding the comma-joined first `top_n_gene` entries of the
# trait's ranked list file.
for gs_name in list_gs_name:
    # "10kb.1000" -> window "10" (the trailing "kb" is cut off) and count 1000.
    window_label, count_label = gs_name.split(".")
    window_size = window_label[:-2]
    top_n_gene = int(count_label)
    print(window_size, top_n_gene)

    trait_names = []
    geneset_strings = []
    for trait in list_trait:
        file = f"./geneset/{window_size}kb/{trait}.txt"
        with open(file) as f:
            gene_list = [line.strip() for line in f]
        trait_names.append(trait)
        geneset_strings.append(",".join(gene_list[:top_n_gene]))

    df = pd.DataFrame({"TRAIT": trait_names, "GENESET": geneset_strings})
    df.to_csv(f"geneset/{window_size}kb.{top_n_gene}.gs", sep="\t", index=False)

10 100
10 500
10 1000
10 2000
0 1000
50 1000


In [16]:
# For every gene-set configuration, find the traits that do not yet have a
# score file for DATASET and split them into per-batch .gs files.
DATASET = "tms_facs"
for gs_name in list_gs_name:
    print(gs_name)
    # "10kb.1000" -> window "10" (the trailing "kb" is cut off) and count "1000".
    window_size, top_n_gene = gs_name.split(".")
    window_size = window_size[:-2]
    df = pd.read_csv(f"geneset/{window_size}kb.{top_n_gene}.gs", sep="\t")

    # Keep only the traits whose "<TRAIT>.score.gz" file does not exist yet.
    score_dir = f"{DATASET}/score_file/{window_size}kb.{top_n_gene}"
    is_missing = pd.Series(
        [
            not os.path.exists(join(score_dir, f"{trait}.score.gz"))
            for trait in df["TRAIT"]
        ],
        index=df.index,
        dtype=bool,
    )
    todo_df = df.loc[is_missing]

    batch_size = 1
    # generate batch gs
    batch_dir = join(f"{DATASET}/geneset", f"{window_size}kb.{top_n_gene}.gs.batch")
    # Batches left over from a previous run are always removed, even when
    # nothing remains to be done.
    if os.path.exists(batch_dir):
        shutil.rmtree(batch_dir)

    if todo_df.empty:
        continue

    os.makedirs(batch_dir)

    # Consecutive chunks of at most `batch_size` rows. Plain positional slicing
    # is used so the cell does not depend on numpy, which this notebook never
    # imports; with batch_size = 1 this is one trait per batch file.
    for batch_i, start in enumerate(range(0, len(todo_df), batch_size)):
        batch_df = todo_df.iloc[start : start + batch_size]
        batch_df.to_csv(join(batch_dir, f"batch{batch_i}.gs"), sep="\t", index=False)

10kb.100
10kb.500
10kb.1000
10kb.2000
0kb.1000
50kb.1000
