In [1]:
import glob
import os
from os.path import join

import pandas as pd
import scanpy as sc
import submitit
from scdrs.util import test_gearysc
from tqdm import tqdm

In [2]:
# Root directory under which score files and per-tissue h5ad files are looked up.
DATA_ROOT_DIR = "/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data"

# Location of the supplementary-table workbook.
URL_SUPP_TABLE = "https://www.dropbox.com/s/qojbzu5zln33j7f/supp_tables.xlsx?dl=1"

# Fetch and parse the workbook once; passing a list to `sheet_name` returns a
# dict keyed by the requested sheet positions, so the remote file is not
# downloaded a second time for the second sheet.
_supp_tables = pd.read_excel(URL_SUPP_TABLE, sheet_name=[0, 1])
df_trait_info = _supp_tables[0]  # first sheet
df_celltype_info = _supp_tables[1]  # second sheet

# Trait identifiers; one job is submitted per entry further down.
trait_list = df_trait_info["Trait_Identifier"].values

In [3]:
DATA_PATH = "/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data"

# Dataset key -> absolute path of the corresponding raw h5ad object.
dict_dset = {
    dset_name: join(DATA_PATH, rel_path)
    for dset_name, rel_path in (
        (
            "tms_facs",
            "tabula_muris_senis/tabula-muris-senis-facs-official-raw-obj.h5ad",
        ),
        (
            "tms_droplet",
            "tabula_muris_senis/tabula-muris-senis-droplet-official-raw-obj.h5ad",
        ),
        (
            "ts_facs",
            "single_cell_data/tabula_sapiens/obj_smartseq2_raw.h5ad",
        ),
    )
}

# Dataset key -> the `.obs` table of that dataset (only the per-cell
# annotations are kept; the AnnData object itself is not retained).
dict_df_obs = {
    dset_name: sc.read_h5ad(h5ad_path).obs
    for dset_name, h5ad_path in dict_dset.items()
}

# Tabula Muris

In [8]:
import os.path

# submitit executor for the Tabula Muris jobs, using "submitit_log/" as its folder.
executor = submitit.AutoExecutor(folder="submitit_log/")
executor.update_parameters(
    slurm_partition="shared",
    mem_gb=16,
    timeout_min=240,
)

def tms_wrapper(dset, tissue_list, trait):
    """Run `test_gearysc` for one trait on each tissue of a Tabula Muris Senis dataset.

    For every tissue, the per-tissue processed h5ad file is loaded, restricted
    to the cells that have a complete row in the trait's full-score table, and
    passed to `test_gearysc` grouped by "cell_ontology_class". The result is
    written to ``./gearysc/{dset}/{trait}.{tissue}.csv``. Tissues whose output
    file already exists are skipped, so an interrupted run can be resumed.

    Parameters
    ----------
    dset : str
        Dataset key (e.g. "tms_facs" or "tms_droplet"). The full key selects
        the score directory and the output directory; the part after the
        underscore selects the per-tissue h5ad file.
    tissue_list : iterable of str
        Tissue names to process.
    trait : str
        Trait identifier; selects ``{trait}.full_score.gz`` in the score directory.
    """
    # Load the full score table for this trait (index = first column).
    score_dir = join(DATA_ROOT_DIR, f"score_file/score.{dset}_with_cov.magma_10kb_1000")
    df_score_full = pd.read_csv(
        join(score_dir, f"{trait}.full_score.gz"), sep="\t", index_col=0
    )

    # Make sure the output directory exists; `to_csv` does not create it.
    out_dir = f"./gearysc/{dset}"
    os.makedirs(out_dir, exist_ok=True)

    for tissue in tissue_list:
        out_file = join(out_dir, f"{trait}.{tissue}.csv")
        if os.path.exists(out_file):
            continue
        print(tissue)
        adata = sc.read_h5ad(
            join(
                DATA_ROOT_DIR,
                f"tabula_muris_senis/tabula-muris-senis-{dset.split('_')[1]}-processed-official-annotations-{tissue}.h5ad",
            )
        )
        # Align scores to the cell order of `adata`; `dropna` removes both the
        # cells without a score row and any row containing missing values.
        df_score_full_tissue = df_score_full.reindex(adata.obs.index).dropna()
        # Subset `adata` to exactly the surviving cells so both inputs cover the
        # same cells in the same order.
        # NOTE(review): presumably `test_gearysc` expects matching indices — confirm.
        adata = adata[adata.obs.index.isin(df_score_full_tissue.index)]
        df_rls = test_gearysc(adata, df_score_full_tissue, "cell_ontology_class")
        df_rls.to_csv(out_file)

# Submit one job array per Tabula Muris dataset (one job per trait) and keep
# the job handles of *all* datasets rather than only the last one submitted.
jobs = []
for dset in ["tms_facs", "tms_droplet"]:
    # Resolve the tissue names up front and bind them (and `dset`) as lambda
    # defaults, so the submitted callable carries only these small values
    # instead of referencing the global `dict_df_obs` tables.
    tissue_list = list(dict_df_obs[dset].tissue.unique())
    jobs.extend(
        executor.map_array(
            lambda trait, dset=dset, tissue_list=tissue_list: tms_wrapper(
                dset, tissue_list, trait
            ),
            trait_list,
        )
    )

# Tabula Sapiens

In [40]:
# Directory that receives the Tabula Sapiens result tables.
out_dir = "./gearysc/ts_facs"

# submitit executor for the Tabula Sapiens jobs, using "submitit_log/" as its folder.
executor = submitit.AutoExecutor(folder="submitit_log/")
executor.update_parameters(
    slurm_partition="serial_requeue",
    mem_gb=16,
    timeout_min=120,
)

def wrapper(trait):
    """Run `test_gearysc` for one trait on each tissue of the Tabula Sapiens object.

    The raw Tabula Sapiens h5ad object is loaded once; for every value of
    ``adata.obs.tissue`` the cells of that tissue that also appear in the
    trait's full-score table are passed to `test_gearysc`, grouped by
    "cell_ontology_class". Each result is written to
    ``{out_dir}/{trait}.{tissue}.csv`` (`out_dir` is the module-level output
    directory). Tissues whose output file already exists are skipped, matching
    `tms_wrapper`, so a requeued job does not redo finished work.

    Parameters
    ----------
    trait : str
        Trait identifier; selects ``{trait}.full_score.gz`` in the score directory.
    """
    # Load the full score table for this trait (index = first column).
    score_dir = join(DATA_ROOT_DIR, "score_file/score.ts_facs_with_cov.magma_10kb_1000")
    df_score_full = pd.read_csv(
        join(score_dir, f"{trait}.full_score.gz"), sep="\t", index_col=0
    )
    # Same file as before, now derived from DATA_ROOT_DIR instead of a second
    # hard-coded absolute path.
    adata = sc.read_h5ad(
        join(DATA_ROOT_DIR, "single_cell_data/tabula_sapiens/obj_smartseq2_raw.h5ad")
    )

    # Make sure the output directory exists; `to_csv` does not create it.
    os.makedirs(out_dir, exist_ok=True)

    # Which cells have a score row does not depend on the tissue, so compute
    # the mask once instead of once per tissue.
    has_score = adata.obs.index.isin(df_score_full.index)

    for tissue in list(adata.obs.tissue.unique()):
        out_file = join(out_dir, f"{trait}.{tissue}.csv")
        if os.path.exists(out_file):
            continue
        adata_tissue = adata[(adata.obs.tissue == tissue) & has_score]
        df_score_full_tissue = df_score_full.reindex(adata_tissue.obs.index)
        df_rls = test_gearysc(adata_tissue, df_score_full_tissue, "cell_ontology_class")
        df_rls.to_csv(out_file)
        
# Submit one job per trait; `wrapper` already takes the trait as its only
# argument, so it is passed directly rather than through a forwarding lambda.
jobs = executor.map_array(wrapper, trait_list)