In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import os
import time
from os.path import join
import glob

import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md
from anndata import read_h5ad

import warnings
# Suppress FutureWarning messages raised after this point.
warnings.simplefilter(action='ignore', category=FutureWarning)

scanpy==1.8.1 anndata==0.7.5 umap==0.4.6 numpy==1.19.4 scipy==1.5.2 pandas==1.3.2 scikit-learn==0.23.2 statsmodels==0.12.0 python-igraph==0.8.3 leidenalg==0.8.2


In [2]:
# Read the first sheet of the supplementary tables and keep the values of its
# "Trait_Identifier" column.
trait_list = pd.read_excel("https://www.dropbox.com/s/qojbzu5zln33j7f/supp_tables.xlsx?dl=1", sheet_name=0)[
    "Trait_Identifier"
].values

# np.savetxt does not create missing directories, and nothing earlier in the
# notebook creates out/, so make sure it exists before writing.
os.makedirs("out", exist_ok=True)
np.savetxt("out/trait_list.txt", trait_list, fmt='%s')

In [None]:
# For each dataset: restrict to genes present in the mouse/human homolog table,
# rank genes per cell type, and write the top-ranked genes (as human gene
# symbols) plus a shared "background" gene set under out/<dataset>/.
DATA_PATH = "/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data"

# number of top-ranked genes kept per cell type
n_top_gene = 1000
df_hom = pd.read_csv(join(DATA_PATH, 'gene_annotation/mouse_human_homologs.txt'), sep='\t')
# mouse gene symbol -> human gene symbol
dic_map = {x:y for x,y in zip(df_hom['MOUSE_GENE_SYM'], df_hom['HUMAN_GENE_SYM'])}

dict_dset = {
    "tms_facs": join(
        DATA_PATH, "tabula_muris_senis/tabula-muris-senis-facs-official-raw-obj.h5ad"
    ),
    "tms_droplet": join(
        DATA_PATH, "tabula_muris_senis/tabula-muris-senis-droplet-official-raw-obj.h5ad"
    ),
    "ts_facs": join(
        DATA_PATH, "single_cell_data/tabula_sapiens/obj_smartseq2_raw.h5ad"
    ),
}

for dset_name in ["ts_facs", "tms_facs", "tms_droplet"]:
    # The tms_* datasets are matched on the mouse symbols of the homolog table
    # and later translated to human symbols; ts_facs is matched on the human
    # symbols directly.
    is_mouse = dset_name in ["tms_facs", "tms_droplet"]
    adata = sc.read_h5ad(dict_dset[dset_name])

    # Keep only genes that appear in the homolog table. Copy explicitly so the
    # writes below operate on a real AnnData object rather than on a view.
    homolog_genes = dic_map.keys() if is_mouse else dic_map.values()
    adata = adata[:, adata.var.index.isin(homolog_genes)].copy()

    # Cell-type label used for grouping and in output file names:
    # whitespace runs replaced by "_" and commas removed.
    adata.obs["ct"] = adata.obs.cell_ontology_class.apply(lambda x: '_'.join(x.split()).replace(",", "")).astype(str)

    sc.pp.filter_cells(adata, min_genes=250)
    sc.pp.filter_genes(adata, min_cells=50)

    # NOTE(review): scanpy documents normalize_per_cell as deprecated in favour
    # of normalize_total; kept as-is so results match the original run.
    sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
    sc.pp.log1p(adata)

    # Drop cell types with 10 or fewer cells remaining after filtering.
    class_counts = adata.obs.cell_ontology_class.value_counts()
    print("Cell-types being removed: ", '\t'.join(class_counts[class_counts <= 10].index.values))
    kept_classes = class_counts[class_counts > 10].index.values

    # Explicit copy: rank_genes_groups stores its results on the object.
    adata = adata[adata.obs.cell_ontology_class.isin(kept_classes)].copy()

    groupby = "ct"
    sc.tl.rank_genes_groups(adata, groupby=groupby, method="t-test_overestim_var")

    # create gene set: for every cell type, the n_top_gene genes with the
    # highest score, stored as a plain list of human gene symbols
    ct_list = adata.obs.ct.unique()
    dic_gene_list = {}
    for ct in ct_list:
        gene_list = sc.get.rank_genes_groups_df(adata, group=ct).sort_values("scores", ascending=False)["names"][0:n_top_gene]
        if is_mouse:
            dic_gene_list[ct] = [dic_map[g] for g in gene_list]
        else:
            dic_gene_list[ct] = list(gene_list)

    # all homolog genes
    dic_gene_list["background"] = list(dic_map.values())

    os.makedirs(f"out/{dset_name}/ldscore/", exist_ok=True)
    for ct in [*ct_list, "background"]:
        np.savetxt(f"out/{dset_name}/ldscore/{ct}.geneset", dic_gene_list[ct], fmt='%s')

    # write relevant list: one line per cell type, holding the cell-type name,
    # a tab, then the cell-type and background file prefixes separated by a comma
    with open(f"out/{dset_name}/ldsc.ldcts", "w") as f:
        f.writelines(
            f"{ct}\tout/{dset_name}/ldscore/{ct}.,out/{dset_name}/ldscore/background.\n"
            for ct in ct_list
        )

    np.savetxt(f"out/{dset_name}/ct_list.txt", [*ct_list, "background"], fmt='%s')