In [1]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import os
from os.path import join
import time
import argparse

# inhouse tools
import scdrs

# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Constants
# Root directory of the shared scDRS data; every other path is derived from it.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
# Directory that receives the simulation gene-set files written by this notebook.
OUT_PATH = f'{DATA_PATH}/simulation_data'
# Raw-count .h5ad object that is loaded and filtered below.
H5AD_FILE = f'{DATA_PATH}/tabula_muris_senis/tabula-muris-senis-facs-official-raw-obj.h5ad'

In [3]:
# Load the .h5ad file and apply basic QC filters.
adata = read_h5ad(H5AD_FILE)
# Both filters modify `adata` in place (their return values are not used).
# Cells are filtered first (min_genes=500), then genes (min_cells=200), so the
# gene filter is evaluated on the cells that survived the cell filter.
sc.pp.filter_cells(adata, min_genes=500)
sc.pp.filter_genes(adata, min_cells=200)
# Shape after filtering, reported as (n_cells, n_genes).
print('# H5AD_FILE loaded: ', adata.shape)

# H5AD_FILE loaded:  (110823, 18383)


### Generate .gs files 

In [6]:
# Config
# Gene-set sizes to simulate and the number of random replicates drawn per size.
GS_SIZE_LIST = [100, 500, 1000, 2000]
N_REP = 100
# Size of each "top genes" pool: 25% of the genes that remain after filtering.
N_GENE_SELECT = int(0.25*adata.shape[1])

# Work on a normalized copy so the filtered raw counts in `adata` stay untouched.
adata_norm = adata.copy()
# NOTE(review): scanpy documents `normalize_per_cell` as deprecated in favor of
# `normalize_total`; left as-is because switching could change the generated gene sets.
sc.pp.normalize_per_cell(adata_norm, counts_per_cell_after=1e4)
sc.pp.log1p(adata_norm)

# Unadjusted gene statistics. GENE_STATS is read from `adata_norm.uns` right after
# the preprocess call, so it is presumably populated by that call.
scdrs.pp.preprocess(adata_norm)
# Snapshot with .copy() before preprocess is run again on the same object below.
df_gene = adata_norm.uns['SCDRS_PARAM']['GENE_STATS'].copy()
# 'bvar' = 'var' minus 'var_tech' (presumably total minus technical variance — confirm
# against the scdrs documentation).
df_gene['bvar'] = df_gene['var'] - df_gene['var_tech']

# Same statistics, recomputed with adj_prop='cell_ontology_class'.
# NOTE(review): this call goes through `scdrs.preprocess` while the one above uses
# `scdrs.pp.preprocess`; presumably the same function — confirm and unify.
scdrs.preprocess(adata_norm, adj_prop='cell_ontology_class')
df_gene_adj = adata_norm.uns['SCDRS_PARAM']['GENE_STATS'].copy()
df_gene_adj['bvar'] = df_gene_adj['var'] - df_gene_adj['var_tech']

In [16]:
# Candidate gene pools, keyed by pool name. Insertion order is significant: it is the
# order in which the pools are iterated when the gene sets are sampled.

# All genes
dic_gs_all = {'all': sorted(adata.var_names)}

# Each remaining pool is the top N_GENE_SELECT genes of one statistics table ranked by
# one column, stored in sorted (alphabetical) order:
#   'mean' -> highly-expressed genes
#   'var'  -> highly-variable genes
#   'bvar' -> over-dispersed genes
# The 'adj_' pools use the adjusted statistics table.
pool_specs = [
    ('highmean', df_gene, 'mean'),
    ('highvar', df_gene, 'var'),
    ('highbvar', df_gene, 'bvar'),
    ('adj_highmean', df_gene_adj, 'mean'),
    ('adj_highvar', df_gene_adj, 'var'),
    ('adj_highbvar', df_gene_adj, 'bvar'),
]
for pool_name, df_stats, stat_col in pool_specs:
    ranked_genes = df_stats.sort_values(stat_col, ascending=False).index
    dic_gs_all[pool_name] = sorted(ranked_genes[:N_GENE_SELECT])

In [17]:
# Randomly sample genes
# For every gene pool and every gene-set size, draw N_REP random gene sets (without
# replacement) and write them to '<pool>_ngene<size>.gs' as a tab-separated table with
# columns TRAIT and GENESET (comma-joined gene names). The name of every file written
# is also appended to simu_list.rv1.txt, one per line.
gs_dir = OUT_PATH+'/gs_file.rv1'
# Create the output directory up front so a fresh run does not fail on the first to_csv.
os.makedirs(gs_dir, exist_ok=True)

# `with` guarantees the list file is closed even if sampling or writing raises.
with open(OUT_PATH+'/simu_list.rv1.txt', 'w') as f:
    for suffix in dic_gs_all.keys():
        for n_gene in GS_SIZE_LIST:
            gs_name = '%s_ngene%d'%(suffix, n_gene)
            # Collect rows in a list and build the frame once, instead of growing a
            # DataFrame one .loc assignment at a time.
            gs_rows = []
            for i_rep in np.arange(N_REP):
                # The seed is reset to the replicate index, so replicate i starts from
                # the same RNG state for every pool and size. Kept as-is so regenerated
                # files stay identical to previously generated ones.
                np.random.seed(i_rep)
                temp_list = np.random.choice(dic_gs_all[suffix], size=n_gene, replace=False)
                gs_rows.append(['%s_rep%d'%(gs_name,i_rep), ','.join(temp_list)])
            df_gs = pd.DataFrame(gs_rows, columns=['TRAIT', 'GENESET'])
            df_gs.to_csv(gs_dir+'/%s.gs'%gs_name, sep='\t', index=False)
            f.write(gs_name+'\n')

In [18]:
# Compare with old gene set
# For every .gs file in GS_PATH that also exists in the reference directory, count the
# entries that appear in the new gene set but not in the reference gene set, summed over
# all traits in the file. A printed 0 means no new-only entries were found.
GS_PATH = OUT_PATH+'/gs_file.rv1'
GA_PATH_REF = OUT_PATH+'/gs_file'
for gs in os.listdir(GS_PATH):
    # Only .gs files that have a counterpart in the reference directory are compared.
    if not gs.endswith('.gs'):
        continue
    if not os.path.exists(GA_PATH_REF+'/'+gs):
        continue
    df_gs = pd.read_csv(GS_PATH+'/'+gs, sep='\t', index_col=0)
    df_gs_ref = pd.read_csv(GA_PATH_REF+'/'+gs, sep='\t', index_col=0)
    v_dif = []
    for trait_id in df_gs.index:
        genes_new = set(df_gs.loc[trait_id,'GENESET'].split(','))
        genes_ref = set(df_gs_ref.loc[trait_id,'GENESET'].split(','))
        # One-sided difference: entries only in the new file.
        v_dif.append(len(genes_new-genes_ref))
    print(gs, np.array(v_dif).sum())

highbvar_ngene500.gs 0
all_ngene1000.gs 0
highbvar_ngene1000.gs 0
highmean_ngene500.gs 0
highbvar_ngene100.gs 0
highmean_ngene1000.gs 0
all_ngene100.gs 0
highvar_ngene100.gs 0
highvar_ngene1000.gs 0
highvar_ngene500.gs 0
all_ngene500.gs 0
highmean_ngene100.gs 0


In [19]:
# Generate .weighted.gs files for each .gs file
# Load the inputs used to attach continuous weights to the gene sets:
#   DF_HOM / DIC_MAP_H2M : human -> mouse gene symbol mapping
#   DF_MAGMA             : MAGMA gene-level table (file name suggests z-statistics),
#                          restricted to the traits listed in trait_info.tsv
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
# Build the homolog path from DATA_PATH like the other reads, instead of repeating the root.
DF_HOM = pd.read_csv(DATA_PATH+'/gene_annotation/mouse_human_homologs.txt', sep='\t')
# If a human symbol occurs more than once, the last mouse symbol wins.
DIC_MAP_H2M = dict(zip(DF_HOM['HUMAN_GENE_SYM'], DF_HOM['MOUSE_GENE_SYM']))
DF_MAGMA = pd.read_csv(DATA_PATH+'/gene_annotation/MAGMA-v108/MAGMA_v108_GENE_10_ZSTAT.txt', sep='\t')
DF_TRAIT_INFO = pd.read_csv(DATA_PATH+'/supp_table/trait_info.tsv', sep='\t')

# Keep only the trait columns named in trait_info and replace missing values with 0.
# fillna returns a new frame, so no explicit copy or in-place mutation is needed.
DF_MAGMA = DF_MAGMA[DF_TRAIT_INFO['Trait_Identifier']].fillna(0)

In [20]:
# Write .gs file with continuous weights
# For every gene-set file named in simu_list.rv1.txt, write a '<name>.weighted.gs'
# companion in which each gene is annotated as 'gene:weight'.
with open(OUT_PATH+'/simu_list.rv1.txt', 'r') as f:
    SIMU_LIST = [line.strip() for line in f]

for gs_name in SIMU_LIST:
    df_gs = pd.read_csv(OUT_PATH+'/gs_file.rv1/%s.gs'%gs_name, sep='\t')
    df_gs.index = df_gs['TRAIT']
    weighted_genesets = []
    for i_trait, trait in enumerate(df_gs.index):
        gene_list = df_gs.loc[trait, 'GENESET'].split(',')

        # Seed with the row position so the weights are reproducible per trait.
        # The order of the two random calls below (choice, then permutation) must not change.
        np.random.seed(i_trait)
        trait_gwas = np.random.choice(DF_MAGMA.columns)
        # Take the largest len(gene_list) values of the randomly chosen MAGMA column...
        df_sorted = DF_MAGMA[[trait_gwas]].sort_values(trait_gwas, ascending=False)
        top_values = df_sorted[trait_gwas].values[:len(gene_list)]
        # ...and assign them to the genes in random order.
        gene_weights = np.random.permutation(top_values)

        weighted_entries = [f"{g}:{w:.5g}" for g, w in zip(gene_list, gene_weights)]
        weighted_genesets.append(",".join(weighted_entries))
    df_gs_w = pd.DataFrame({"TRAIT": list(df_gs.index), "GENESET": weighted_genesets})
    df_gs_w.to_csv(OUT_PATH+'/gs_file.rv1/%s.weighted.gs'%gs_name, sep='\t', index=False)

In [21]:
# Compare with old gene set
# Same check as for the unweighted files, against the 'gs_file.rv1.011622' reference
# directory. Every file ending in '.gs' is included, which also covers the
# '.weighted.gs' files; for those the compared entries are the comma-separated
# 'gene:weight' strings. A printed 0 means no new-only entries were found.
GS_PATH = OUT_PATH+'/gs_file.rv1'
GA_PATH_REF = OUT_PATH+'/gs_file.rv1.011622'
for gs in os.listdir(GS_PATH):
    # Only .gs files that have a counterpart in the reference directory are compared.
    if not gs.endswith('.gs'):
        continue
    if not os.path.exists(GA_PATH_REF+'/'+gs):
        continue
    df_gs = pd.read_csv(GS_PATH+'/'+gs, sep='\t', index_col=0)
    df_gs_ref = pd.read_csv(GA_PATH_REF+'/'+gs, sep='\t', index_col=0)
    v_dif = []
    for trait_id in df_gs.index:
        entries_new = set(df_gs.loc[trait_id,'GENESET'].split(','))
        entries_ref = set(df_gs_ref.loc[trait_id,'GENESET'].split(','))
        # One-sided difference: entries only in the new file.
        v_dif.append(len(entries_new-entries_ref))
    print(gs, np.array(v_dif).sum())

highmean_ngene1000.weighted.gs 0
highbvar_ngene100.weighted.gs 0
highbvar_ngene500.gs 0
all_ngene1000.weighted.gs 0
all_ngene1000.gs 0
highbvar_ngene1000.gs 0
highmean_ngene500.gs 0
highvar_ngene1000.weighted.gs 0
highmean_ngene2000.gs 0
all_ngene100.weighted.gs 0
highbvar_ngene1000.weighted.gs 0
highmean_ngene500.weighted.gs 0
highbvar_ngene100.gs 0
highmean_ngene1000.gs 0
all_ngene100.gs 0
highbvar_ngene500.weighted.gs 0
highvar_ngene500.weighted.gs 0
all_ngene2000.gs 0
highmean_ngene2000.weighted.gs 0
highvar_ngene100.gs 0
highvar_ngene100.weighted.gs 0
highvar_ngene2000.gs 0
highvar_ngene1000.gs 0
all_ngene500.weighted.gs 0
highvar_ngene500.gs 0
highmean_ngene100.weighted.gs 0
all_ngene2000.weighted.gs 0
all_ngene500.gs 0
highbvar_ngene2000.gs 0
highbvar_ngene2000.weighted.gs 0
highmean_ngene100.gs 0
highvar_ngene2000.weighted.gs 0
