In [4]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import os
from os.path import join
import time
import argparse

# inhouse tools
import scTRS.util as util
import scTRS.data_loader as dl
import scTRS.method as md

# autoreload
%load_ext autoreload
%autoreload 2

In [5]:
# Paths: root data directory, simulation output directory, and the input .h5ad file
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data'
OUT_PATH = f'{DATA_PATH}/simulation_data'
H5AD_FILE = f'{DATA_PATH}/tabula_muris_senis/tabula-muris-senis-facs-official-raw-obj.h5ad'

In [6]:
# Load the .h5ad file at H5AD_FILE and apply basic QC filters.
adata = read_h5ad(H5AD_FILE)
# Drop cells with fewer than 500 expressed genes (adata is filtered in place).
sc.pp.filter_cells(adata, min_genes=500)
# Then drop genes expressed in fewer than 200 of the remaining cells.
sc.pp.filter_genes(adata, min_cells=200)
# Report the shape of adata after filtering.
print('# H5AD_FILE loaded: ', adata.shape)

# H5AD_FILE loaded:  (110823, 18383)


In [12]:
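# # Generate subsampled data
# # NOTE: this whole cell is commented out; the shapes shown in its output are from an earlier run.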
# for n_cell in [1e3,2e3,5e3,1e4,2e4]:
#     np.random.seed(0)
#     ind_select = np.random.choice(np.arange(adata.shape[0]), size=int(n_cell), replace=False)
#     temp_adata = adata[ind_select].copy()
#     temp_adata.write(OUT_PATH+'/tms_facs.ncell_%dk.h5ad'%int(n_cell/1000))
    
#     if n_cell==1e4:
#         temp_adata.obs.to_csv(OUT_PATH+'/tms_facs.ncell_%dk.obs'%int(n_cell/1000), sep='\t')
#         temp_df = pd.DataFrame(index=temp_adata.obs_names, columns=temp_adata.var_names,
#                                data=temp_adata.X.toarray(), dtype=int).T
#         temp_df.to_csv(OUT_PATH+'/tms_facs.ncell_%dk.tsv'%int(n_cell/1000), sep='\t')
#     print(temp_adata.shape)

(1000, 18383)
(2000, 18383)
(5000, 18383)
(10000, 18383)
(20000, 18383)


### Generate .gs files 

In [5]:
# Constants
# Gene-set sizes to simulate and number of random replicates per (pool, size) pair.
GS_SIZE_LIST = [50, 200, 500]
N_REP = 100

# Normalize a copy so that adata itself is left untouched.
adata_norm = adata.copy()
# Scale every cell to 1e4 total counts, then apply log(1 + x).
sc.pp.normalize_per_cell(adata_norm, counts_per_cell_after=1e4)
sc.pp.log1p(adata_norm)
# NOTE(review): presumably adds the per-gene 'mean', 'var' and 'var_tech' columns of
# adata_norm.var that are read when the gene pools are built -- confirm in scTRS.method.
md.compute_stats(adata_norm)

In [6]:
# Build the candidate gene pools that random gene sets are later drawn from.
# The keys of dic_gs_all are used as the prefix of the generated gene-set names.
N_TOP_GENE = 2000  # number of genes kept in every pool except 'all'

dic_gs_all = {}

# All genes in adata
dic_gs_all['all'] = sorted(adata.var_names)

# Highly-expressed genes: top N_TOP_GENE by the 'mean' column of adata_norm.var
dic_gs_all['highexp'] = sorted(
    adata_norm.var.sort_values('mean', ascending=False).index[:N_TOP_GENE])

# Overly-dispersed genes: top N_TOP_GENE by 'var' minus 'var_tech'
temp_df = adata_norm.var.copy()
temp_df['bvar'] = temp_df['var'] - temp_df['var_tech']
dic_gs_all['highbvar'] = sorted(
    temp_df.sort_values('bvar', ascending=False).index[:N_TOP_GENE])

# Male-specific genes (male vs. female).
# n_genes is passed explicitly: the default number of ranked genes returned by
# rank_genes_groups differs between scanpy releases, and a smaller default would
# silently shrink this pool below N_TOP_GENE.
# Unlike the pools above, this list is kept in rank order rather than sorted by name.
sc.tl.rank_genes_groups(adata_norm, 'sex', groups=['male'], reference='female',
                        method='t-test_overestim_var', n_genes=N_TOP_GENE)
dic_gs_all['male'] = [x[0] for x in adata_norm.uns['rank_genes_groups']['names'][:N_TOP_GENE]]

# Old-specific genes: cells with age '3m' are labelled 'young', all others 'old',
# and 'old' is compared against the rest. Also kept in rank order.
adata_norm.obs['age_yo'] = ['young' if x=='3m' else 'old' for x in adata_norm.obs['age']]
sc.tl.rank_genes_groups(adata_norm, 'age_yo', groups=['old'], reference='rest',
                        method='t-test_overestim_var', n_genes=N_TOP_GENE)
dic_gs_all['old'] = [x[0] for x in adata_norm.uns['rank_genes_groups']['names'][:N_TOP_GENE]]

... storing 'age_yo' as categorical


In [7]:
# Randomly sample genes
# For every pool in dic_gs_all and every size in GS_SIZE_LIST, draw N_REP random
# gene sets (without replacement) and write them to OUT_PATH/gs_file/<gs_name>.gs
# as a tab-separated table with columns TRAIT and GENESET (comma-joined gene names).
# The name of every generated table is appended, one per line, to OUT_PATH/simu_list.txt.

# Neither open() nor to_csv() creates missing directories, so make sure they exist.
os.makedirs(OUT_PATH+'/gs_file', exist_ok=True)

# The with-block closes simu_list.txt even if sampling or writing fails part-way.
with open(OUT_PATH+'/simu_list.txt', 'w') as f:
    for suffix in dic_gs_all.keys():
        for n_gene in GS_SIZE_LIST:
            gs_name = '%s_ngene%d'%(suffix, n_gene)
            # Collect the rows first and build the frame once instead of growing it with .loc
            gs_rows = []
            for i_rep in np.arange(N_REP):
                # Seeding with the replicate index makes every replicate reproducible
                np.random.seed(i_rep)
                temp_list = np.random.choice(dic_gs_all[suffix], size=n_gene, replace=False)
                gs_rows.append(['%s_rep%d'%(gs_name,i_rep), ','.join(temp_list)])
            df_gs = pd.DataFrame(gs_rows, columns=['TRAIT', 'GENESET'])
            df_gs.to_csv(OUT_PATH+'/gs_file/%s.gs'%gs_name, sep='\t', index=False)
            f.write(gs_name+'\n')