In [1]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import os
from os.path import join
import time
import argparse

# inhouse tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

In [22]:
# Filesystem layout: everything lives under DATA_PATH.
#   OUT_PATH  - directory that receives the simulation outputs
#   H5AD_FILE - input .h5ad file that is loaded for the analysis
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
OUT_PATH = f'{DATA_PATH}/simulation_data'
H5AD_FILE = f'{DATA_PATH}/tabula_muris_senis/tabula-muris-senis-facs-official-raw-obj.h5ad'

In [23]:
# Load the AnnData object stored at H5AD_FILE and apply basic QC filters.
# NOTE(review): the file name suggests raw counts ("raw-obj") - confirm.
adata = read_h5ad(H5AD_FILE)
# Both filters are called without capturing a return value, so they are
# relied upon to modify `adata` in place.
# Cell filter: threshold of 500 genes per cell.
sc.pp.filter_cells(adata, min_genes=500)
# Gene filter: threshold of 200 cells per gene.
sc.pp.filter_genes(adata, min_cells=200)
# Report the shape that remains after filtering.
print('# H5AD_FILE loaded: ', adata.shape)

# H5AD_FILE loaded:  (110823, 18383)


In [4]:
# # Generate subsampled data 
# for n_cell in [1e3,2e3,5e3,1e4]:
#     np.random.seed(0)
#     ind_select = np.random.choice(np.arange(adata.shape[0]), size=int(n_cell), replace=False)
#     temp_adata = adata[ind_select].copy()
#     temp_adata.write(OUT_PATH+'/single_cell_data/tms_facs.ncell_%dk.h5ad'%int(n_cell/1000))
    
#     temp_adata.obs.to_csv(OUT_PATH+'/single_cell_data/tms_facs.ncell_%dk.obs'%int(n_cell/1000), sep='\t')
#     temp_df = pd.DataFrame(index=temp_adata.obs_names, columns=temp_adata.var_names,
#                            data=temp_adata.X.toarray(), dtype=int).T
#     temp_df.to_csv(OUT_PATH+'/single_cell_data/tms_facs.ncell_%dk.tsv'%int(n_cell/1000), sep='\t')
#     print(temp_adata.shape)

### Generate .gs files 

In [24]:
# Settings for gene-set generation
GS_SIZE_LIST = [100, 500, 1000]    # gene-set sizes to simulate
N_REP = 100                        # replicates per gene-set configuration
N_GENE_SELECT = adata.n_vars // 4  # a quarter of the genes (rounded down)

# Normalised working copy; `adata` itself is left as loaded.
adata_norm = adata.copy()
sc.pp.normalize_per_cell(adata_norm, counts_per_cell_after=1e4)
sc.pp.log1p(adata_norm)
# compute_stats is expected to populate the per-gene columns used below
# ('mean', 'var', 'var_tech') in adata_norm.var.
md.compute_stats(adata_norm)
# 'bvar' = 'var' minus 'var_tech', stored as a new per-gene column.
adata_norm.var['bvar'] = adata_norm.var['var'].sub(adata_norm.var['var_tech'])

In [25]:
# Gene pools from which random gene sets are drawn, keyed by pool name.
# 'all' holds every gene in `adata`; each remaining pool holds the
# N_GENE_SELECT genes with the largest value of one per-gene column of
# adata_norm.var. Every pool is stored as a sorted list of gene names.
dic_gs_all = {'all': sorted(adata.var_names)}
dic_gs_all.update({
    pool: sorted(
        adata_norm.var.sort_values(stat, ascending=False).index[:N_GENE_SELECT]
    )
    for pool, stat in [
        ('highmean', 'mean'),  # highly-expressed genes
        ('highvar', 'var'),    # highly-variable genes
        ('highbvar', 'bvar'),  # overly-dispersed genes
    ]
})

In [26]:
# Randomly sample gene sets and write them to disk.
#
# For every gene pool in dic_gs_all and every size in GS_SIZE_LIST, draw
# N_REP random gene sets (without replacement) and write them to
# OUT_PATH/gs_file/<pool>_ngene<size>.gs as a tab-separated table with the
# columns TRAIT and GENESET (comma-joined gene names). The name of each
# generated file (without extension) is appended to OUT_PATH/simu_list.txt.
#
# The RNG is re-seeded with the replicate index before each draw, so the
# seed depends only on i_rep: the same replicate index reuses the same seed
# for every pool and every gene-set size. This scheme is kept unchanged so
# that previously generated gene sets remain reproducible.

# Make sure the output directory exists before any file is written.
os.makedirs(OUT_PATH+'/gs_file', exist_ok=True)

# `with` guarantees the list file is closed even if a write below fails.
with open(OUT_PATH+'/simu_list.txt', 'w') as f:
    for suffix in dic_gs_all.keys():
        for n_gene in GS_SIZE_LIST:
            gs_name = '%s_ngene%d'%(suffix, n_gene)
            # Collect rows first and build the DataFrame once, instead of
            # growing it row by row.
            records = []
            for i_rep in range(N_REP):
                np.random.seed(i_rep)
                temp_list = np.random.choice(dic_gs_all[suffix], size=n_gene, replace=False)
                records.append(('%s_rep%d'%(gs_name,i_rep), ','.join(temp_list)))
            df_gs = pd.DataFrame(records, columns=['TRAIT', 'GENESET'])
            df_gs.to_csv(OUT_PATH+'/gs_file/%s.gs'%gs_name, sep='\t', index=False)
            # Only list a gene-set file after it has been written successfully.
            f.write(gs_name+'\n')

In [29]:
# Compare the newly generated gene sets with the reference copy.
#
# For every .gs file in GS_PATH, load the file of the same name from
# GA_PATH_REF and, for each trait (row) of the new file, count the genes
# that appear in exactly one of the two gene sets. The per-file total is
# printed; 0 means every trait's gene set matches the reference.
#
# The symmetric difference is used so that genes missing from the new set
# are counted as well as genes missing from the reference. A trait that is
# absent from the reference file raises a KeyError.
GS_PATH = OUT_PATH+'/gs_file'
GA_PATH_REF = OUT_PATH+'/gs_file.080721'
for gs in [x for x in os.listdir(GS_PATH) if x.endswith('.gs')]:
    # TRAIT (first column) becomes the index so rows can be matched by name.
    df_gs = pd.read_csv(GS_PATH+'/'+gs, sep='\t', index_col=0)
    df_gs_ref = pd.read_csv(GA_PATH_REF+'/'+gs, sep='\t', index_col=0)
    v_dif = [len(set(df_gs.loc[x,'GENESET'].split(','))
                 ^ set(df_gs_ref.loc[x,'GENESET'].split(',')))
             for x in df_gs.index]
    print(gs, np.array(v_dif).sum())

highbvar_ngene500.gs 0
all_ngene1000.gs 0
highbvar_ngene1000.gs 0
highmean_ngene500.gs 0
highbvar_ngene100.gs 0
highmean_ngene1000.gs 0
all_ngene100.gs 0
highvar_ngene100.gs 0
highvar_ngene1000.gs 0
highvar_ngene500.gs 0
all_ngene500.gs 0
highmean_ngene100.gs 0
