In [5]:
import scanpy as sc
from anndata import read_h5ad
import pandas as pd
import numpy as np
import os
from os.path import join
import time
import argparse

# inhouse tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Constants: every simulation input and output lives under OUT_PATH
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
OUT_PATH = '/'.join([DATA_PATH, 'simulation_data'])
H5AD_FILE = '/'.join([OUT_PATH, 'single_cell_data', 'tms_facs.ncell_10k.h5ad'])
GS_FILE = '/'.join([OUT_PATH, 'gs_file', 'all_ngene1000.gs'])

In [8]:
# Read the gene-set table (tab-separated; first column becomes the index)
df_gs = pd.read_csv(GS_FILE, index_col=0, sep='\t')

# Read the single-cell data, scale every cell to 1e4 total counts, then log1p
adata = read_h5ad(H5AD_FILE)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
print('# H5AD_FILE loaded: ', adata.shape)

# H5AD_FILE loaded:  (10000, 18383)


In [9]:
# Generate .perturb files (randomly drawn causal cells) plus a manifest,
# perturb_list.txt, with one .perturb file name per line.
n_cell_causal = 500
n_gene_causal = 1000
param_list = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
# First half: vary effect size at overlap=0.25; second half: vary overlap at
# effect size=0.25.
# NOTE(review): [0.25, 0.25] appears in both halves, so that file is written
# twice and listed twice in the manifest. Left unchanged because removing it
# would shift the manifest line numbers.
config_list = [[x,0.25] for x in param_list] + [[0.25,x] for x in param_list]

# The candidate gene pools depend only on the trait, so build them once here
# rather than once per (effect_size, overlap) configuration.
var_name_set = set(adata.var_names)
dic_gene_pool = {}
for trait in df_gs.index:
    gs_gene_set = set(df_gs.loc[trait, 'GENESET'].split(','))
    dic_gene_pool[trait] = (sorted(gs_gene_set & var_name_set),
                            sorted(var_name_set - gs_gene_set))

# `with` guarantees the manifest is closed even if a configuration fails.
with open(OUT_PATH+'/perturb_list.txt', 'w') as f:
    for effect_size,overlap in config_list:
        # round() instead of int(): int() truncates, so a product that lands
        # just below its integer (e.g. 0.29*100 == 28.999999999999996) would be
        # off by one. For every value in param_list both give the same number.
        n_gene_gs = round(overlap*n_gene_causal)
        n_gene_other = n_gene_causal - n_gene_gs
        log_eff = np.log(effect_size+1)

        row_list = []
        for trait in df_gs.index:
            # NOTE(review): the seed is only the LAST character of the trait
            # name, so at most ten distinct seeds exist and traits ending in the
            # same digit (e.g. *_rep1 and *_rep11) get the same causal-cell draw.
            # Kept as-is so outputs stay identical to previously generated
            # files; confirm whether the full replicate number was intended.
            np.random.seed(int(trait[-1]))
            # NOTE(review): np.random.choice samples WITH replacement by
            # default, so cell_list may contain repeated cells.
            cell_list = np.random.choice(adata.obs_names, size=n_cell_causal)
            gene_list_gs, gene_list_other = dic_gene_pool[trait]
            gene_list = list(np.random.choice(gene_list_gs, size=n_gene_gs, replace=False)) + \
                list(np.random.choice(gene_list_other, size=n_gene_other, replace=False))
            # Every causal gene gets the same effect, log(1 + effect_size)
            v_eff = np.full(len(gene_list), log_eff)

            row_list.append([trait,
                             ','.join(gene_list),
                             ','.join([str(x) for x in v_eff]),
                             ','.join(cell_list)])

        # Build the table once per configuration instead of cell-by-cell .loc writes
        df_eff = pd.DataFrame(row_list, index=df_gs.index,
                              columns=['TRAIT', 'GENELIST', 'GENEEFF', 'CELLLIST'])

        perturb_file = 'tms_facs_ncell_10k.all_ngene1000.eff_%d_overlap_%d.perturb'\
                        %(round(effect_size*100), round(overlap*100))
        df_eff.to_csv(OUT_PATH+'/gs_file/'+perturb_file, sep='\t', index=False)
        f.write(perturb_file+'\n')

In [10]:
# Collect the names of cells whose cell_ontology_class is 'B cell'
is_b_cell = adata.obs['cell_ontology_class'] == 'B cell'
cell_list = adata.obs_names[is_b_cell].tolist()
print('n cell', len(cell_list))

n cell 528


In [11]:
# Generate .perturb files where the B cells are the causal cells, plus a
# manifest, perturb_list_celltype.txt, with one .perturb file name per line.
n_gene_causal = 1000
overlap = 0.25
# round() instead of int(): int() truncates float products that land just
# below their integer; for overlap=0.25 both give 250.
n_gene_gs = round(overlap*n_gene_causal)
n_gene_other = n_gene_causal - n_gene_gs

# Loop-invariant inputs, computed once rather than per trait and effect size:
# the B-cell list and the per-trait candidate gene pools.
cell_list = list(adata.obs_names[adata.obs['cell_ontology_class']=='B cell'])
var_name_set = set(adata.var_names)
dic_gene_pool = {}
for trait in df_gs.index:
    gs_gene_set = set(df_gs.loc[trait, 'GENESET'].split(','))
    dic_gene_pool[trait] = (sorted(gs_gene_set & var_name_set),
                            sorted(var_name_set - gs_gene_set))

# `with` guarantees the manifest is closed even if an effect size fails.
with open(OUT_PATH+'/perturb_list_celltype.txt', 'w') as f:
    for effect_size in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]:
        log_eff = np.log(effect_size+1)

        row_list = []
        for trait in df_gs.index:
            # NOTE(review): the seed is only the LAST character of the trait
            # name, so at most ten distinct seeds exist across all traits.
            # Kept as-is so outputs stay identical to previously generated
            # files; confirm whether the full replicate number was intended.
            np.random.seed(int(trait[-1]))
            gene_list_gs, gene_list_other = dic_gene_pool[trait]
            gene_list = list(np.random.choice(gene_list_gs, size=n_gene_gs, replace=False)) + \
                list(np.random.choice(gene_list_other, size=n_gene_other, replace=False))
            # Every causal gene gets the same effect, log(1 + effect_size)
            v_eff = np.full(len(gene_list), log_eff)

            row_list.append([trait,
                             ','.join(gene_list),
                             ','.join([str(x) for x in v_eff]),
                             ','.join(cell_list)])

        # Build the table once per effect size instead of cell-by-cell .loc writes
        df_eff = pd.DataFrame(row_list, index=df_gs.index,
                              columns=['TRAIT', 'GENELIST', 'GENEEFF', 'CELLLIST'])

        perturb_file = 'tms_facs_ncell_10k.all_ngene1000.Bcell.eff_%d_overlap_%d.perturb'\
                        %(round(effect_size*100), round(overlap*100))
        df_eff.to_csv(OUT_PATH+'/gs_file/'+perturb_file, sep='\t', index=False)
        f.write(perturb_file+'\n')

In [None]:
# # find list of scores not computed 
# df_perturb = pd.read_csv(DATA_PATH + '/simulation_data/perturb_list.txt', header=None)
# flist_miss = []
# for i_p,perturb in enumerate(df_perturb[0]):
#     folder_name = DATA_PATH+'/simulation_data/score_file/%s'%perturb
#     if os.path.exists(folder_name) is False:
#         flist_miss.append(i_p+1)
# print(','.join(['%d'%x for x in flist_miss]))

In [22]:
# Compare each regenerated .perturb file against the reference copy of the
# same name. For every file, print three totals summed over traits:
#   1. number of genes present in exactly one of the two GENELISTs
#   2. sum of absolute element-wise differences between the GENEEFF vectors
#   3. number of cells present in exactly one of the two CELLLISTs
# All three are 0 when the files agree.
GS_PATH = OUT_PATH+'/gs_file'
GA_PATH_REF = OUT_PATH+'/gs_file.080721'
# sorted(): os.listdir returns names in arbitrary order; sort for stable output
for gs in sorted(x for x in os.listdir(GS_PATH) if x.endswith('.perturb')):
    # Use a dedicated name here: assigning to `df_gs` would overwrite the
    # gene-set table that the generation cells read, breaking any re-run.
    df_perturb = pd.read_csv(GS_PATH+'/'+gs, sep='\t', index_col=0)
    df_gs_ref = pd.read_csv(GA_PATH_REF+'/'+gs, sep='\t', index_col=0)
    # Symmetric difference (^) also catches genes that exist only in the reference
    v_dif_gene = [len(set(df_perturb.loc[x,'GENELIST'].split(',')) ^
                      set(df_gs_ref.loc[x,'GENELIST'].split(',')))
                  for x in df_perturb.index]
    # Stored under its own name (it previously overwrote v_dif_gene, so the
    # gene-list check was never reported). abs() stops +/- differences cancelling.
    v_dif_eff = [np.abs(np.array(df_perturb.loc[x,'GENEEFF'].split(','), dtype=float)-
                        np.array(df_gs_ref.loc[x,'GENEEFF'].split(','), dtype=float)).sum()
                 for x in df_perturb.index]
    v_dif_cell = [len(set(df_perturb.loc[x,'CELLLIST'].split(',')) ^
                      set(df_gs_ref.loc[x,'CELLLIST'].split(',')))
                  for x in df_perturb.index]
    print(gs, np.array(v_dif_gene).sum(), np.array(v_dif_eff).sum(), np.array(v_dif_cell).sum())

tms_facs_ncell_10k.all_ngene1000.eff_5_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_10_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_25_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_35_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_25_overlap_20.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_50_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_30_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_35_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_25_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_20_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_15_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_30_overlap_25.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.eff_25_overlap_50.perturb 0.0 0.0 0
tms_facs_ncell_10k.all_ngene1000.Bcell.eff_20_overlap_25.perturb 0.0 0.0 0

In [16]:
# Display the reference table most recently loaded by the comparison loop above
df_gs_ref

Unnamed: 0_level_0,GENELIST,GENEEFF,CELLLIST
TRAIT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
all_ngene1000_rep0,"2210404J11Rik,Abhd10,Irx1,Mir680-2,Fancg,S100g...","0.04879016416943205,0.04879016416943205,0.0487...","E18_B002452_B009020_S114.mm10-plus-0-0,L20_B00..."
all_ngene1000_rep1,"Mtap1a,Zfp57,Sh2d2a,Mbip,Aktip,Eif4a1,Ccdc64b,...","0.04879016416943205,0.04879016416943205,0.0487...","M4.MAA000938.3_8_M.1.1-1-1,L18_B010818_S270.mu..."
all_ngene1000_rep2,"Ranbp9,Myo16,Kcnk3,Cnst,LOC622070,A230072E10Ri...","0.04879016416943205,0.04879016416943205,0.0487...","G7_B000176_B008056_S151.mm10-plus-3-0,I1_B0007..."
all_ngene1000_rep3,"A630089N07Rik,Slc26a10,Dnajc27,Acsl5,Sipa1l1,T...","0.04879016416943205,0.04879016416943205,0.0487...","N14.B002421.3_39_F.1.1-1-1,E1_B000120_B007345_..."
all_ngene1000_rep4,"BC053749,Epha3,Gm5485,Hcfc1,Acot2,9930021J03Ri...","0.04879016416943205,0.04879016416943205,0.0487...","K18.MAA000913.3_9_M.1.1-1-1,A14.MAA001857.3_38..."
...,...,...,...
all_ngene1000_rep95,"Apol7b,Esyt1,Chaf1b,0610011L14Rik,Zc3h12a,Zyg1...","0.04879016416943205,0.04879016416943205,0.0487...","H19_B002894_S259_L002.mus-2-0-1,I19_B000797_B0..."
all_ngene1000_rep96,"Gmip,Serpina3n,Eral1,Arpc1a,Krt42,Glt8d2,Mtmr1...","0.04879016416943205,0.04879016416943205,0.0487...","L6_D045853_B009304_S66.mm10-plus-1-0,P16.B0023..."
all_ngene1000_rep97,"Rfc1,Eif4h,Tm9sf4,Rab43,Fam184a,Tnrc6a,Mblac2,...","0.04879016416943205,0.04879016416943205,0.0487...","B13_B000843_S121_L001.mus-0-0-1,I5_B003009_B00..."
all_ngene1000_rep98,"Pou5f2,Tspan11,Dhrsx,Zfp82,Ccdc68,Arhgap27,Urb...","0.04879016416943205,0.04879016416943205,0.0487...","J10_B002999_S226_L002.mus-6-0-1,B15.MAA000910...."
