In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from gprofiler import GProfiler

# scTRS tools
import scTRS.util as util
import scTRS.data_loader as dl
import scTRS.method as md

# autoreload: re-import edited modules automatically before each cell runs,
# so changes to the local scTRS package are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# logging: print the versions of scanpy and its main dependencies so the
# environment that produced the outputs below is recorded with the notebook.
sc.logging.print_versions()

scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.19.0 scipy==1.5.1 pandas==1.0.5 scikit-learn==0.23.1 statsmodels==0.11.1


In [2]:
# Setup file paths
# DATA_PATH: root directory of the scTRS input data. In this notebook it is
# passed to dl.load_tms_ct and is where the 'config_list' file and the
# 'data_null_geneset' directory are read from.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data'
# Please change this to your own path
# RESULT_PATH: presumably the output directory for results; it is not used in
# the cells visible here -- TODO confirm against the rest of the notebook.
RESULT_PATH = '/n/home11/mjzhang/gwas_informed_scRNAseq/results'

### Load TMS data 

In [3]:
# Count data (used for scoring genes)
# Load the TMS FACS count matrix, report its size, then subsample it in place
# to 20,000 cells (fixed random_state for reproducibility) and report the size
# again together with the wall-clock time of the whole cell.
start_time = time.time()

data_facs_ct = dl.load_tms_ct(DATA_PATH, data_name='facs')
n_cell_full, n_gene_full = data_facs_ct.shape[0], data_facs_ct.shape[1]
print('# TMS facs count data: n_cell=%d, n_gene=%d'
      %(n_cell_full, n_gene_full))

# copy=False: data_facs_ct itself is modified rather than a copy returned.
sc.pp.subsample(data_facs_ct, n_obs=20000, random_state=0, copy=False)
n_cell_sub, n_gene_sub = data_facs_ct.shape[0], data_facs_ct.shape[1]
print('# Subsampled TMS facs count data: n_cell=%d, n_gene=%d'
      %(n_cell_sub, n_gene_sub))

elapsed_sec = time.time() - start_time
print('# time=%0.1fs'%(elapsed_sec))

Trying to set attribute `.obs` of view, copying.


# TMS facs count data: n_cell=110096, n_gene=22966
# Subsampled TMS facs count data: n_cell=20000, n_gene=22966
# time=10.3s


### Generate config list 

In [27]:
# NOTE: one-off generator for the configuration grid, kept commented out.
# It enumerates every combination of the three option lists below
# (3 ctrl_opt x 3 trs_opt x 2 bc_opt = 18 configurations) and writes them as a
# tab-separated table to '<scTRS_data>/config_list', the same file name that
# the "Test TRS" section reads back with pd.read_csv. Uncomment and run only
# when the configuration list needs to be regenerated.
#
# ctrl_opt_list = ['random', 'mean_match', 'mean_bvar_match']
# trs_opt_list = ['mean', 'vst', 'inv_std']
# bc_opt_list = ['recipe_vision', 'empi']

# df_config = pd.DataFrame(columns=['ctrl_opt', 'trs_opt', 'bc_opt'])
# for ctrl_opt in ctrl_opt_list:
#     for trs_opt in trs_opt_list:
#         for bc_opt in bc_opt_list:
#             config_name='ctrl_opt=%s;trs_opt=%s;bc_opt=%s'%(ctrl_opt,trs_opt,bc_opt)
#             df_config.loc[config_name] = [str(ctrl_opt),str(trs_opt),str(bc_opt)]
# df_config['config_name'] = df_config.index
# df_config = df_config[['config_name', 'ctrl_opt', 'trs_opt', 'bc_opt']]
# df_config.to_csv('/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/config_list', sep='\t', index=False)

### Test TRS

In [4]:
# Precompute mean and var
# The return value is discarded, so md.compute_stats presumably stores the
# statistics on data_facs_ct in place for later use by md.score_cell --
# NOTE(review): confirm in scTRS.method.
md.compute_stats(data_facs_ct)

In [8]:
# Load the table of scoring configurations (one row per ctrl_opt / trs_opt /
# bc_opt combination) before reporting its size. Loading it here, rather than
# relying on a later cell having been run first, keeps this cell working on a
# fresh kernel with top-to-bottom execution.
df_config = pd.read_csv(DATA_PATH+'/config_list', sep='\t')
df_config.index = df_config['config_name']
print('# n_config=%d'%df_config.shape[0])

# n_config=18


In [5]:
# Obtain TRS: This takes a lot of time.
#
# For every scoring configuration in DATA_PATH/config_list and every gene set
# file in DATA_PATH/data_null_geneset whose name ends in '_rep0', score the
# cells under three gene-weighting schemes and report the fraction of cells
# whose 'trs_ep' value falls below PVAL_THRES. Only combinations where that
# fraction exceeds PVAL_THRES are printed (the gene sets are presumably null
# sets, so a larger fraction would indicate miscalibration -- TODO confirm).
#
# DATA_PATH is taken from the setup cell; it is deliberately not redefined
# here so that changing the path there is honoured by this cell as well.


def _none_if_missing(value):
    """Convert the config table's 'None' placeholder to a real Python None.

    Besides the literal string 'None', a missing value (NaN/None) is also
    mapped to None: recent pandas versions (2.0+) treat the string 'None' as
    NA by default in read_csv, in which case the string comparison alone
    would let a NaN through to md.score_cell.

    Parameters
    ----------
    value : scalar
        A single cell read from the config table.

    Returns
    -------
    The input value unchanged, or None if it is 'None' or missing.
    """
    if pd.isna(value) or value == 'None':
        return None
    return value


# Threshold used both for calling a cell significant and as the expected
# proportion of such cells that a configuration must exceed to be reported.
PVAL_THRES = 5e-4

# (label printed in the report, gene-weight column of the gene set file);
# a column of None means md.score_cell is called without gene_weight.
WEIGHT_SCHEMES = [
    ('equal_weight', None),
    ('unif_weight', 'gene_weight.unif'),
    ('norm_weight', 'gene_weight.normal'),
]

NULL_GENESET_DIR = DATA_PATH+'/data_null_geneset'
flist = sorted(x for x in os.listdir(NULL_GENESET_DIR) if x.endswith('_rep0'))

df_config = pd.read_csv(DATA_PATH+'/config_list', sep='\t')
df_config.index = df_config['config_name']

start_time = time.time()

for config in df_config.index:
    ctrl_opt = _none_if_missing(df_config.loc[config,'ctrl_opt'])
    trs_opt = df_config.loc[config,'trs_opt']
    bc_opt = _none_if_missing(df_config.loc[config,'bc_opt'])

    print('=================================================================')
    print('# ctrl_opt=%s,trs_opt=%s,bc_opt=%s'%(ctrl_opt,trs_opt,bc_opt))
    for fname in flist:
        temp_df = pd.read_csv(NULL_GENESET_DIR+'/'+fname, sep='\t')

        for weight_label, weight_col in WEIGHT_SCHEMES:
            score_kwargs = dict(suffix='',
                                ctrl_opt=ctrl_opt, trs_opt=trs_opt, bc_opt=bc_opt,
                                n_ctrl=1, n_genebin=200,
                                return_list=['trs_ep'], verbose=False)
            # Only pass gene_weight when the scheme has one, so the
            # equal-weight call is exactly the call without that argument.
            if weight_col is not None:
                score_kwargs['gene_weight'] = temp_df[weight_col].values

            # Each call writes 'trs_ep' into data_facs_ct.obs (suffix=''),
            # overwriting the previous one, so it is read immediately after.
            md.score_cell(data_facs_ct, temp_df['gene'].values, **score_kwargs)
            prop_sig = (data_facs_ct.obs['trs_ep']<PVAL_THRES).mean()
            if prop_sig>PVAL_THRES:
                print('# %-30s %-19s prop_sig=%0.2e'%(fname, weight_label, prop_sig))

# Report the total wall-clock time, in the same format as the data-loading cell.
print('# time=%0.1fs'%(time.time() - start_time))

# ctrl_opt=random,trs_opt=mean,bc_opt=recipe_vision
# all_size100_rep0               norm_weight         prop_sig=9.00e-04
# all_size500_rep0               equal_weight        prop_sig=5.50e-04
# all_size500_rep0               unif_weight         prop_sig=6.50e-04
# all_size500_rep0               norm_weight         prop_sig=1.30e-03
# lowexp_size100_rep0            norm_weight         prop_sig=2.05e-03
# lowexp_size20_rep0             norm_weight         prop_sig=1.00e-03
# lowexp_size500_rep0            norm_weight         prop_sig=3.40e-03
# ctrl_opt=random,trs_opt=mean,bc_opt=empi
# all_size100_rep0               equal_weight        prop_sig=9.50e-04
# all_size100_rep0               unif_weight         prop_sig=6.00e-04
# all_size100_rep0               norm_weight         prop_sig=6.50e-04
# all_size500_rep0               equal_weight        prop_sig=7.50e-04
# all_size500_rep0               unif_weight         prop_sig=7.00e-04
# lowexp_size100_rep0            equal_weight        

# all_size100_rep0               equal_weight        prop_sig=2.05e-03
# all_size100_rep0               unif_weight         prop_sig=1.25e-03
# all_size100_rep0               norm_weight         prop_sig=1.20e-03
# all_size20_rep0                norm_weight         prop_sig=3.15e-03
# all_size500_rep0               unif_weight         prop_sig=6.50e-04
# all_size500_rep0               norm_weight         prop_sig=1.25e-03
# allgene_rep0                   unif_weight         prop_sig=5.50e-04
# allgene_rep0                   norm_weight         prop_sig=1.30e-03
# highexp_size100_rep0           equal_weight        prop_sig=1.55e-03
# highexp_size100_rep0           norm_weight         prop_sig=6.00e-04
# highexp_size20_rep0            equal_weight        prop_sig=1.85e-03
# highexp_size20_rep0            unif_weight         prop_sig=7.00e-04
# highexp_size500_rep0           unif_weight         prop_sig=7.50e-04
# lowexp_size100_rep0            equal_weight        prop_sig=6.50e-04
# lowe

# ctrl_opt=mean_bvar_match,trs_opt=inv_std,bc_opt=empi
# all_size100_rep0               unif_weight         prop_sig=9.50e-04
# all_size100_rep0               norm_weight         prop_sig=8.50e-04
# all_size20_rep0                unif_weight         prop_sig=6.50e-04
# all_size20_rep0                norm_weight         prop_sig=6.50e-04
# all_size500_rep0               equal_weight        prop_sig=7.00e-04
# highexp_size100_rep0           equal_weight        prop_sig=9.50e-04
# highexp_size100_rep0           unif_weight         prop_sig=7.50e-04
# highexp_size100_rep0           norm_weight         prop_sig=7.00e-04
# highexp_size500_rep0           equal_weight        prop_sig=1.20e-03
# highexp_size500_rep0           unif_weight         prop_sig=1.00e-03
# lowexp_size100_rep0            unif_weight         prop_sig=5.50e-04
# lowexp_size20_rep0             equal_weight        prop_sig=9.50e-04
# lowexp_size500_rep0            norm_weight         prop_sig=6.50e-04
