In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
from gprofiler import GProfiler

# scTRS tools
import scTRS.util as util
import scTRS.data_loader as dl
import scTRS.method as md

# autoreload: load the IPython extension and use mode 2, so edits to imported
# modules (e.g. the scTRS tools above) are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# logging: print the versions of scanpy and its main dependencies
sc.logging.print_versions()

scanpy==1.5.1 anndata==0.7.4 umap==0.4.6 numpy==1.19.0 scipy==1.5.1 pandas==1.0.5 scikit-learn==0.23.1 statsmodels==0.11.1


In [2]:
# Setup file paths
# Both locations default to the original cluster directory. To run against your
# own copy without editing the notebook, set the SCTRS_DATA_PATH / SCTRS_OUT_PATH
# environment variables before starting the kernel.
DATA_PATH = os.environ.get(
    'SCTRS_DATA_PATH', '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data')
# Please change this to your own path (or set SCTRS_OUT_PATH)
OUT_PATH = os.environ.get(
    'SCTRS_OUT_PATH', '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data')

### Load TMS data 

In [3]:
# Count data (used for scoring genes); the load is timed so the cost is visible
start_time = time.time()
data_facs_ct = dl.load_tms_ct(DATA_PATH, data_name='facs')
facs_ct_shape = data_facs_ct.shape
load_seconds = time.time() - start_time
# Report the size of the loaded matrix (cells x genes), then the elapsed time
print('# TMS facs count data: n_cell=%d, n_gene=%d'
      %(facs_ct_shape[0], facs_ct_shape[1]))
print('# time=%0.1fs'%load_seconds)

Trying to set attribute `.obs` of view, copying.


# TMS facs count data: n_cell=110096, n_gene=22966
# time=14.7s


### Null gene set

In [8]:
# Fix the global NumPy RNG state so the sampled gene sets are reproducible
np.random.seed(0)
gs_size_list = [20, 100, 500]
n_rep = 100

# The 'mean', 'var' and 'var_tech' columns of data_facs_ct.var are read right
# after this call
md.compute_stats(data_facs_ct)
gene_mean = data_facs_ct.var['mean'].values
gene_var = data_facs_ct.var['var'].values
gene_std = np.sqrt(gene_var)
df_gene = pd.DataFrame(
    {
        'mean': gene_mean,
        'std': gene_std,
        'cv': gene_std / gene_mean,  # coefficient of variation (std / mean)
        'var': gene_var,
        'var_tech': data_facs_ct.var['var_tech'].values,
    },
    index=data_facs_ct.var_names,
)
# Ascending by mean expression: lowest-expressed genes come first
df_gene = df_gene.sort_values(by=['mean'])

def _write_null_geneset(gs_name, gene_pool, n_gene, out_dir):
    """Sample a random gene set from `gene_pool` and write it as a TSV file.

    Parameters
    ----------
    gs_name : str
        File name (no extension) of the gene set inside `out_dir`.
    gene_pool : pd.Index
        Candidate gene names to sample from.
    n_gene : int
        Number of genes to draw (without replacement, via a permutation).
    out_dir : str
        Existing directory that receives the file.
    """
    # The RNG call order (permutation -> rand -> randn) is the same as in the
    # original copy-pasted loops, so a given seed keeps producing the same draws.
    ind_select = np.random.permutation(len(gene_pool))[:n_gene]
    gene_list = list(gene_pool[ind_select])
    temp_df = pd.DataFrame()
    temp_df['gene'] = gene_list
    temp_df['gene_weight.unif'] = np.random.rand(len(gene_list))
    temp_df['gene_weight.normal'] = np.random.randn(len(gene_list))
    temp_df.to_csv(out_dir+'/%s'%gs_name, sep='\t', index=False)


# Number of genes at each end of the mean-expression ranking used as the
# lowly-/highly-expressed pools
N_EXTREME_GENE = 5000

# to_csv does not create missing directories, so make sure the target exists
null_geneset_dir = OUT_PATH+'/data_null_geneset'
os.makedirs(null_geneset_dir, exist_ok=True)

# (file-name prefix, candidate genes). df_gene is sorted by ascending mean, so
# its head holds the lowly-expressed genes and its tail the highly-expressed ones.
# Pools are processed in this order (all -> lowexp -> highexp), each over every
# size and replicate, to preserve the original sequence of random draws.
null_gene_pools = [
    ('all', data_facs_ct.var_names),
    ('lowexp', df_gene.index[:N_EXTREME_GENE]),
    ('highexp', df_gene.index[-N_EXTREME_GENE:]),
]

for pool_name, gene_pool in null_gene_pools:
    for n_gene in gs_size_list:
        for i_rep in np.arange(n_rep):
            _write_null_geneset('%s_size%d_rep%d'%(pool_name, n_gene, i_rep),
                                gene_pool, n_gene, null_geneset_dir)


In [9]:
# Null gene set containing every gene, with freshly seeded random weights
np.random.seed(0)
n_rep = 1

gs_name = 'allgene_rep0'
gene_list = data_facs_ct.var_names.tolist()

# Draw the uniform weights first and the normal weights second, so the RNG
# stream is consumed in a fixed order
weight_unif = np.random.rand(len(gene_list))
weight_normal = np.random.randn(len(gene_list))
temp_df = pd.DataFrame({
    'gene': gene_list,
    'gene_weight.unif': weight_unif,
    'gene_weight.normal': weight_normal,
})

# Tab-separated file with a header row and no index column
out_file = OUT_PATH+'/data_null_geneset/%s'%gs_name
temp_df.to_csv(out_file, sep='\t', index=False)