In [3]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy as sp
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
from anndata import AnnData
import os
import time
from gprofiler import GProfiler

# scTRS tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

In [5]:
# Working directory for the Tabula Sapiens data; processed objects are written here.
OUT_PATH='/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/single_cell_data/tabula_sapiens'
# Input .h5ad file, located in the raw_data sub-directory of OUT_PATH.
DATA_FILE=OUT_PATH+'/raw_data/TabulaSapiens.h5ad'

In [3]:
# Load the full object and replace .X with the matrix stored in .raw.
adata_full = sc.read_h5ad(DATA_FILE)
adata_full.X = adata_full.raw.X
# Drop the 'decontXcounts' layer and the .raw slot before subsetting and writing.
del adata_full.layers["decontXcounts"]
del adata_full.raw

# 'tissue' is a copy of 'organ_tissue'; 'tissue_celltype' is the label "<tissue>.<cell_ontology_class>".
adata_full.obs['tissue'] = adata_full.obs['organ_tissue']
adata_full.obs['tissue_celltype'] = [
    '%s.%s'%(tissue, celltype)
    for tissue, celltype in zip(adata_full.obs['tissue'], adata_full.obs['cell_ontology_class'])
]

# QC thresholds passed to scanpy's filter_cells / filter_genes below.
MIN_GENES_PER_CELL = 250
MIN_CELLS_PER_GENE = 50


def print_dataset_summary(adata):
    """Print the cell/gene counts and the number of distinct tissue, cell-type
    and tissue-cell-type labels found in `adata.obs`.

    Reads the 'tissue', 'cell_ontology_class' and 'tissue_celltype' columns.
    """
    print('# n_cell=%d, n_gene=%d'%(adata.shape[0], adata.shape[1]))
    print('# n_tissue=%d'%(len(set(adata.obs['tissue']))))
    print('# n_celltype=%d'%(len(set(adata.obs['cell_ontology_class']))))
    print('# n_tissue_celltype=%d'%(len(set(adata.obs['tissue_celltype']))))


# Process each value of obs['method'] separately and write one file per method.
for method in ['smartseq2', '10X']:
    adata = adata_full[adata_full.obs['method']==method].copy()

    # Before filtering
    print(method)
    print_dataset_summary(adata)

    # QC filtering: keep cells with at least MIN_GENES_PER_CELL detected genes,
    # then keep genes detected in at least MIN_CELLS_PER_GENE cells.
    sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
    sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)
    adata.write(OUT_PATH+'/obj_%s_raw.h5ad'%method)

    # After filtering
    print('After filtering')
    print_dataset_summary(adata)

smartseq2
# n_cell=27051, n_gene=58870
# n_tissue=24
# n_celltype=134
# n_tissue_celltype=353


... storing 'tissue_celltype' as categorical


After filtering
# n_cell=26813, n_gene=34963
# n_tissue=24
# n_celltype=134
# n_tissue_celltype=353
10X
# n_cell=454069, n_gene=58870
# n_tissue=24
# n_celltype=180
# n_tissue_celltype=470


... storing 'tissue_celltype' as categorical


After filtering
# n_cell=453582, n_gene=36715
# n_tissue=24
# n_celltype=180
# n_tissue_celltype=470


In [4]:
# TS FACS: load the filtered smartseq2 object.
# NOTE(review): this directory is under 'scDRS_data', whereas OUT_PATH (where
# obj_smartseq2_raw.h5ad is written) and the TS Droplet cell use 'scTRS_data'
# -- confirm which location is intended.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data/single_cell_data/tabula_sapiens'
adata_raw = sc.read_h5ad(os.path.join(DATA_PATH, 'obj_smartseq2_raw.h5ad'))

In [14]:
# Make .cov file: one row per cell with a constant column, the number of
# detected genes, and 0/1 donor indicator columns.
REFERENCE_DONOR = 'TSP1'  # donor without an indicator column; presumably the baseline level -- confirm

df_cov = pd.DataFrame(index=adata_raw.obs.index)
df_cov['const'] = 1
# Number of genes with a positive value per cell. For a SciPy sparse matrix,
# .sum(axis=1) returns a 2-D (n_cell, 1) result, so flatten it to 1-D before
# storing it as a column; for a dense array this is a no-op.
df_cov['n_genes'] = np.asarray((adata_raw.X>0).sum(axis=1)).ravel()
for donor in sorted(set(adata_raw.obs['donor'])):
    if donor == REFERENCE_DONOR:
        continue
    df_cov['donor_%s'%donor] = (adata_raw.obs['donor']==donor).astype(int)
df_cov.to_csv(DATA_PATH+'/ts_smartseq2.cov', sep='\t')

In [8]:
# TS Droplet: load the filtered 10X object.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scTRS_data/single_cell_data/tabula_sapiens'
adata_raw = sc.read_h5ad(os.path.join(DATA_PATH, 'obj_10X_raw.h5ad'))

In [9]:
# Make .cov file: one row per cell with a constant column and the number of
# detected genes (no donor indicator columns are added for this dataset).
df_cov = pd.DataFrame(index=adata_raw.obs.index)
df_cov['const'] = 1
# Number of genes with a positive value per cell. For a SciPy sparse matrix,
# .sum(axis=1) returns a 2-D (n_cell, 1) result, so flatten it to 1-D before
# storing it as a column; for a dense array this is a no-op.
df_cov['n_genes'] = np.asarray((adata_raw.X>0).sum(axis=1)).ravel()
df_cov.to_csv(DATA_PATH+'/ts_10X.cov', sep='\t')