In [1]:
import pandas as pd
import scanpy as sc

# scDRS tools
import scdrs.util as util
import scdrs.data_loader as dl
import scdrs.method as md

# autoreload
%load_ext autoreload
%autoreload 2

In [2]:
# Root directory handed to every `dl.load_*` call in this notebook.
DATA_PATH = '/n/holystore01/LABS/price_lab/Users/mjzhang/scDRS_data'
# Destination of the data-set summary table (tab-separated), kept under DATA_PATH.
OUT_FILE = DATA_PATH + '/supp_table/data_info.tsv'

In [4]:
# Build `df_info`: one row per data set with its name, species, number of
# cells (`adata.shape[0]`), tissue labels and cell-type labels.
#
# Every data set follows the same recipe (load -> count cells -> collect labels),
# so the per-data-set differences live in DATASET_SPECS and a single loop does
# the work instead of nine copy-pasted stanzas.


def _join_unique(labels):
    """Return the distinct entries of `labels`, sorted and joined with ';'."""
    return ';'.join(sorted(set(labels)))


def _resolve_labels(adata, label_spec):
    """Turn a label spec into the string stored in `df_info`.

    Parameters
    ----------
    adata
        Object returned by one of the loaders; only `adata.obs[<column>]` is read.
    label_spec : tuple
        ``('obs', column)``  -> sorted distinct values of ``adata.obs[column]``,
        joined with ';'.
        ``('fixed', label)`` -> ``label`` itself, unchanged.

    Returns
    -------
    str
        The label string for the 'Tissues' or 'Cell types' column.

    Raises
    ------
    ValueError
        If the spec kind is neither 'obs' nor 'fixed'.
    """
    kind, value = label_spec
    if kind == 'obs':
        return _join_unique(adata.obs[value])
    if kind == 'fixed':
        return value
    raise ValueError('Unknown label spec kind: %r' % (kind,))


# Keyword arguments shared by every loader that is called with opt='raw' below.
RAW_LOAD_KWARGS = dict(opt='raw', flag_size_factor=False, flag_log1p=False)

# (data set name, species, loader, tissue spec, cell-type spec).
# Loaders are wrapped in lambdas so nothing is read from disk until the loop
# reaches that entry; this keeps the original load order and error timing.
DATASET_SPECS = [
    ('TMS FACS', 'Mus musculus',
     lambda: dl.load_tms_ct(DATA_PATH, data_name='facs'),
     ('obs', 'tissue'), ('obs', 'cell_ontology_class')),
    ('TMS Droplet', 'Mus musculus',
     lambda: dl.load_tms_ct(DATA_PATH, data_name='droplet'),
     ('obs', 'tissue'), ('obs', 'cell_ontology_class')),
    ('TS FACS', 'Homo sapiens',
     lambda: dl.load_ts_facs(DATA_PATH, **RAW_LOAD_KWARGS),
     ('obs', 'organ_tissue'), ('obs', 'cell_ontology_class')),
    ('Cano-Gamez & Soskic', 'Homo sapiens',
     lambda: dl.load_canogamez(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Blood'), ('obs', 'cluster.id')),
    ('Nathan', 'Homo sapiens',
     lambda: dl.load_nathan(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Blood'), ('obs', 'cluster_name')),
    ('Aizarani', 'Homo sapiens',
     lambda: dl.load_aizarani(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Liver'), ('obs', 'celltype')),
    ('Halpern & Shenhav', 'Mus musculus',
     lambda: dl.load_halpern(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Liver'), ('fixed', 'Hepatocyte')),
    ('Richter & Deligiannis', 'Mus musculus',
     lambda: dl.load_richter(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Liver'), ('fixed', 'Hepatocyte')),
    ('Rao', 'Mus musculus',
     lambda: dl.load_rao(DATA_PATH, **RAW_LOAD_KWARGS),
     ('fixed', 'Liver'), ('obs', 'annotated.idents')),
]

df_info = pd.DataFrame(columns=['Data set', 'Species', 'Number of cells', 'Tissues', 'Cell types'])

for dname, species, load_adata, tissue_spec, celltype_spec in DATASET_SPECS:
    # Only one data set is referenced at a time; `adata` is rebound on each pass.
    adata = load_adata()
    # Row label and 'Data set' column both carry the data set name.
    df_info.loc[dname] = [dname, species, adata.shape[0],
                          _resolve_labels(adata, tissue_spec),
                          _resolve_labels(adata, celltype_spec)]

Trying to set attribute `.obs` of view, copying.


In [5]:
# Print one summary line per data set while pooling the tissue and cell-type
# labels of all data sets.
tissue_list, celltype_list = [], []
for dname in df_info.index:
    info_row = df_info.loc[dname]
    # Labels are stored as ';'-joined strings; split them back into lists.
    ds_tissues = info_row['Tissues'].split(';')
    ds_celltypes = info_row['Cell types'].split(';')
    tissue_list.extend(ds_tissues)
    celltype_list.extend(ds_celltypes)
    print('# %s\tn_cell=%d\tn_tissue=%d\tn_celltype=%d'
          % (dname, info_row['Number of cells'], len(ds_tissues), len(ds_celltypes)))

# Reduce the pooled labels to sorted distinct values before reporting totals,
# so a label shared by several data sets is counted once.
tissue_list = sorted(set(tissue_list))
celltype_list = sorted(set(celltype_list))
total_n_cell = df_info['Number of cells'].sum()
print('# Total\tn_cell=%d\tn_tissue=%d\tn_celltype=%d'
      % (total_n_cell, len(tissue_list), len(celltype_list)))

# Save the table as a tab-separated file.
df_info.to_csv(OUT_FILE, sep='\t')

# TMS FACS	n_cell=110096	n_tissue=23	n_celltype=120
# Total	n_cell=110096	n_tissue=23	n_celltype=120


In [8]:
# Distinct tissue labels pooled over every data set in `df_info`, sorted,
# then shown as one comma-separated line.
tissue_list = sorted({
    tissue
    for ds in df_info.index
    for tissue in df_info.loc[ds, 'Tissues'].split(';')
})
print(', '.join(tissue_list))

Aorta, BAT, Bladder, Brain_Myeloid, Brain_Non-Myeloid, Diaphragm, GAT, Heart, Kidney, Large_Intestine, Limb_Muscle, Liver, Lung, MAT, Mammary_Gland, Marrow, Pancreas, SCAT, Skin, Spleen, Thymus, Tongue, Trachea


In [27]:
# Balance of TMS FACS: share of all cells taken up by the most frequent
# cell types.
dname = 'TMS FACS'
adata = dl.load_tms_ct(DATA_PATH, data_name='facs')

# Cells per `cell_ontology_class`, counted via the length of the 'cell' column.
n_cell_by_type = adata.obs.groupby('cell_ontology_class').agg({'cell': len})
# Largest classes first, expressed as a fraction of all cells in `adata`.
frac_by_type = n_cell_by_type.sort_values('cell', ascending=False) / adata.shape[0]

for top in [1, 5, 10]:
    top_frac = frac_by_type['cell'][:top].sum()
    print('Top %-2d %s cell types: %.1f%%' % (top, dname, top_frac * 100))

Trying to set attribute `.obs` of view, copying.


Top 1  TMS FACS cell types: 12.1%
Top 5  TMS FACS cell types: 29.8%
Top 10 TMS FACS cell types: 44.2%
