# Download

The raw Tabula Muris datasets can be downloaded from the following figshare project:

https://figshare.com/projects/Tabula_Muris_Transcriptomic_characterization_of_20_organs_and_tissues_from_Mus_musculus_at_single_cell_resolution/27733 

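- **10X**: single-cell RNA-seq data from microfluidic emulsion (v2). The preprocessing code below reads it from `../datasets/raw/Tabula_Muris/droplet/`.

- **SS2**: single-cell RNA-seq data from Smart-seq2 sequencing of FACS-sorted cells (v2). The preprocessing code below reads it from `../datasets/raw/Tabula_Muris/FACS/`.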

# Preprocess

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc

In [2]:
# Directory that receives the preprocessed .h5ad files.
data_path = '../datasets/preprocessed/'

# exist_ok avoids the check-then-create race and is a no-op when the
# directory is already present.
os.makedirs(data_path, exist_ok = True)

# Locations of the raw count matrices for the two protocols.
path_SS2 = '../datasets/raw/Tabula_Muris/FACS/'
path_10X = '../datasets/raw/Tabula_Muris/droplet/'

# Per-cell annotation tables. keep_default_na = False keeps empty fields as
# empty strings instead of NaN, so unannotated cells can be filtered later
# with a plain comparison against ''.
meta_SS2 = pd.read_csv('../datasets/raw/Tabula_Muris/annotations_facs.csv',
                       keep_default_na = False)
meta_10X = pd.read_csv('../datasets/raw/Tabula_Muris/annotations_droplet.csv',
                       keep_default_na = False)

# SS2
# Names of the cells that carry a cell-type annotation. The filter does not
# depend on the tissue, so it is built once instead of once per iteration.
annotated_SS2 = meta_SS2[(meta_SS2.cell_ontology_class != 0) &
                         (meta_SS2.cell_ontology_class != '')].cell

# Accumulates all tissues; None until the first tissue has been read.
adata_SS2_all = None

for tissue in ['Aorta', 'Bladder', 'Brain_Myeloid', 'Brain_Non-Myeloid', 'Diaphragm', 'Fat', 'Heart',
               'Kidney', 'Large_Intestine', 'Limb_Muscle', 'Liver', 'Lung', 'Mammary_Gland', 'Marrow',
               'Pancreas', 'Skin', 'Spleen', 'Thymus', 'Tongue', 'Trachea']:

    # Read SS2 cell-by-gene counts
    adata_SS2 = sc.read_csv(os.path.join(path_SS2, "%s-counts.csv" % tissue)).transpose()

    # Boolean masks: genes whose name starts with 'ERCC' (presumably spike-in
    # controls) are dropped, and only annotated cells are kept.
    ERCC_idx = adata_SS2.var.index.str.startswith('ERCC')
    cell_idx = adata_SS2.obs.index.isin(annotated_SS2)

    # `~` is the boolean negation; unary minus on a boolean mask is rejected
    # by numpy and by current pandas.
    adata_SS2 = adata_SS2[cell_idx, ~ERCC_idx]

    if adata_SS2_all is None:

        adata_SS2_all = adata_SS2.copy()

    else:

        # Keep only the genes shared with everything read so far.
        # Index.intersection replaces the deprecated `Index & Index` set
        # operation.
        genes = adata_SS2_all.var.index.intersection(adata_SS2.var.index)
        # NOTE(review): AnnData.concatenate is marked deprecated in recent
        # anndata releases in favour of anndata.concat; kept here so the
        # resulting object is unchanged - confirm before migrating.
        adata_SS2_all = adata_SS2_all[:, genes].concatenate(adata_SS2[:, genes], index_unique = None)

# 10X
# Names of the cells that carry a cell-type annotation. The filter does not
# depend on the tissue or channel, so it is built once.
annotated_10X = meta_10X[(meta_10X.cell_ontology_class != 0) &
                         (meta_10X.cell_ontology_class != '')].cell


def _read_10X_channel(tissue, channel):
    """Load one 10X channel and return only its annotated cells.

    Barcodes are rewritten as ``<channel>_<barcode>`` with the last two
    characters removed (presumably the ``-1`` suffix - TODO confirm) so that
    they can be matched against the ``cell`` column of ``meta_10X``.
    """
    adata = sc.read_10x_mtx(os.path.join(path_10X, '%s-%s' % (tissue, channel)),
                            var_names = 'gene_symbols', cache = False)
    adata.obs.index = (channel + "_" + adata.obs.index).str[:-2]

    return adata[adata.obs.index.isin(annotated_10X), :]


# Accumulates all tissues; None until the first tissue has been read.
adata_10X_all = None

for tissue in ['Bladder', 'Heart_and_Aorta', 'Kidney', 'Limb_Muscle', 'Liver', 'Lung', 'Mammary_Gland',
               'Marrow', 'Spleen', 'Thymus', 'Tongue', 'Trachea']:

    # Read 10X cell-by-gene counts, one directory per channel
    channels = sorted(set(meta_10X[meta_10X.tissue == tissue].channel))

    if not channels:

        # Without this check the previous tissue's data would silently be
        # appended a second time.
        raise ValueError("No 10X channels listed in the annotations for tissue %r" % tissue)

    adata_10X = None

    for channel in channels:

        tmp = _read_10X_channel(tissue, channel)

        if adata_10X is None:

            adata_10X = tmp

        else:

            adata_10X = adata_10X.concatenate(tmp, index_unique=None)

    if adata_10X_all is None:

        adata_10X_all = adata_10X.copy()

    else:

        # Keep only the genes shared with everything read so far.
        # Index.intersection replaces the deprecated `Index & Index` set
        # operation.
        genes = adata_10X_all.var.index.intersection(adata_10X.var.index)
        # NOTE(review): AnnData.concatenate is marked deprecated in recent
        # anndata releases in favour of anndata.concat; kept here so the
        # resulting object is unchanged - confirm before migrating.
        adata_10X_all = adata_10X_all[:, genes].concatenate(adata_10X[:, genes], index_unique = None)

def _attach_annotations(adata, meta, batch):
    """Replace ``adata.obs`` with the tissue / celltype / batch table for its cells.

    The annotation rows are restricted to the cells present in ``adata``,
    indexed by cell name, tagged with ``batch`` and reordered to follow
    ``adata.obs``. The reduced table is returned.
    """
    present = meta.cell.isin(adata.obs.index)
    table = meta[present][['cell', 'tissue', 'cell_ontology_class']].set_index('cell')
    table['batch'] = batch
    table = table.rename(columns = {'cell_ontology_class': 'celltype'})
    adata.obs = table.loc[adata.obs.index]

    return table


meta_SS2 = _attach_annotations(adata_SS2_all, meta_SS2, 'SS2')
meta_10X = _attach_annotations(adata_10X_all, meta_10X, '10X')

# Write each protocol on its own first, then the two stacked together.
for adata, file_name in ((adata_SS2_all, 'TM_SS2.h5ad'),
                         (adata_10X_all, 'TM_10X.h5ad')):

    adata.write(filename = os.path.join(data_path, file_name))

adata_full = sc.concat((adata_SS2_all, adata_10X_all))
adata_full.write(filename = os.path.join(data_path, 'TM_full.h5ad'))

  genes = adata_SS2_all.var.index & adata_SS2.var.index
  genes = adata_10X_all.var.index & adata_10X.var.index
... storing 'tissue' as categorical
... storing 'celltype' as categorical
... storing 'batch' as categorical
... storing 'tissue' as categorical
... storing 'celltype' as categorical
... storing 'batch' as categorical
... storing 'tissue' as categorical
... storing 'celltype' as categorical
... storing 'batch' as categorical


In [None]:
# Run the following code in R to convert the h5ad file to h5seurat, the format that Seurat can load.
# First make sure the R working directory is the one containing TM_full.h5ad (../datasets/preprocessed/).
# library(SeuratDisk)
# Convert('TM_full.h5ad', 'h5seurat', overwrite = TRUE)