# Data preparation for [Compass](https://github.com/YosefLab/Compass)

In [48]:
import scanpy as sc
import anndata as ad
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import mmread

def data2h5ad(prefix):
    """Assemble an AnnData object from three files sharing a path prefix.

    Parameters
    ----------
    prefix : str
        Common path prefix. The files read are ``prefix + '.mtx'``
        (Matrix Market matrix), ``prefix + '.metadata.tsv'`` (tab-separated
        table used as ``obs``) and ``prefix + '.rownames.txt'`` (one name per
        line, used as the ``var`` index).

    Returns
    -------
    anndata.AnnData
        The matrix is transposed before being stored, so rows of the ``.mtx``
        file become variables and its columns become observations.
    """
    matrix = mmread(prefix + '.mtx').tocsr()
    metadata = pd.read_csv(prefix + '.metadata.tsv', sep = '\t')

    # One row name per line; only trailing whitespace (incl. newline) is removed.
    with open(prefix + '.rownames.txt', 'r') as handle:
        rownames = [line.rstrip() for line in handle]

    features = pd.DataFrame(index = rownames)

    return ad.AnnData(
        X = matrix.T,
        obs = metadata,
        var = features
    )

def splith5ad(adataobj, split_on):
    """Split an AnnData-like object into one copy per value of an obs column.

    Parameters
    ----------
    adataobj
        Object exposing ``.obs`` (a DataFrame) and supporting
        ``adataobj[row_mask, :]`` indexing followed by ``.copy()``.
    split_on : str
        Name of the ``obs`` column whose unique values define the subsets.

    Returns
    -------
    dict
        Maps each unique value of ``obs[split_on]`` (in order of first
        appearance) to a copy of the rows carrying that value.
    """
    labels = adataobj.obs[split_on]
    parts = {}
    for subset in labels.unique():
        parts[subset] = adataobj[labels == subset, :].copy()
    return parts

In [63]:
# Build the AnnData object from the '.mtx', '.metadata.tsv' and '.rownames.txt'
# files that share the '../processed/novarino_scRNA' prefix, then display it.
adata = data2h5ad('../processed/novarino_scRNA')
adata

AnnData object with n_obs × n_vars = 203 × 26249
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seq_folder', 'nUMI', 'nGene', 'log10GenesPerUMI', 'mitoRatio', 'cells', 'sample', 'sample_mouseID', 'sample_litter', 'sample_treatment', 'sample_sex', 'RNA_snn_res.0.9', 'seurat_clusters'

In [64]:
# Attach a per-gene 'Length' column to adata.var, taken from the var table of
# the featureCounts AnnData file.
tmp = sc.read_h5ad('../processed/scRNAseq_featureCounts_NumReads.h5ad')
genes = tmp.var.copy()
# Look the lengths up by gene name rather than merging the two var tables:
# a merge emits 'Length_x'/'Length_y' (so `.Length` raises) as soon as
# adata.var already carries a 'Length' column, i.e. whenever this cell is
# re-run. Genes absent from the featureCounts table get NaN.
adata.var['Length'] = tmp.var['Length'].reindex(adata.var.index)

In [65]:
# Keep only the genes (columns) that have a non-missing 'Length' annotation.
adata = adata[:, adata.var['Length'].notna()].copy()

In [66]:
# Map each Seurat cluster id (0-3) to a cell-type label. The labels are
# reused verbatim as dictionary keys and output file names further down,
# so their spelling is left exactly as is.
annotation = dict(enumerate([
    'ollfactory_cell',
    'oligodendrytes',
    'astrocytes',
    'neurons'
]))
# A cluster id without an entry in `annotation` raises KeyError here.
adata.obs['cell_type'] = adata.obs['seurat_clusters'].apply(
    lambda cluster: annotation[cluster]
)
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,seq_folder,nUMI,nGene,log10GenesPerUMI,mitoRatio,cells,sample,sample_mouseID,sample_litter,sample_treatment,sample_sex,RNA_snn_res.0.9,seurat_clusters,cell_type
wt_162322,wt,62570,2514,SeuratProject,62723,2537,0.709614,0.065861,wt_162322,wt,GNF1/464,G1/59 A,prep 1,f,0,0,ollfactory_cell
wt_162323,wt,99844,3289,SeuratProject,99977,3315,0.704111,0.043490,wt_162323,wt,GNF1/464,G1/59 A,prep 1,f,1,1,oligodendrytes
wt_162324,wt,107544,3215,SeuratProject,107931,3279,0.698518,0.121318,wt_162324,wt,GNF1/464,G1/59 A,prep 1,f,0,0,ollfactory_cell
wt_162325,wt,136321,3141,SeuratProject,136624,3172,0.681787,0.037987,wt_162325,wt,GNF1/464,G1/59 A,prep 1,f,3,3,neurons
wt_162326,wt,109397,3513,SeuratProject,109533,3537,0.704158,0.073549,wt_162326,wt,GNF1/464,G1/59 A,prep 1,f,3,3,neurons
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ko_162553,ko,331737,4999,SeuratProject,332022,5055,0.670822,0.071040,ko_162553,ko,GN21/475,G1/59 B,prep 2,f,3,3,neurons
ko_162554,ko,435996,5670,SeuratProject,436357,5742,0.666519,0.042850,ko_162554,ko,GN21/475,G1/59 B,prep 2,f,3,3,neurons
ko_162555,ko,260654,4243,SeuratProject,260946,4306,0.670920,0.062971,ko_162555,ko,GN21/475,G1/59 B,prep 2,f,0,0,ollfactory_cell
ko_162556,ko,211388,3149,SeuratProject,211590,3177,0.657595,0.034033,ko_162556,ko,GN21/475,G1/59 B,prep 2,f,0,0,ollfactory_cell


In [67]:
# One AnnData copy per (cell type, sample) combination,
# keyed as '<cell_type>_<sample>'.
data = {}
for celltype in annotation.values():
    is_celltype = adata.obs['cell_type'] == celltype
    for sample in ['wt', 'ko']:
        is_sample = adata.obs['sample'] == sample
        data[f'{celltype}_{sample}'] = adata[is_celltype & is_sample, :].copy()
data

{'ollfactory_cell_wt': AnnData object with n_obs × n_vars = 35 × 26160
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seq_folder', 'nUMI', 'nGene', 'log10GenesPerUMI', 'mitoRatio', 'cells', 'sample', 'sample_mouseID', 'sample_litter', 'sample_treatment', 'sample_sex', 'RNA_snn_res.0.9', 'seurat_clusters', 'cell_type'
     var: 'Length',
 'ollfactory_cell_ko': AnnData object with n_obs × n_vars = 20 × 26160
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seq_folder', 'nUMI', 'nGene', 'log10GenesPerUMI', 'mitoRatio', 'cells', 'sample', 'sample_mouseID', 'sample_litter', 'sample_treatment', 'sample_sex', 'RNA_snn_res.0.9', 'seurat_clusters', 'cell_type'
     var: 'Length',
 'oligodendrytes_wt': AnnData object with n_obs × n_vars = 33 × 26160
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seq_folder', 'nUMI', 'nGene', 'log10GenesPerUMI', 'mitoRatio', 'cells', 'sample', 'sample_mouseID', 'sample_litter', 'sample_treatment', 'sample_sex', 'RNA_snn_res.0.9', 'seurat_clu

In [69]:
from scipy.sparse import issparse
def normalize_tpm(adata, scale = 1e4):
    """Length-normalise counts and rescale every cell to a fixed total.

    Each count is divided by its gene length in kilobases
    (``Length / 1000``); every cell (row) is then rescaled so that its
    values sum to ``scale``.

    Parameters
    ----------
    adata
        Object with ``.X`` (cells x genes, scipy sparse matrix or dense
        array) and ``.var.Length`` holding one length per gene.
    scale : float, default 1e4
        Total each cell sums to after normalisation.

    Returns
    -------
    numpy.ndarray
        Dense cells x genes array of normalised values. Cells whose counts
        are all zero come back as all zeros.
    """
    # Accept dense matrices too instead of unconditionally calling .toarray().
    counts = adata.X.toarray() if issparse(adata.X) else adata.X
    # Reads per kilobase: divide every gene column by its length, times 1000.
    rate = counts / adata.var.Length.values[None, :] * 1000
    totals = rate.sum(axis = 1, keepdims = True)
    # An all-zero cell would otherwise produce 0/0 = NaN; dividing by 1 keeps
    # its row at zero.
    totals[totals == 0] = 1
    # Multiply by `scale` (the previous code divided by it, which made each
    # cell sum to 1/scale instead of scale).
    return rate / totals * scale

def write_expression(adataobj, file, layer = None):
    """Write a normalised genes x cells expression table as a TSV file.

    Parameters
    ----------
    adataobj
        AnnData object to export. It is copied first, so the caller's
        object is not modified.
    file
        Destination passed to ``DataFrame.to_csv``.
    layer : str, optional
        If given (and truthy), ``layers[layer]`` is used in place of ``X``.

    Returns
    -------
    None
        The table is written to ``file``, tab-separated, with
        ``var.index`` as rows and ``obs.index`` as columns.
    """
    working = adataobj.copy()
    if layer:
        working.X = working.layers[layer]

    # normalize_tpm returns cells x genes; transpose to genes x cells.
    normalized = normalize_tpm(working)
    table = pd.DataFrame(
        normalized.T,
        index = working.var.index,
        columns = working.obs.index
    )
    table.to_csv(file, sep = '\t')

# Export one normalised expression table per entry in `data`.
for key, subset in data.items():
    write_expression(subset, f'../processed/{key}.expression.tsv')