# Converting read quantifications to AnnData objects

In [1]:
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
import gzip
import re

def read_feature_counts(
    filename, 
    clean_regex = None,
    rename_columns = None
):
    """Read a gzipped, tab-separated featureCounts table into a DataFrame.

    The first line of the file is discarded before parsing, and the
    'Chr', 'Start', 'End' and 'Strand' columns are removed from the result.

    Parameters
    ----------
    filename
        Path to the gzip-compressed table.
    clean_regex
        Optional regular expression applied to every column name. When it
        matches, the column is renamed to the first match; names without a
        match are kept unchanged.
    rename_columns
        Optional mapping passed to ``DataFrame.rename(columns=...)``. It is
        applied after ``clean_regex``.

    Returns
    -------
    pandas.DataFrame
        The parsed table without the four annotation columns listed above.
    """
    with gzip.open(filename) as handle:
        # skip the leading comment line so the header row is parsed as the header
        handle.readline()
        counts = pd.read_csv(handle, sep = '\t')

    if clean_regex:
        cleaned_names = []
        for name in counts.columns:
            hits = re.findall(clean_regex, name)
            cleaned_names.append(hits[0] if hits else name)
        counts.columns = cleaned_names

    if rename_columns:
        counts = counts.rename(columns = rename_columns)

    # the exclusion is evaluated on the final (cleaned and renamed) column names
    is_annotation = counts.columns.isin(['Chr', 'Start', 'End', 'Strand'])
    return counts.loc[:, ~is_annotation]

## Gathering metadata

In [2]:
# Load the sample sheet, keep only the annotation columns used downstream
# and index the table by sample_id.
metadata = (
    pd.read_csv('../meta/Samples_info_scRNASeq_MADMcells_Knaus_Novarino.csv')
    .loc[
        :,
        [
            'sample_id',
            'sample_description',
            'sample_genotype',
            'sample_treatment',
            'sample_sex',
            'sample_litter',
            'sample_mouseID'
        ]
    ]
    .set_index('sample_id')
)
metadata

Unnamed: 0_level_0,sample_description,sample_genotype,sample_treatment,sample_sex,sample_litter,sample_mouseID
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
162194,plate 1 control (200 unlabeled cells),mix,prep 1,m,G1/55 A,GNF1/480
162195,1 b1,wt,prep 1,m,G1/55 A,GNF1/480
162196,1 c1,wt,prep 1,m,G1/55 A,GNF1/480
162197,1 d1,wt,prep 1,m,G1/55 A,GNF1/480
162198,1 e1,wt,prep 1,m,G1/55 A,GNF1/480
...,...,...,...,...,...,...
162567,4 b12,empty,prep 2,,,
162568,4 c12,empty,prep 2,,,
162569,4 d12,empty,prep 2,,,
162570,4 e12,empty,prep 2,,,


## Converting FeatureCounts quantification

In [3]:
# Read the featureCounts table: column names are reduced to their six-digit
# match, 'Geneid' becomes 'gene_name', and the table is indexed by gene name.
sctab = read_feature_counts(
    '../raw/scRNA_featureCounts.tsv.gz',
    clean_regex = '[0-9]{6}',
    rename_columns = {'Geneid': 'gene_name'}
).set_index('gene_name')
sctab

Unnamed: 0_level_0,Length,162194,162195,162196,162197,162198,162199,162200,162201,162202,...,162555,162556,162557,162559,162560,162561,162562,162563,162564,162565
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4933401J01Rik,1070,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm26206,110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Xkr4,6094,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm18956,480,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm37180,2819,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd6,519,193,29,8,34,61,79,159,9,9,...,66,37,43,53,69,137,97,128,290,75
mt-Te,69,1,1,0,0,2,4,0,0,0,...,2,0,1,0,0,0,3,0,1,0
mt-Cytb,1144,3944,870,440,511,629,1047,2217,319,729,...,3684,1684,1203,1517,4216,3993,4811,5539,6595,2376
mt-Tt,67,1,0,0,0,0,1,0,0,0,...,0,0,0,0,3,5,0,5,0,1


In [4]:
# Build the featureCounts AnnData object (samples x genes) and write it to disk.
# Sample columns are the ones whose name starts with '1'; 'Length' is kept as
# the per-gene annotation.
fc_sample_mask = sctab.columns.str.startswith('1')
fc_sample_ids = sctab.columns[fc_sample_mask]

# transpose so that rows are samples and columns are genes
data = csr_matrix(sctab.loc[:, fc_sample_mask].values.T)
# metadata is indexed by integer sample_id, the column names are strings
obs = metadata.loc[[int(sample_id) for sample_id in fc_sample_ids], :]
var = sctab.loc[:, ['Length']]

adata = ad.AnnData(data, obs = obs, var = var)
adata.write_h5ad('../processed/scRNAseq_featureCounts.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_genotype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_treatment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_litter' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_mouseID' as categorical


## Converting Salmon quantification
After rerunning the nf-core rnaseq pipeline, the Salmon quantification still had RefSeq IDs as gene names. For the second run we provided the Gencode assembly and annotation to the pipeline, but this did not change the Salmon output, which indicates that there might be some built-in Salmon index or annotation. Thus this quantification should be done manually (see the featureCounts quantification elsewhere in this notebook).

In [5]:
# copying salmon quantifications to raw directory
# !cp ../results/star_salmon/salmon.merged.gene_counts.tsv ../raw/scRNA_salmon_quant_counts.tsv
# !gzip ../raw/scRNA_salmon_quant_counts.tsv
# !cp ../results/star_salmon/salmon.merged.gene_tpm.tsv ../raw/scRNA_salmon_quant_tpm.tsv
# !gzip ../raw/scRNA_salmon_quant_tpm.tsv

In [6]:
# getting gene length from featureCounts
gene_length = sctab.loc[:, ['Length']].copy()
gene_length['gene_name'] = gene_length.index.to_list()
gene_length.reset_index(
    inplace = True,
    drop = True
)
gene_length

Unnamed: 0,Length,gene_name
0,1070,4933401J01Rik
1,110,Gm26206
2,6094,Xkr4
3,480,Gm18956
4,2819,Gm37180
...,...,...
54451,519,mt-Nd6
54452,69,mt-Te
54453,1144,mt-Cytb
54454,67,mt-Tt


In [7]:
# Load the Salmon gene-level tables (raw counts and TPM), attach the gene
# lengths, collapse rows that share a gene name and separate the bulk RNA
# samples from the single-cell samples.
sctab_dict = {}
for key in ['counts', 'tpm']:
    sctab = pd.read_csv(
        f'../raw/scRNA_salmon_quant_{key}.tsv.gz', 
        sep = '\t',
        compression = 'gzip'
    )
    # default (inner) merge: genes without an entry in gene_length are dropped
    sctab = sctab.merge(
        gene_length,
        on = 'gene_name'
    )

    # sample columns are the ones whose name starts with 'X'; the prefix is
    # stripped so that the columns are named by the bare sample ID
    sample_columns = sctab.columns[sctab.columns.str.startswith('X')].to_list()
    sample_ids = [column[1:] for column in sample_columns]
    sctab = sctab.loc[:, ['gene_id', 'gene_name', 'Length'] + sample_columns]
    sctab.columns = ['gene_id', 'gene_name', 'Length'] + sample_ids

    agg_dict = {
        'Length': 'max',
        # the aggregator receives the group's gene_id values as a Series and
        # must return a scalar: several IDs mapping to the same gene name are
        # joined into one comma-separated string
        'gene_id': lambda ids: ','.join(ids)
    }
    agg_dict.update({sample_id: 'sum' for sample_id in sample_ids})

    sctab = sctab.groupby('gene_name') \
        .agg(agg_dict) \
        .reset_index()

    # splitting dataset because it also contains bulk RNA samples that were processed together with the scRNA
    # (everything that does not start with '1', i.e. annotation + non-single-cell columns, goes to the bulk file)
    sctab.loc[:, ~sctab.columns.str.startswith('1')].to_csv(
        f'../raw/RNA_salmon_quant_{key}.tsv.gz',
        sep = '\t',
        index = False,
        compression = 'gzip'
    )
    sctab = sctab.set_index('gene_name')
    # drop the columns starting with '8' (presumably the bulk samples - confirm
    # against the sample sheet) and keep annotation + single-cell columns
    sctab_dict[key] = sctab.loc[:, ~sctab.columns.str.startswith('8')]

sctab_dict

{'counts':                Length                gene_id  162194  162195  162196  162197  \
 gene_name                                                                      
 0610005C13Rik    3583   ENSMUSG00000109644.2   0.000     0.0     0.0     0.0   
 0610006L08Rik    2128   ENSMUSG00000108652.2   0.000     0.0     0.0     0.0   
 0610009B22Rik     998  ENSMUSG00000007777.10  20.581     0.0     0.0     0.0   
 0610009E02Rik    1803   ENSMUSG00000086714.2  25.469     0.0     0.0     0.0   
 0610009L18Rik     619   ENSMUSG00000043644.5   7.000     0.0    19.0     0.0   
 ...               ...                    ...     ...     ...     ...     ...   
 n-R5s88           118   ENSMUSG00000065107.4   0.000     0.0     0.0     0.0   
 n-R5s92           119   ENSMUSG00000119822.1   0.000     0.0     0.0     0.0   
 n-R5s93           119   ENSMUSG00000119639.1   0.000     0.0     0.0     0.0   
 snoZ159            87   ENSMUSG00002075734.1   0.000     0.0     0.0     0.0   
 snoZ196          

In [9]:
# Build the Salmon AnnData object (samples x genes): TPM values in X, raw
# counts in the 'counts' layer.
tpm_tab = sctab_dict['tpm']
counts_tab = sctab_dict['counts']
# single-cell sample columns are the ones whose name starts with '1'
sc_samples = tpm_tab.columns[tpm_tab.columns.str.startswith('1')]

data = csr_matrix(tpm_tab.loc[:, sc_samples].values.T)
# select and order the annotation rows by the matrix's sample columns so that
# every row of X is paired with its own sample (metadata is indexed by
# integer sample_id, the column names are strings)
obs = metadata.loc[[int(sample_id) for sample_id in sc_samples], :]
# gene annotation is taken from the same table as X instead of relying on the
# loop variable left over from the loading cell
var = (
    tpm_tab.loc[:, ['Length', 'gene_id']]
    .rename(columns = {'gene_id': 'gencode_id'})
    .astype({'gencode_id': 'string'})
)
adata = ad.AnnData(
    data,
    obs = obs,
    var = var
)
# index the counts by the TPM genes and samples so the layer lines up with X
adata.layers['counts'] = csr_matrix(counts_tab.loc[tpm_tab.index, sc_samples].values.T)
adata



AnnData object with n_obs × n_vars = 376 × 54456
    obs: 'sample_description', 'sample_genotype', 'sample_treatment', 'sample_sex', 'sample_litter', 'sample_mouseID'
    var: 'Length', 'gencode_id'
    layers: 'counts'

In [10]:
# Persist the Salmon-based AnnData object.
adata.write_h5ad('../processed/scRNAseq_salmon.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_genotype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_treatment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_litter' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_mouseID' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'gencode_id' as categorical
