# Converting read quantifications to AnnData objects

In [21]:
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
import gzip
import re

def read_feature_counts(
    filename, 
    clean_regex = None,
    rename_columns = None
):
    """
    Load a gzip-compressed, tab-separated count table into a DataFrame.

    If the first line of the file starts with '#', it is treated as a
    comment and skipped; otherwise the first line is used as the header.

    Parameters
    ----------
    filename : str or path-like
        Path to the gzip-compressed, tab-separated table.
    clean_regex : str, optional
        Pattern applied to every column name with ``re.findall``. A column
        whose name matches is renamed to the first match; columns without
        a match keep their name.
    rename_columns : dict, optional
        Mapping passed to ``DataFrame.rename(columns=...)``. It is applied
        after the regex cleaning.

    Returns
    -------
    pandas.DataFrame
        The table without the columns 'Chr', 'Start', 'End' and 'Strand'
        (matched against the final column names).
    """
    with gzip.open(filename) as file:
        # Only discard the first line when it really is a comment. Rewinding
        # otherwise keeps the header of files that have no comment line,
        # instead of silently promoting the first data row to header.
        first_line = file.readline()
        if not first_line.startswith(b'#'):
            file.seek(0)

        tab = pd.read_csv(
            file,
            sep = '\t'
        )

    exclude_cols = ['Chr', 'Start', 'End', 'Strand']
    if clean_regex:
        new_columns = []
        for col in tab.columns:
            matches = re.findall(clean_regex, col)
            # keep the original name when the pattern does not match
            new_columns.append(matches[0] if matches else col)

        tab.columns = new_columns

    if rename_columns:
        tab = tab.rename(columns = rename_columns)

    return tab.loc[:, ~tab.columns.isin(exclude_cols)]

## Gathering metadata

In [14]:
# Read the GTF (tab-separated, no header row, '#' lines are comments) and
# label its nine columns directly while parsing.
gencode = pd.read_csv(
    '../data/gencode.vM28.annotation.gtf.gz',
    sep = '\t',
    comment = '#',
    header = None,
    names = [
        'chr', 'anno', 'type',
        'start', 'end', 'x',
        'strand', 'y', 'description'
    ]
)
gencode

Unnamed: 0,chr,anno,type,start,end,x,strand,y,description
0,chr1,HAVANA,gene,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; gene_type ""TEC..."
1,chr1,HAVANA,transcript,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; transcript_id ..."
2,chr1,HAVANA,exon,3143476,3144545,.,+,.,"gene_id ""ENSMUSG00000102693.2""; transcript_id ..."
3,chr1,ENSEMBL,gene,3172239,3172348,.,+,.,"gene_id ""ENSMUSG00000064842.3""; gene_type ""snR..."
4,chr1,ENSEMBL,transcript,3172239,3172348,.,+,.,"gene_id ""ENSMUSG00000064842.3""; transcript_id ..."
...,...,...,...,...,...,...,...,...,...
1869204,chrM,ENSEMBL,transcript,15289,15355,.,+,.,"gene_id ""ENSMUSG00000064371.1""; transcript_id ..."
1869205,chrM,ENSEMBL,exon,15289,15355,.,+,.,"gene_id ""ENSMUSG00000064371.1""; transcript_id ..."
1869206,chrM,ENSEMBL,gene,15356,15422,.,-,.,"gene_id ""ENSMUSG00000064372.1""; gene_type ""Mt_..."
1869207,chrM,ENSEMBL,transcript,15356,15422,.,-,.,"gene_id ""ENSMUSG00000064372.1""; transcript_id ..."


In [3]:
# Per-sample annotation: keep the descriptive columns and index by sample id.
metadata = (
    pd.read_csv('../meta/Samples_info_scRNASeq_MADMcells_Knaus_Novarino.csv')
    .loc[
        :,
        [
            'sample_id',
            'sample_description',
            'sample_genotype',
            'sample_treatment',
            'sample_sex',
            'sample_litter',
            'sample_mouseID'
        ]
    ]
    .set_index('sample_id')
)
metadata

Unnamed: 0_level_0,sample_description,sample_genotype,sample_treatment,sample_sex,sample_litter,sample_mouseID
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
162194,plate 1 control (200 unlabeled cells),mix,prep 1,m,G1/55 A,GNF1/480
162195,1 b1,wt,prep 1,m,G1/55 A,GNF1/480
162196,1 c1,wt,prep 1,m,G1/55 A,GNF1/480
162197,1 d1,wt,prep 1,m,G1/55 A,GNF1/480
162198,1 e1,wt,prep 1,m,G1/55 A,GNF1/480
...,...,...,...,...,...,...
162567,4 b12,empty,prep 2,,,
162568,4 c12,empty,prep 2,,,
162569,4 d12,empty,prep 2,,,
162570,4 e12,empty,prep 2,,,


## Converting Salmon quantification

In [20]:
# Take the attribute column (index 8) of the GTF, split it on ';' and keep
# the first four fields.
gtf = (
    pd.read_csv(
        '/resources/references/igenomes/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf',
        sep = '\t',
        header = None
    )
    .loc[:, 8]
    .str.split(';', expand = True)
    .loc[:, range(4)]
)
gtf.columns = ['gene_id', 'gene_name', 'pid', 'Name']

# Reduce every field to the text between its first pair of double quotes.
for column in gtf.columns:
    gtf[column] = gtf[column].map(lambda field: field.split('"')[1])

# Keep 'Name' where it starts with 'N'; otherwise fall back to 'pid'.
gtf['Name'] = gtf['Name'].where(
    gtf['Name'].str.startswith('N'),
    gtf['pid']
)

# One row per (gene_name, Name) pair; the remaining columns become counts of
# distinct values within each pair.
gtf = (
    gtf
    .groupby(['gene_name', 'Name'])
    .nunique()
    .reset_index()
)
gtf

Unnamed: 0,gene_name,Name,gene_id,pid
0,0610005C13Rik,NR_038165,1,1
1,0610005C13Rik,NR_038166,1,1
2,0610007P14Rik,NM_021446,1,1
3,0610009B22Rik,NM_025319,1,1
4,0610009L18Rik,NR_038126,1,1
...,...,...,...,...
35114,l7Rn6,NM_001291287,1,1
35115,l7Rn6,NM_001291288,1,1
35116,l7Rn6,NM_001291289,1,1
35117,l7Rn6,NM_026304,1,1


In [4]:
# Attach gene names to the Salmon table; rows whose 'Name' is not in `gtf`
# are kept by the left join.
sctab = pd.read_csv(
    '../raw/scRNA_salmon_quant_NumReads.tsv.gz',
    sep = '\t'
).merge(gtf, on = 'Name', how = 'left')

# Per gene: the largest 'Length' and the summed values of every column whose
# name starts with '1'.
agg_dict = {
    'Length': 'max',
    **{
        column: 'sum'
        for column in sctab.columns[sctab.columns.str.startswith('1')]
    }
}

# groupby already leaves 'gene_name' as the index of the aggregated frame
sctab = sctab.groupby('gene_name').agg(agg_dict)
sctab

Unnamed: 0_level_0,Length,162194,162206,162260,162211,162253,162246,162244,162220,162249,...,162561,162569,162570,162566,162567,162568,162564,162565,162571,162562
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610005C13Rik,1122,0.000,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
0610007P14Rik,1185,50.000,0.0,3.0,0.0,37.0,0.0,6.000,0.0,3.0,...,177.0,0.0,0.0,0.0,0.0,0.0,31.000,6.0,0.0,0.0
0610009B22Rik,795,19.606,8.0,1.0,9.0,6.0,0.0,65.878,0.0,0.0,...,57.0,0.0,0.0,0.0,0.0,0.0,5.000,124.0,0.0,119.0
0610009L18Rik,619,7.000,0.0,15.0,0.0,1.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0
0610009O20Rik,2404,41.000,0.0,3.0,11.0,13.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,53.882,92.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zyx,3399,71.000,0.0,23.0,0.0,0.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,39.0,0.0,70.0
Zzef1,11150,53.000,0.0,3.0,3.0,17.0,0.0,0.000,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,6.0,0.0,4.0
Zzz3,7607,39.000,0.0,5.0,39.0,19.0,0.0,0.000,0.0,5.0,...,117.0,0.0,0.0,0.0,0.0,0.0,0.000,16.0,0.0,89.0
a,692,0.000,0.0,0.0,0.0,8.0,0.0,0.000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000,0.0,0.0,0.0


In [5]:
# Sample columns are the ones whose name starts with '1'.
is_sample = sctab.columns.str.startswith('1')

# sctab has one row per gene and one column per sample; transpose so that
# each matrix row corresponds to one sample column.
data = csr_matrix(sctab.loc[:, is_sample].values.T)

# AnnData pairs `obs` rows with matrix rows by position, so the metadata must
# be put in the same order as the sample columns. The column names are
# strings while the metadata index holds integer sample ids, hence int().
# reindex (rather than .loc) leaves an all-NaN row for a sample that has no
# metadata entry instead of raising.
obs = metadata.reindex(
    [int(sample_id) for sample_id in sctab.columns[is_sample]]
)
var = sctab.loc[:, ['Length']]
adata = ad.AnnData(
    data,
    obs = obs,
    var = var
)
adata.write_h5ad(
    '../processed/scRNAseq_salmon_NumReads.h5ad'
)

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_genotype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_treatment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_litter' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_mouseID' as categorical


## Converting featureCounts quantification

In [25]:
# Load the featureCounts table: column names containing a six-digit run are
# reduced to that run, 'Geneid' becomes 'gene_name' and is used as the index.
sctab = read_feature_counts(
    '../raw/scRNA_featureCounts.tsv.gz',
    clean_regex = '[0-9]{6}',
    rename_columns = {'Geneid': 'gene_name'}
).set_index('gene_name')
sctab

Unnamed: 0_level_0,Length,162194,162195,162196,162197,162198,162199,162200,162201,162202,...,162555,162556,162557,162559,162560,162561,162562,162563,162564,162565
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4933401J01Rik,1070,0,0,0,0,0,0,0,0,21,...,0,0,0,0,0,0,0,0,0,0
Gm26206,110,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Xkr4,6094,0,0,0,0,0,0,7,0,0,...,20,0,0,0,0,0,0,0,0,0
Gm18956,480,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm37180,2819,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mt-Nd6,519,193,29,8,34,61,79,159,9,9,...,66,37,43,53,69,137,97,128,290,75
mt-Te,69,1,1,0,0,2,4,0,0,0,...,2,0,1,0,0,0,3,0,1,0
mt-Cytb,1144,3944,870,440,511,629,1047,2217,319,729,...,3684,1684,1203,1517,4216,3993,4811,5539,6595,2376
mt-Tt,67,1,0,0,0,0,1,0,0,0,...,0,0,0,0,3,5,0,5,0,1


In [27]:
# Sample columns are the ones whose name starts with '1'.
is_sample = sctab.columns.str.startswith('1')
sample_ids = [int(name) for name in sctab.columns[is_sample]]

# Transpose so that each matrix row corresponds to one sample column.
data = csr_matrix(sctab.loc[:, is_sample].to_numpy().T)
# Metadata rows in the same order as the sample columns.
obs = metadata.loc[sample_ids, :]
var = sctab.loc[:, ['Length']]

adata = ad.AnnData(data, obs = obs, var = var)
adata.write_h5ad('../processed/scRNAseq_featureCounts_NumReads.h5ad')

  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_genotype' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_treatment' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_sex' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_litter' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'sample_mouseID' as categorical
