In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse
import gc

In [2]:
merfish_genes = pd.read_csv("../../data/merfish/BAYSOR/seurat_objects/ns-atlas.merfish_baysor.scvi_integrated.gene_data.csv", index_col=0)

For imputation, the scRNAseq object as-is is too large to use as a reference. To ease memory constraints and maximize biologically informative predictions, we need to downsample both cells and genes in our reference.

In [3]:
scrna = sc.read_h5ad("../../data/scrna/seurat_objects/normal_skin.scrna.harmony.integrated.reclustered.annotated.filtered.anndata.h5ad")

In [4]:
del scrna.layers['log1p']

In [5]:
scrna.layers['counts'] = sparse.csr_matrix(scrna.layers['counts']).copy()
scrna.X = scrna.layers['counts'].copy()

In [6]:
gc.collect()

31

In the scRNA-seq, we only care about predicting genes that are biologically informative to signaling or pathways. so we can remove things like mt genes, ribo genes, lincRNAs etc. We can also filter more stringently for genes with higher counts.

In [7]:
len(scrna.var)

41138

In [8]:
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
scrna.var["mt"] = scrna.var_names.str.startswith("MT-")
# ribosomal genes
scrna.var["ribo"] = scrna.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
scrna.var["hb"] = scrna.var_names.str.contains("^HB[^(P)]")

In [9]:
sc.pp.calculate_qc_metrics(scrna, qc_vars=["mt", "ribo", "hb"], percent_top=None, log1p=False, inplace=True)

In [10]:
annot = sc.queries.biomart_annotations('hsapiens', ['external_gene_name', 'gene_biotype']).set_index('external_gene_name')

In [11]:
var = scrna.var.copy()
var = var.merge(annot, left_index=True, right_index=True)
var.head()

Unnamed: 0,mt,ribo,hb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,gene_biotype
FAM87B,False,False,False,530,0.010367,99.805987,2832.0,lncRNA
LINC00115,False,False,False,5120,0.072462,98.125764,19795.0,lncRNA
FAM41C,False,False,False,5245,0.020379,98.080006,5567.0,lncRNA
SAMD11,False,False,False,1050,0.007695,99.615635,2102.0,protein_coding
NOC2L,False,False,False,69217,0.861314,74.662308,235292.0,protein_coding


In [12]:
var = var[~var['mt']&~var['ribo']&~var['hb']].copy()
var = var[var['n_cells_by_counts'] > 10]
var = var[var['total_counts'] > 100]

In [13]:
all_genes = np.unique(np.concatenate([var.index.values, merfish_genes.index.values]))

In [14]:
scrna = scrna[:, all_genes].copy()

In [15]:
sc.pp.calculate_qc_metrics(scrna, percent_top=None, log1p=False, inplace=True)

In [16]:
scrna.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pct.mito', 'pct.ribo',
       'pct.hemo', 'study_id', 'sample_barcode', 'donor_id', 'donor_sex',
       'donor_age', 'anatomic_site', 'reported.cell_type',
       'anatomic_site.detailed', 'harmony.snn_res.0.2', 'harmony.snn_res.0.4',
       'harmony.snn_res.0.5', 'harmony.snn_res.0.6', 'harmony.snn_res.0.8',
       'harmony.snn_res.1', 'harmony.snn_res.1.2', 'harmony.snn_res.1.5',
       'harmony.snn_res.2', 'harmony.snn_res.2.5', 'seurat_clusters',
       'cell_barcode', 'cell_type.broad', 'cell_category', 'cell_type',
       'cell_type.broad.res_0.2', 'cell_type.detailed',
       'cell_type.reclustered', 'n_genes_by_counts', 'total_counts',
       'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb'],
      dtype='object')

In [17]:
sc.pp.normalize_total(scrna)
scrna.layers['norm'] = scrna.X.copy()
sc.pp.log1p(scrna)
scrna.layers['log1p'] = scrna.X.copy()

In [18]:
sc.pp.highly_variable_genes(scrna, layer='log1p', subset=False, batch_key='study_id', n_top_genes=len(scrna.var_names))

In [19]:
scrna.var['highly_variable_nbatches'].value_counts().sort_index()

highly_variable_nbatches
1       815
2       624
3       306
4       239
5       715
6       270
7       320
8       378
9       485
10      310
11      415
12      635
13     1381
14    12285
Name: count, dtype: int64

Keep genes that are variable in all datasets that were added to the integration. However ensure that the merfish panel genes are all in there.

In [20]:
scrna.var.loc[merfish_genes.index.values, 'highly_variable_nbatches'].value_counts().sort_index()

highly_variable_nbatches
1       1
2       4
4       1
5       4
6       5
7       7
8       9
9       8
10      7
11     15
12     26
13     66
14    409
Name: count, dtype: int64

In [21]:
genes2keep = scrna.var_names[[x > 10 for x in scrna.var['highly_variable_nbatches'].values]]
len(genes2keep)

14716

In [22]:
genes2keep = np.unique(np.concatenate([genes2keep, merfish_genes.index.values]))
len(genes2keep)

14762

In [23]:
scrna = scrna[:, genes2keep].copy()

In [24]:
scrna.X = scrna.layers['counts'].copy()
sc.pp.filter_cells(scrna, min_counts=10, inplace=True)
sc.pp.normalize_total(scrna)
scrna.layers['norm'] = scrna.X.copy()
sc.pp.log1p(scrna)
scrna.layers['log1p'] = scrna.X.copy()



In [25]:
scrna.write_h5ad("../../data/scrna/seurat_objects/normal_skin.scrna.harmony.integrated.reclustered.annotated.genes_filtered.tangram_input.h5ad", compression='gzip')

In [2]:
scrna = sc.read_h5ad("../../data/scrna/seurat_objects/normal_skin.scrna.harmony.integrated.reclustered.annotated.genes_filtered.tangram_input.h5ad")

In [26]:
scrna.var.to_csv("../../data/scrna/seurat_objects/normal_skin.scrna.harmony.integrated.reclustered.annotated.genes_filtered.gene_metadata.csv.gz")

In [27]:
scrna.obs.to_csv("../../data/scrna/seurat_objects/normal_skin.scrna.harmony.integrated.reclustered.annotated.genes_filtered.cell_metadata.csv.gz")

In [28]:
scrna

AnnData object with n_obs × n_vars = 273178 × 14762
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pct.mito', 'pct.ribo', 'pct.hemo', 'study_id', 'sample_barcode', 'donor_id', 'donor_sex', 'donor_age', 'anatomic_site', 'reported.cell_type', 'anatomic_site.detailed', 'harmony.snn_res.0.2', 'harmony.snn_res.0.4', 'harmony.snn_res.0.5', 'harmony.snn_res.0.6', 'harmony.snn_res.0.8', 'harmony.snn_res.1', 'harmony.snn_res.1.2', 'harmony.snn_res.1.5', 'harmony.snn_res.2', 'harmony.snn_res.2.5', 'seurat_clusters', 'cell_barcode', 'cell_type.broad', 'cell_category', 'cell_type', 'cell_type.broad.res_0.2', 'cell_type.detailed', 'cell_type.reclustered', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'pct_counts_hb', 'n_counts'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_varia