# Step 1: Aggregate all QC metrics per sample
As the first step in constructing our snRNA-seq maps, we aggregate all non-empty droplets called by CellBender and annotate them with the appropriate QC metrics. This requires three inputs per sample:
1. CellRanger 4.0.0 count output: `<sampleID>.raw_feature_bc_matrix.h5`
2. CellBender remove-background FPR = 0.01 h5: `<sampleID>_out_FPR_0.01_filtered.h5`
3. scR-Invex output, run per sample and collapsed to retain the number of exonic, intronic, and junction reads per barcode: `<sampleID>.scrinvex.collapsed.tsv`

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

In [2]:
# Libraries
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import scanpy as sc
import scrublet as scr
import ndd

In [6]:
# Record the versions of the key packages this notebook was run with
print(f'scanpy: {sc.__version__}')
print(f'pandas: {pd.__version__}')
print(f'numpy: {np.__version__}')
print(f'ndd: {ndd.__version__}')

scanpy: 1.7.2
pandas: 1.2.4
numpy: 1.18.1
ndd: 1.6.3


### Supplemental code to process the raw scR-Invex output
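Note: this was applied to the direct output of scR-Invex, which was run on each CellRanger BAM file separately. It simply collapses the read counts into per-barcode totals of intronic-only (`introns`), exonic-only (`exons`), and overlapping intronic & exonic (`junctions`) reads. The collapsed table is the `<sampleID>.scrinvex.collapsed.tsv` input listed above.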

    # Raw scR-Invex table for one sample (tab-separated)
    df = pd.read_csv('scrinvex_output.tsv', sep='\t')
    # Keep the barcode plus its three read-count columns and group rows by barcode
    grp = df.loc[:, ['barcode', 'introns', 'junctions', 'exons']].groupby('barcode')
    # One row per barcode, each read-count column summed over that barcode's rows
    out = grp.sum()

### Set paths for aggregation

In [4]:
# Input/output directories used by the cells below.
# All paths are relative, so they resolve against the notebook's working directory.

# Path to CellRanger 4.0.0 output files (raw, i.e. all barcodes)
# Files should be saved as <sampleID>.raw_feature_bc_matrix.h5
path_cellranger = '../../data/CellRanger4.0.0'

# Path to CellBender v2.1-alpha remove-background output (FPR = 0.01, filtered)
# Docker available at: us.gcr.io/broad-dsde-methods/cellbender:v2.1-alpha
# Files should be saved as <sampleID>_out_FPR_0.01_filtered.h5
path_cellbender = '../../data/CellBenderV2'

# Path to scR-Invex output, collapsed to one row per barcode
# Tool at https://github.com/getzlab/scrinvex
# Files should be saved as <sampleID>.scrinvex.collapsed.tsv
path_scrinvex = '../../data/scrinvex'

# Path to aggregation output
# The per-sample <sampleID>.preqc.h5ad and <sampleID>.preqc.scrub.h5ad files are written here
path_aggregation = '../../data/aggregation'

### Combine CellRanger and CellBender and Calculate some basic QC metrics

In [5]:
# IDs of the samples to aggregate (used as the <sampleID> prefix of every input file)
sample_list = [
    '1364', '1452', '1579', '1650', '1690',
    '1693', '1703', '1716', '1733', '1739',
    '1763', '1773', '1785', '1800', '1801',
]

In [6]:
# Per-individual phenotypes, each keyed by sample ID.
# NOTE: the key order of these dicts is kept as-is on purpose; the order of the
# first dict passed to pd.concat determines the row labels of the table below.

# Sex: 'F' or 'M'
sex_map = {
    '1364': 'F',
    '1452': 'F',
    '1579': 'F',
    '1650': 'M',
    '1690': 'F',
    '1693': 'M',
    '1703': 'M',
    '1716': 'F',
    '1733': 'F',
    '1739': 'M',
    '1763': 'M',
    '1773': 'M',
    '1785': 'F',
    '1800': 'F',
    '1801': 'M',
}

# Disease status: 'ICM' or 'NF'
disease_map = {
    '1800': 'ICM',
    '1452': 'NF',
    '1579': 'ICM',
    '1690': 'NF',
    '1693': 'ICM',
    '1773': 'ICM',
    '1801': 'NF',
    '1785': 'NF',
    '1733': 'ICM',
    '1364': 'ICM',
    '1739': 'NF',
    '1716': 'NF',
    '1703': 'ICM',
    '1763': 'NF',
    '1650': 'NF',
}

# Age
age_map = {
    '1364': 55,
    '1452': 47,
    '1579': 47,
    '1650': 58,
    '1690': 63,
    '1693': 64,
    '1703': 55,
    '1716': 64,
    '1733': 64,
    '1739': 52,
    '1763': 58,
    '1773': 59,
    '1785': 53,
    '1800': 62,
    '1801': 42,
}

# One column per mapping, aligned on sample ID; the column names assigned below
# follow the order of this list
mydicts = [disease_map, sex_map, age_map]
phenos = pd.concat([pd.Series(mapping) for mapping in mydicts], axis=1)
# Any sample absent from one of the mappings gets 0 for that phenotype
phenos = phenos.fillna(0)
# Move the sample IDs out of the index and order the rows by sample ID
phenos = phenos.reset_index().sort_values('index')
phenos.columns = ['individual', 'disease', 'sex', 'age']
phenos

Unnamed: 0,individual,disease,sex,age
9,1364,ICM,F,55
1,1452,NF,F,47
2,1579,ICM,F,47
14,1650,NF,M,58
3,1690,NF,F,63
4,1693,ICM,M,64
12,1703,ICM,M,55
11,1716,NF,F,64
8,1733,ICM,F,64
10,1739,NF,M,52


In [7]:
def _nucleus_entropies(counts, n_genes):
    """Return the ndd entropy estimate for every row of a sparse count matrix.

    counts:  sparse count matrix with one row per nucleus; each row is
             densified on its own so only one dense row is held at a time
    n_genes: value passed to ndd.entropy as k
    """
    n_nuclei = counts.shape[0]
    entropies = np.zeros(n_nuclei)
    for i in range(n_nuclei):
        entropies[i] = ndd.entropy(np.array(counts[i, :].todense()).squeeze(), k=n_genes)
    return entropies


def import_sample(sample,
                  path_cellranger,
                  path_cellbender,
                  path_scrinvex,
                  phenos):
    """Build the pre-QC AnnData object for one sample.

    The returned object holds the CellBender counts in .X and the matching raw
    CellRanger counts in .layers['cellranger_raw'], restricted to the barcodes
    present in the CellBender file. The following are added to .obs:

    1. sample / individual, plus every other column of `phenos` for this sample
    2. cellranger_ncount / cellbender_ncount: total UMI per nucleus
    3. cellranger_ngenes / cellbender_ngenes: number of genes with a count > 0
    4. cellranger_percent_mito: mitochondrial ('MT-' prefixed genes) counts
       divided by total counts, CellRanger only. Despite the name this is a
       fraction (not multiplied by 100).
    5. exon_prop: exons / (introns + junctions + exons) from scR-Invex
    6. cellbender_entropy / cellranger_entropy: per-nucleus ndd entropy

    Parameters
    ----------
    sample : str
        Sample ID; used as the file-name prefix of all three inputs.
    path_cellranger : str
        Directory containing <sample>.raw_feature_bc_matrix.h5
    path_cellbender : str
        Directory containing <sample>_out_FPR_0.01_filtered.h5
    path_scrinvex : str
        Directory containing <sample>.scrinvex.collapsed.tsv
    phenos : pandas.DataFrame
        Phenotype table with an 'individual' column holding sample IDs.

    Returns
    -------
    AnnData with nuclei whose CellBender count total is 0 removed.

    Raises
    ------
    KeyError
        If a CellBender barcode has no row in the scR-Invex table.
    """
    # CellRanger Counts -- all barcodes (not just CellRanger derived cells)
    adata_cr = sc.read_10x_h5(os.path.join(path_cellranger, sample + '.raw_feature_bc_matrix.h5'))
    adata_cr.var_names_make_unique()

    # CellBender Cells and Counts -- we will use the non-empty droplets from CellBender as our cells
    adata_cb = sc.read_10x_h5(os.path.join(path_cellbender, sample + '_out_FPR_0.01_filtered.h5'))
    adata_cb.var_names_make_unique()

    # Add the cellranger counts to layers
    # Subset raw to cellbender cells and order appropriately.
    # NOTE(review): only the barcodes are aligned here; this assumes both files
    # list the genes in the same order -- confirm for new inputs.
    adata_cr = adata_cr[adata_cb.obs.index.tolist(), :]
    adata_cb.layers['cellranger_raw'] = adata_cr.X  # keep the cellranger counts in the object

    # Add in some sample based annotations: In this case sample & individual are the same
    # But this data will eventually be combined with the DCM/HCM data in which case this isn't true
    # so best to keep all the info for now
    adata_cb.obs['sample'] = sample
    adata_cb.obs['individual'] = sample
    sample_phenos = phenos[phenos['individual'] == sample]
    for phen in [y for y in phenos.columns if not y == 'individual']:
        adata_cb.obs[phen] = sample_phenos[phen].iloc[0]

    # QC metrics we need to calculate
    # nUMI and nGenes per nucleus
    cr_counts = adata_cb.layers['cellranger_raw']
    adata_cb.obs['cellranger_ncount'] = np.array(cr_counts.sum(1)).squeeze()
    adata_cb.obs['cellbender_ncount'] = np.array(adata_cb.X.sum(1)).squeeze()
    adata_cb.obs['cellranger_ngenes'] = np.array((cr_counts > 0).sum(1)).squeeze()
    adata_cb.obs['cellbender_ngenes'] = np.array((adata_cb.X > 0).sum(1)).squeeze()

    # Fraction of MT counts (CellRanger counts only)
    mito_genes = adata_cb.var_names.str.startswith('MT-')  # boolean mask over genes
    adata_cb.obs['cellranger_percent_mito'] = (
        np.sum(cr_counts[:, mito_genes], axis=1).A1 / np.sum(cr_counts, axis=1).A1
    )

    # Exon proportion
    scrinvex = pd.read_csv(os.path.join(path_scrinvex, sample + '.scrinvex.collapsed.tsv'), sep='\t')
    # .copy() so the new column is added to an independent frame rather than to a
    # filtered slice of the original (which triggers SettingWithCopyWarning)
    scrinvex = scrinvex[scrinvex['barcode'].isin(adata_cb.obs.index)].copy()
    scrinvex['exon_prop'] = scrinvex['exons'] / scrinvex[['introns', 'junctions', 'exons']].sum(1)
    scrinvex_lookup = dict(zip(scrinvex['barcode'], scrinvex['exon_prop']))
    missing = [bc for bc in adata_cb.obs.index if bc not in scrinvex_lookup]
    if missing:
        # Same exception type as the plain dict lookup would raise, but say which
        # sample is affected and how many barcodes are involved
        raise KeyError('sample ' + str(sample) + ': ' + str(len(missing))
                       + ' CellBender barcode(s) missing from the scR-Invex table, e.g. '
                       + str(missing[0]))
    adata_cb.obs['exon_prop'] = [scrinvex_lookup[x] for x in adata_cb.obs.index]

    # Remove cells with empty cellbender counts -- these will fail and need to be discarded
    # This happens when CellBender removes all the counts for a droplet,
    # and normally indicates that something is off with that barcode
    adata_cb = adata_cb[adata_cb.obs['cellbender_ncount'] > 0].copy()

    # Calculate and append entropy; the number of genes is used as k for both matrices
    n_genes = adata_cb.X.shape[1]
    adata_cb.obs['cellbender_entropy'] = _nucleus_entropies(adata_cb.X, n_genes)
    adata_cb.obs['cellranger_entropy'] = _nucleus_entropies(adata_cb.layers['cellranger_raw'], n_genes)
    return adata_cb

In [None]:
# Build the adata object for each individual sample and save this adata for subsequent aggregation
# Make sure the output directory exists before any sample is processed, so a
# missing folder cannot make the write fail after a sample has been built
os.makedirs(path_aggregation, exist_ok=True)
for s in sample_list:
    print(s)
    adata = import_sample(s,
                          path_cellranger,
                          path_cellbender,
                          path_scrinvex,
                          phenos)
    # Saved as <sampleID>.preqc.h5ad
    adata.write(os.path.join(path_aggregation, s + '.preqc.h5ad'))

### Append Scrublet Scores to Nuclei from Each Sample
Scrublet is run twice per sample: once on the CellBender counts and once on the raw CellRanger counts.

In [None]:
# Score every nucleus of each sample with Scrublet, once on the CellBender
# counts (adata.X) and once on the CellRanger counts (layer 'cellranger_raw').
# Scores and doublet calls are stored in .obs and the annotated object is
# saved as <sampleID>.preqc.scrub.h5ad
for s in sample_list:
    print(s)
    print('CellBender...')
    preqc_file = os.path.join(path_aggregation, f'{s}.preqc.h5ad')
    adata = sc.read(preqc_file)
    scrub = scr.Scrublet(adata.X)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()
    adata.obs['cellbender_doublet_scores'] = doublet_scores
    adata.obs['cellbender_predicted_doublets'] = predicted_doublets

    print('CellRanger...')
    cellranger_counts = adata.layers['cellranger_raw']
    scrub = scr.Scrublet(cellranger_counts)
    doublet_scores, predicted_doublets = scrub.scrub_doublets()
    adata.obs['cellranger_doublet_scores'] = doublet_scores
    adata.obs['cellranger_predicted_doublets'] = predicted_doublets

    scrub_file = os.path.join(path_aggregation, f'{s}.preqc.scrub.h5ad')
    adata.write(scrub_file)

In [10]:
# Remove the pre-scrublet data object as we don't need it anymore and it is just taking up space
# os.remove is used instead of shelling out to `rm`: it is portable and is not
# affected by spaces or shell metacharacters in the path
for s in sample_list:
    preqc_file = os.path.join(path_aggregation, s + '.preqc.h5ad')
    try:
        os.remove(preqc_file)
    except FileNotFoundError:
        # Already gone (e.g. this cell was re-run) -- keep going with the other samples
        print('Nothing to remove: ' + preqc_file)