In [None]:
from sctoolbox.utils.jupyter import bgcolor, _compare_version

# Color the background of the selected input cells.
bgcolor(
    "PowderBlue",
    select=[3, 11, 15, 27],
)

# File name of this notebook, handed to the version comparison below.
nb_name = "02_QC_filtering.ipynb"
_compare_version(nb_name)

# 02 - QC and filtering
<hr style="border:2px solid black"> </hr>

## 1 - Description

For subsequent analysis, it is crucial to ensure that all cellular barcodes correspond to viable cells. Quality control (QC) is mandatory for RNA-seq data and focuses on three key aspects:

 1. The number of counts per barcode (count depth)
 2. The number of genes per barcode
 3. The fraction of reads derived from mitochondrial vs. nuclear origin
 
Outliers based on these covariates are filtered out. Such outliers may occur if a cell is dying, leading to broken membranes that cause mRNA leakage, resulting in low count depth and detected genes, along with elevated mitochondrial counts.

It is important to note that these covariates can vary between cell types. For example, cells involved in respiratory processes may have a higher mitochondrial content compared to other cells.

Therefore, cutoffs should be chosen to remove only outliers. This can be achieved automatically using algorithms such as median absolute deviation (MAD) or a Gaussian mixture model (GMM).


**DOI:** https://doi.org/10.1038/s41576-023-00586-w

_______

## 2 - Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc

import sctoolbox
import sctoolbox.plotting as pl
import sctoolbox.tools.marker_genes as marker_genes
import sctoolbox.tools.qc_filter as qc
import sctoolbox.utils as utils

# Load the sctoolbox settings from config.yaml using the entry keyed "02"
# (presumably the section belonging to this notebook - confirm against config.yaml).
sctoolbox.settings.settings_from_config("config.yaml", key="02")

_________

## 3 - Load anndata
Uses the anndata object written by the previous notebook.

In [None]:
# Read the AnnData object stored in "anndata_1.h5ad".
adata = utils.adata.load_h5ad("anndata_1.h5ad")

# Preview the object together with its obs and var tables using temporarily
# adjusted pandas display options (the options are restored after the block).
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata, adata.obs, adata.var)

__________

## 4 - QC and filtering
<hr style="border:2px solid black"> </hr>

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Species the data originates from
species = "human"  # available species: human, mouse, rat, zebrafish

# Column in adata.obs that contains the biological condition to evaluate
condition_column = "sample"

# Column in adata.var that contains the gene names (None = use the adata.var index)
gene_column = None

# Absolute minimum number of genes used to pre-select cells before QC plotting
min_genes = 1

# Doublet handling with scrublet
# filter_doublets: True to remove doublets, False to skip the doublet calculation
# threads: number of threads passed to the doublet estimation
# doublet_threshold: manual threshold, or None for automatic threshold prediction
# use_native: whether to use the scrublet package (set True in case of error)
filter_doublets = True
threads = 4
doublet_threshold = None
use_native = False

# Sex prediction
# predict_sex: whether to predict the sex of samples using the expression of a female gene
# female_gene: name of the gene to use for the sex assignment
#
# Note: gene expression can vary significantly across cell types, disease and development stages.
# Therefore, there is no guarantee that the example genes below lead to an accurate sex determination.
#
# Examples by species:
# mouse: Xist https://www.science.org/doi/10.1126/science.adf1046
# human: XIST https://doi.org/10.1016/j.fsigen.2016.10.018
# rat: Xist https://www.science.org/doi/10.1126/science.adf1046
# zebrafish: rbpms2a, rbpms2b, foxl2 https://doi.org/10.1007/s00018-021-04066-4
predict_sex = True
female_gene = "XIST"

# Estimate thresholds globally (True) or individually per condition (False)
global_threshold = True

# Function used for automatic filtering. Either:
#   qc.gmm_threshold (Gaussian mixture model),
#   qc.mad_threshold (median absolute deviation)
#   or a custom function
filter_fun = qc.gmm_threshold

# Additional parameters of the filter function
fun_kwargs = dict(
    min_n=3,  # Lower threshold (standard deviation) multiplier
    max_n=3,  # Upper threshold (standard deviation) multiplier
)
# NOTE(review): none of the cells visible in this notebook reference filter_fun or
# fun_kwargs - confirm that they are still consumed somewhere (e.g. by the threshold estimation).

# Identify mitochondrial, ribosomal or gender genes
# 1. *_list:  "internal" to use the sctoolbox provided list, a list of gene names,
#             a txt-file containing one gene per line, or None
# 2. *_regex: case insensitive regex pattern used as a fallback to option 1
#
# The fallback patterns are fully anchored to the start of the gene name:
# - "^mt-" requires the dash, so nuclear genes such as MTOR or MT1A are not labeled mitochondrial.
# - "^rp[sl]" anchors both alternatives; in "^rps|rpl" the anchor only applied to "rps",
#   so "rpl" could match anywhere in a gene name (e.g. MRPL12).
mito_list = "internal"  # 1.
mito_regex = "^mt-"  # 2.
ribo_list = "internal"  # 1.
ribo_regex = "^rp[sl]"  # 2.
gen_list = "internal"  # 1.
gen_regex = None  # 2.

# Removal of gene subsets
#
# Decide whether to remove unwanted genes, such as mitochondrial, ribosomal, or gender-related genes.
# This can reduce complexity and eliminate potential biases in subsequent analyses.
# Note: Removal is only possible if marker lists for these regions are available for the organism.
# Marker lists are available by default for mouse, human, and rat.
filter_mito = True
filter_ribo = False
filter_gender = False

# Optional: plot the STARsolo quality if a path is given (leave empty to skip)
quant_folder = ""

# Correction of ambient RNA using scAR
# Caution: this process is expensive and thus will take time to run!
# Requires the raw (unfiltered) AnnData object containing all droplets.
path_raw_adata = ""  # The path to the raw h5ad file. Leave empty to skip.
epochs = 150  # Number of iterations for the model.

# Overwrite previously applied sctoolbox filtering steps,
# i.e. apply new filtering on top of previously applied filters.
# Caution: Not recommended as it invalidates filter tracking
overwrite = False

________

In [None]:
# Ensure that the condition column is of type category.
# The cast result is written back to the same adata.obs column.
adata.obs[condition_column] = adata.obs[condition_column].astype("category")

### 4.1 - Show STARsolo quality (optional)

If the data was mapped using STARsolo, use the parameter to set the path to the STARsolo runs and plot quality measures across runs. The path must be a folder, e.g. "path/to/starsolo_output", which contains folders per condition e.g. "cond1", "cond2", etc.

In [None]:
# Optional STARsolo QC: only runs when a quantification folder was configured.
if quant_folder != "":
    _ = pl.qc_filter.plot_starsolo_quality(
        quant_folder,
        save="starsolo_quality.pdf",
    )
    _ = pl.qc_filter.plot_starsolo_UMI(
        quant_folder,
        ncol=3,
        save="starsolo_cell_selection.pdf",
    )

_________

### 4.2 - Label genes
Mark genes by their general association, e.g. mitochondrial.

In [None]:
# Label genes using the configured lists / fallback patterns for mitochondrial (m_*),
# ribosomal (r_*) and gender (g_*) genes. The return value is kept as qc_vars and
# handed to the QC metric calculation later on.
qc_vars = marker_genes.label_genes(
    adata,
    gene_column=gene_column,
    species=species,
    m_genes=mito_list,
    m_regex=mito_regex,
    r_genes=ribo_list,
    r_regex=ribo_regex,
    g_genes=gen_list,
    g_regex=gen_regex,
)

In [None]:
# Show the cell annotation table (adata.obs) as the cell output.
adata.obs

_________

### 4.3 - Calculate QC metrics
Create quality control metrics to filter the data on.

In [None]:
# Calculate the QC metrics, passing the gene labels returned by label_genes as qc_vars.
# The returned object replaces adata.
adata = qc.calculate_qc_metrics(adata, qc_vars=qc_vars)

_________

### 4.4 - Calculate and remove doublets
Doublets are artifacts where two (doublet) or more (multiplet) cells receive the same barcode. As multiplets behave as a joined feature set of the collected cells they may show up as a separate group in downstream analysis, thus potentially skewing results. Therefore, it is recommended to remove doublets.

**DOI: [10.1016/j.cels.2018.11.005](https://doi.org/10.1016/j.cels.2018.11.005)**

In [None]:
# Pre-filter cells on the number of detected genes before calculating doublets.
n_cells_before = len(adata)

# Subsetting an AnnData object returns a view of the original object. The .copy()
# makes the subset an independent object, so the in-place steps that follow
# (doublet estimation, filtering) do not have to operate on a view.
adata = adata[adata.obs["n_genes"] >= min_genes].copy()

n_cells_after = len(adata)
print(f"Filtered out {n_cells_before-n_cells_after} cells which had less than {min_genes} gene(s) expressed.")

In [None]:
# Doublet estimation is skipped entirely when filter_doublets is disabled.
if filter_doublets:
    qc.estimate_doublets(
        adata,
        groupby=condition_column,
        threads=threads,
        threshold=doublet_threshold,
        use_native=use_native,
    )

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Remove the predicted doublets (True) or keep them (False).
# NOTE: this reassigns filter_doublets, which was already set in the configuration
# cell at the top of section 4; from here on only the value below applies.
filter_doublets = True

In [None]:
if filter_doublets:
    # filter_doublets is reassigned after the estimation step, so removal can be
    # requested even though the estimation was skipped. Only filter when the
    # column this step filters on is actually present in adata.obs.
    if "predicted_doublet" in adata.obs.columns:
        # Remove the predicted doublets from adata
        qc.filter_cells(adata, "predicted_doublet", name="doublet", overwrite=overwrite)
    else:
        print('Skipping doublet removal: no "predicted_doublet" column found in adata.obs '
              '(doublets were not estimated).')

_________

### 4.5 - Predict sex per sample

In [None]:
# Sex prediction per condition, based on the configured female gene.
if predict_sex:
    qc.predict_sex(
        adata,
        groupby=condition_column,
        gene_column=gene_column,
        gene=female_gene,
        save="female_prediction.pdf",
    )

_________

### 4.6 - Cell filtering
<hr style="border:1px solid black"> </hr>

Low and high count depths indicate cells with low integrity and doublets, respectively (DOI: 10.15252/msb.20188746).

In [None]:
# Available obs columns: preview adata.obs with temporarily adjusted pandas
# display options (row limit of 5, no column limit).
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata.obs)

Any numeric column shown above can be used as filter metric. Here is a description of the commonly available metrics:

| Metric | Description |
|--------|-------------|
|n_genes|The number of genes associated with a barcode (cell).|
|total_counts|The total amount of reads detected for this barcode (cell).|
|log1p_total_counts|Same as above but on a logarithmic scale.|
|total_counts_is_mito|The total amount of reads associated with mitochondrial genes.|
|log1p_total_counts_is_mito|Same as above but on a logarithmic scale.|
|pct_counts_is_mito|Percentage of mitochondrial reads per barcode (cell).|
|total_counts_is_ribo|The total amount of reads associated with ribosomal genes.|
|log1p_total_counts_is_ribo|Same as above but on a logarithmic scale.|
|pct_counts_is_ribo|Percentage of ribosomal reads per barcode (cell).|
|total_counts_is_gender|The total amount of reads associated with gender genes.|
|log1p_total_counts_is_gender|Same as above but on a logarithmic scale.|
|pct_counts_is_gender|Percentage of gender related reads per barcode (cell).|

In [None]:
# global_threshold (global vs. per-condition threshold estimation) is set once in the
# configuration cell at the top of section 4. It is intentionally not reassigned here,
# so that the value chosen there is not silently overridden.

# Before filtering, the impact of the individual filters is plotted as an UpSet plot.
# To restrict the complexity of the plot, the plotted combinations can be limited below.
limit_combinations = 2  # Either provide the combination grade as Integer or None to include all

# Set initial filter thresholds
# The thresholds can be interactively changed later on
# Note: Only metrics provided below are available for filtering
default_obs_thresholds = {
    'n_genes': {'min': None, 'max': None},
    'log1p_total_counts': {'min': None, 'max': None},
    'pct_counts_is_mito': {'min': None, 'max': None},
    'pct_counts_is_ribo': {'min': None, 'max': None},
    'pct_counts_is_gender': {'min': None, 'max': None}
    # add additional thresholds based on the available columns shown above
    # format: '<obs column>': {'min': <threshold|None>, 'max': <threshold|None>}
    # None = automatically derive initial threshold
    # float('inf') or float('-inf') = no filter
}

#### 4.6.1 - Estimate initial thresholds automatically

In [None]:
# Thresholds are grouped by condition only when global_threshold is explicitly False.
if global_threshold is False:
    groupby = condition_column
else:
    groupby = None

# Derive the initial thresholds from the defaults configured above.
initial_obs_thresholds = qc.get_thresholds(
    adata,
    default_obs_thresholds,
    only_automatic=False,
    groupby=groupby,
)

# The metrics that received a threshold are the columns used for plotting/filtering below.
obs_columns = list(initial_obs_thresholds.keys())
qc.thresholds_as_table(initial_obs_thresholds)

The plot below estimates the impact each metric (and each combination of metrics) would have on the data. A metric that filters the same number of cells regardless of whether it is applied alone or in combination with other metrics can be disregarded, as it has little effect on the overall outcome of the filtering.

In [None]:
# UpSet plot of the filter impact using the initial thresholds.
_ = pl.qc_filter.upset_plot_filter_impacts(
    adata,
    thresholds=initial_obs_thresholds,
    groupby=groupby,
    limit_combinations=limit_combinations,
)

#### 4.6.2 - Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
# Switch to the interactive widget backend before plotting so that the sliders are usable.
%matplotlib widget

# Plot violins and sliders
# The returned slider dict is read in a later cell to obtain the final thresholds.
obs_figure, obs_slider_dict = pl.qc_filter.quality_violin(
    adata,
    obs_columns,
    groupby=condition_column,
    which="obs",
    thresholds=initial_obs_thresholds,
    global_threshold=global_threshold,
    title="Cell quality control (before)",
    save="cell_filtering.png"
)
# Last expression of the cell: displays the figure.
obs_figure

In [None]:
# Close the current (interactive) figure before reading the slider values.
plt.close()

# Get final thresholds
# Read from the sliders created together with the violin plot above.
final_obs_thresholds = pl.qc_filter.get_slider_thresholds(obs_slider_dict)
qc.thresholds_as_table(final_obs_thresholds) # show thresholds

In [None]:
%matplotlib inline

_ = pl.qc_filter.upset_plot_filter_impacts(adata, 
                                           thresholds=final_obs_thresholds, 
                                           groupby=groupby,
                                           limit_combinations=limit_combinations)

In [None]:
# Pairwise comparison of the filter metrics including their thresholds
# (mean values in case thresholds are grouped).
# Only drawn when more than one metric has a threshold.
if len(final_obs_thresholds) > 1:
    mean_thresholds = qc.get_mean_thresholds(final_obs_thresholds)
    _ = pl.general.pairwise_scatter(
        adata.obs,
        obs_columns,
        thresholds=mean_thresholds,
        save="cell_filtering_scatter.pdf",
    )

#### 4.6.3 - Apply final thresholds
Filter the anndata object based on the thresholds in the threshold table.

In [None]:
# Apply the final cell (obs) thresholds to adata; the return value is not used,
# so the call is relied on to modify adata in place.
qc.apply_qc_thresholds(adata, which="obs", thresholds=final_obs_thresholds, overwrite=overwrite)

#### 4.6.4 - Show data after filtering

In [None]:
%matplotlib inline 

# Plot violins and sliders
figure, slider_dict = pl.qc_filter.quality_violin(
    adata,
    obs_columns,
    groupby=condition_column,
    which="obs",
    ncols=3,
    global_threshold = global_threshold,
    title="Cell quality control (after)",
    save="cell_filtering_final.png"
)
figure

_________

### 4.7 - Gene filtering
<hr style="border:1px solid black"> </hr>
The following section is about filtering genes. The user can specify thresholds to filter by the number of cells.

In [None]:
# Recalculate quality measures for genes
adata = qc.calculate_qc_metrics(adata)

In [None]:
# Remove genes with 0 count, i.e. genes that are not detected in any cell.
zero_bool = adata.var["n_cells_by_counts"] == 0

# Subsetting an AnnData object returns a view; .copy() makes the gene-filtered
# object independent, so the in-place filtering steps below do not act on a view.
adata = adata[:, ~zero_bool].copy()

In [None]:
# Available var columns: preview adata.var with temporarily adjusted pandas
# display options (row limit of 5, no column limit).
with pd.option_context("display.max.rows", 5, "display.max.columns", None):
    display(adata.var)

Any numeric column shown above can be used as filter metric. Here is a description of the commonly available metrics:

| Metric | Description |
|--------|-------------|
|n_cells_by_counts|The number of cells that contain reads associated with the gene.|
|mean_counts|The mean amount of reads over all cells.|
|log1p_mean_counts|Same as above but on a logarithmic scale.|
|pct_dropout_by_counts|Percentage of cells this gene does not appear in.|
|total_counts|The total amount of reads associated to this gene.|
|log1p_total_counts|Same as above but on a logarithmic scale.|

<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
# Initial thresholds for the gene (adata.var) filters.
# The thresholds can be interactively changed later on.
# Note: Only metrics provided below are available for filtering.
#
# Add additional thresholds based on the available columns shown above:
#   format: "<var column>": {"min": <threshold|None>, "max": <threshold|None>}
#   None = automatically derive initial threshold
#   float("inf") or float("-inf") = no filter
default_var_thresholds = {
    "n_cells_by_counts": {
        "min": float("-inf"),
        "max": float("inf"),
    },
    "log1p_mean_counts": {
        "min": float("-inf"),
        "max": float("inf"),
    },
}

In [None]:
# Derive the initial gene thresholds from the defaults configured above.
initial_var_thresholds = qc.get_thresholds(
    adata,
    which="var",
    manual_thresholds=default_var_thresholds,
)

# The metrics that received a threshold are the columns used for plotting/filtering below.
var_columns = list(initial_var_thresholds.keys())
qc.thresholds_as_table(initial_var_thresholds)

____________

#### 4.7.1 - Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
# Switch to the interactive widget backend before plotting so that the sliders are usable.
%matplotlib widget

# Plot violins and sliders
# NOTE(review): unlike the cell-level plot, initial_var_thresholds is not passed via
# thresholds= here - confirm whether the sliders are meant to start from those values.
var_figure, var_slider_dict = pl.qc_filter.quality_violin(
    adata,
    var_columns,
    which="var",
    title="Gene quality control (before)",
    save="gene_filtering.png"
)
# Last expression of the cell: displays the figure.
var_figure

#### 4.7.2 - Apply gene filtering

In [None]:
# Get final thresholds
# Read from the sliders created together with the gene violin plot above.
final_var_thresholds = pl.qc_filter.get_slider_thresholds(var_slider_dict)
qc.thresholds_as_table(final_var_thresholds) # show thresholds

In [None]:
# Apply the final gene (var) thresholds to adata; the return value is not used,
# so the call is relied on to modify adata in place.
qc.apply_qc_thresholds(adata, which="var", thresholds=final_var_thresholds, overwrite=overwrite)

#### 4.7.3 - Show data after filtering

In [None]:
%matplotlib inline 

# Plot violins and sliders
figure, slider_dict = pl.qc_filter.quality_violin(
    adata,
    var_columns,
    which="var",
    ncols=3,
    title="Gene quality control (after)",
    save="gene_filtering_final.png"
)
figure

#### 4.7.4 - Filter additional marked genes
Remove genes that are labeled as e.g. mitochondrial genes.

In [None]:
# Remove the gene subsets that were enabled in the configuration cell.
# Each entry: (enabled flag, label column to filter on, filter name, wording for the message)
gene_subset_filters = [
    (filter_mito, "is_mito", "mito", "mitochondrial"),
    (filter_ribo, "is_ribo", "ribo", "ribosomal"),
    (filter_gender, "is_gender", "gender", "gender"),
]

for subset_enabled, label_column, filter_name, subset_label in gene_subset_filters:
    if subset_enabled:
        print(f"Removing {subset_label} genes:")
        qc.filter_genes(adata, label_column, name=filter_name, overwrite=overwrite)

_________

### 4.8 - Denoising
Remove ambient RNA and technical noise from the count matrix using [scAR](https://www.biorxiv.org/content/10.1101/2022.01.14.476312v4). The tool estimates the ambient profile by averaging cell-free droplets. An autoencoder neural network later corrects the count matrix.
<hr style="border:1px solid black"> </hr>

In [None]:
# Ambient RNA correction is only run when a path to the raw (unfiltered) AnnData
# file was configured; an empty path skips this step.
# scanpy (sc) is imported together with the other dependencies in the setup cell.
if path_raw_adata:
    print("Loading raw anndata...")
    adata_raw = sc.read_h5ad(path_raw_adata)

    print("Denoising data, this will take a while...")
    # The denoised object returned by denoise_data replaces adata.
    adata = qc.denoise_data(adata, adata_raw, feature_type='Gene Expression', epochs=epochs,
                            verbose=False, save='droplets_kneeplot.pdf', overwrite=overwrite)

_________

## 5 - Save filtered adata
<hr style="border:2px solid black"> </hr>
Store the final results

In [None]:
# Show a summary of the final AnnData object as the cell output.
adata

In [None]:
# Saving the data
# The filtered object is written to "anndata_2.h5ad".
adata_output = "anndata_2.h5ad"
utils.adata.save_h5ad(adata, adata_output)

In [None]:
# Close the sctoolbox logfile at the end of the notebook.
sctoolbox.settings.close_logfile()