In [None]:
from sctoolbox.utilities import bgcolor

# QC and filtering
<hr style="border:2px solid black"> </hr>

**Quality control**

"We must ensure that all cellular barcode data correspond to viable cells.

Cell QC is commonly performed based on three QC covariates: the number of counts per barcode (count depth), the number of genes per barcode, and the fraction of counts from mitochondrial genes per barcode.

The distributions of these QC covariates are examined for outlier peaks that are filtered out by thresholding.

These outlier barcodes can correspond to dying cells, cells whose membranes are broken, or doublets".

Barcodes with a low count depth, few detected genes, and a high fraction of mitochondrial counts are indicative of cells whose cytoplasmic mRNA has leaked out through a broken membrane.

In contrast, cells with unexpectedly high counts and a large number of detected genes may represent doublets.

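**Source: Luecken & Theis (2019), "Current best practices in single-cell RNA-seq analysis: a tutorial", Molecular Systems Biology. DOI: [10.15252/msb.20188746](https://doi.org/10.15252/msb.20188746)**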

<hr style="border:2px solid black"> </hr>
<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# User configuration for this notebook. Every name set here is read by later cells.

# Set the species of the data (forwarded as `species` to marker_genes.label_genes)
species = "human"

# Set the column in adata.obs containing the biological condition to evaluate
condition_column = "sample"

# Set the column in adata.var containing gene names (or set to None to use adata.var index)
gene_column = None

# Absolute minimum number of genes for pre-selection of cells before QC plotting
min_genes = 1

# Decide whether to remove doublets using scrublet (True) or to skip doublet calculation (False)
filter_doublets = True
# Forwarded as `threads` to qc.estimate_doublets
threads = 4
# Forwarded as `threshold` to qc.estimate_doublets
doublet_threshold = 0.2

# Whether to try to predict sex of samples using the expression of a female gene
predict_sex = True
# Name of the gene to use for the assignment (forwarded as `gene` to qc.predict_sex).
# NOTE(review): "Xist" is the mouse-style symbol, while `species` above is "human"
# (human symbol: "XIST"). Confirm whether qc.predict_sex matches names case-sensitively.
female_gene = "Xist"

# Decide whether to estimate thresholds individually per condition (False) or globally (True)
global_threshold = False

# Removal of gene subsets (each flag enables one filter step near the end of the notebook)
filter_mito = True
filter_ribo = False
filter_gender = False

<hr style="border:2px solid black"> </hr>

## Setup

In [None]:
#####################IMPORTING PACKAGES#############################
import sctoolbox.utilities as utils
import sctoolbox.marker_genes as marker_genes
import sctoolbox.analyser as analyser
import sctoolbox.qc_filter as qc
import sctoolbox.plotting as pl

## Load anndata
Uses anndata object written by the previous notebook.

In [None]:
# Load the anndata object stored by notebook 1 and show its summary
adata = utils.load_anndata(
    is_from_previous_note=True,
    which_notebook=1,
    data_to_evaluate=condition_column,
)
display(adata)

In [None]:
# Ensure that the condition column is categorical; it is used as the `groupby`
# key in the doublet, sex-prediction and threshold cells below.
adata.obs[condition_column] = adata.obs[condition_column].astype("category")

In [None]:
# Prefix for all figures saved by this notebook: the "Anndata_path" entry of
# infoprocess followed by a slash (file names are appended by plain concatenation).
figure_path = adata.uns["infoprocess"]["Anndata_path"] + "/"
# Start with an empty list; later cells append/extend it with metric names.
adata.uns["infoprocess"]["plot_metrics"] = []  # metrics for later plotting

## Label genes
Label genes by their general category, e.g. mitochondrial.

In [None]:
# Label genes for the configured species, using `gene_column` for the gene names
marker_genes.label_genes(adata, gene_column=gene_column, species=species)

# Show the "genes_labeled" entry of infoprocess (presumably written by label_genes);
# the same entry is passed as `qc_vars` in the QC-metrics cell below.
display(adata.uns["infoprocess"]["genes_labeled"])

## Calculate QC metrics
Create quality control metrics to filter the data on.

In [None]:
# Calculate QC metrics, passing the labeled gene groups as `qc_vars`.
# TODO: an earlier note refers to a `control_var` switch for including the gene
# labels in the QC computation; it cannot be disabled at the moment.
adata = analyser.calculate_qc_metrics(adata, qc_vars=adata.uns["infoprocess"]["genes_labeled"])

## Set filter for number of genes before calculating doublets

In [None]:
# Pre-filter: keep only cells with at least `min_genes` detected genes and
# report how many cells were dropped.
n_cells_before = len(adata)
# Subsetting by index returns a view of the unfiltered object. The following cells
# modify `adata` in place, so materialize the subset with .copy() instead of
# relying on an implicit copy-on-write of the view.
adata = adata[adata.obs["n_genes"] >= min_genes].copy()
n_cells_after = len(adata)
print(f"Filtered out {n_cells_before-n_cells_after} cells which had less than {min_genes} gene(s) expressed.")

## Calculate doublet scores

In [None]:
# Only runs when doublet filtering is enabled in the configuration cell
if filter_doublets:

    # Estimate doublets per condition, using the configured threads and threshold
    qc.estimate_doublets(adata, groupby=condition_column, threads=threads, threshold=doublet_threshold)

    # Remove the predicted doublets from adata (cells flagged in "predicted_doublet")
    qc.filter_cells(adata, "predicted_doublet", remove_bool=True)

    # Register the doublet score as a metric to plot later on
    adata.uns["infoprocess"]["plot_metrics"].append("doublet_score")

## Predict sex per sample

In [None]:
# Only runs when sex prediction is enabled in the configuration cell
if predict_sex:
    # Predict sex per condition group based on the configured female gene
    qc.predict_sex(adata, groupby=condition_column, gene_column=gene_column, gene=female_gene, 
                   # TODO: saving the plot is not implemented yet; intended argument:
                   # save=figure_path + "female_prediction.pdf"
                   )
    # Register the prediction as a metric to plot later on
    adata.uns["infoprocess"]["plot_metrics"].append("predicted_sex")

------------------

## Cell filtering

Low and high count depths indicate cells with low integrity and doublets, respectively (DOI: 10.15252/msb.20188746).

In [None]:
# Columns of adata.obs used for cell filtering: two fixed metrics plus one
# "pct_counts_<label>" column per labeled gene group, skipping "is_gender".
obs_columns = ["n_genes", "log1p_total_counts"]
obs_columns.extend(
    "pct_counts_" + label
    for label in adata.uns["infoprocess"]["genes_labeled"]
    if label != "is_gender"
)

# Record the chosen columns in infoprocess and register them for later plotting
adata.uns["infoprocess"]["qc_columns_obs"] = obs_columns
adata.uns["infoprocess"]["plot_metrics"].extend(obs_columns)

### Estimate initial thresholds automatically

In [None]:
# Group by condition unless a single global threshold was requested.
# `groupby` is reused when the final thresholds are applied.
if global_threshold is False:
    groupby = condition_column
else:
    groupby = None

initial_thresholds = qc.automatic_thresholds(
    adata,
    which="obs",
    groupby=groupby,
    columns=obs_columns,
)

# Display the estimated thresholds as a table
qc.thresholds_as_table(initial_thresholds)

### Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
%matplotlib widget
%bgcolor PowderBlue

# Plot violins of the cell QC columns with sliders, starting from the automatically
# estimated thresholds. The returned slider dict is read in the next cell.
obs_figure, obs_slider_dict = qc.quality_violin(adata, obs_columns,
                                            groupby=condition_column,
                                            which="obs",
                                            thresholds=initial_thresholds,
                                            global_threshold=global_threshold,
                                            title="Cell quality control (before)",
                                            save=figure_path + "cell_filtering.png")
obs_figure

In [None]:
# Get the final cell thresholds from the slider dict returned by the violin plot above
final_thresholds = qc.get_slider_thresholds(obs_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds

### Apply final thresholds
Filter the anndata object based on the thresholds in the threshold table.

In [None]:
# Apply the final cell thresholds; `groupby` is the value chosen in the
# initial-threshold cell (condition column, or None for global thresholds).
qc.apply_qc_thresholds(adata, which="obs", thresholds=final_thresholds, groupby=groupby)

### Show data after filtering

In [None]:
%matplotlib inline 

# Plot violins of the same cell QC columns after filtering (no thresholds passed);
# the figure is saved under a separate file name.
figure, slider_dict = qc.quality_violin(adata, obs_columns,
                                     groupby=condition_column,
                                     which="obs", ncols=3,
                                     global_threshold = global_threshold,
                                     title="Cell quality control (after)",
                                     save=figure_path + "cell_filtering_final.png")
figure

In [None]:
# Save the final cell thresholds to infoprocess.
# NOTE: `final_thresholds` is rebound to the gene thresholds further down, so this
# cell must run before the gene-filtering section to store the cell thresholds.
adata.uns["infoprocess"]["cell_qc_thresholds"] = final_thresholds

-------------------

## Gene filtering

In [None]:
# Recalculate quality measures for genes on the cell-filtered data
# (called without `qc_vars` here, unlike the first QC-metrics call).
adata = analyser.calculate_qc_metrics(adata)

In [None]:
# Remove genes that are detected in no cell ("n_cells_by_counts" equal to 0).
zero_bool = adata.var["n_cells_by_counts"] == 0
# Subsetting by index returns a view of the unfiltered object. Later cells filter
# `adata` in place, so materialize the subset with .copy().
adata = adata[:, ~zero_bool].copy()

In [None]:
%bgcolor PowderBlue

# Choose the adata.var columns used for gene quality control
var_columns = ["n_cells_by_counts", "log1p_mean_counts"]

### Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
%matplotlib widget
%bgcolor PowderBlue

# Plot violins of the gene QC columns with sliders (no initial thresholds are passed).
# The returned slider dict is read in the next code cell.
var_figure, var_slider_dict = qc.quality_violin(adata, var_columns,
                                            which="var",
                                            title="Gene quality control (before)",
                                            save=figure_path + "gene_filtering.png")
var_figure

### Apply gene filtering

In [None]:
# Get the final gene thresholds from the gene slider dict.
# NOTE: this rebinds `final_thresholds`, which held the cell thresholds until now.
# Re-running the cell that stores "cell_qc_thresholds" after this point would
# store the gene thresholds instead.
final_thresholds = qc.get_slider_thresholds(var_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds

In [None]:
# Apply the final gene thresholds to adata.var (no grouping is passed for genes)
qc.apply_qc_thresholds(adata, which="var", thresholds=final_thresholds)

In [None]:
# Show the anndata summary after applying the gene thresholds
adata

### Show data after filtering

In [None]:
%matplotlib inline 

# Plot violins of the same gene QC columns after filtering;
# the figure is saved under a separate file name.
figure, slider_dict = qc.quality_violin(adata, var_columns,
                                        which="var", ncols=3,
                                        title="Gene quality control (after)",
                                        save=figure_path + "gene_filtering_final.png")
figure

### Filter additional marked genes
Remove genes that are labeled as e.g. mitochondrial genes.

In [None]:
# Optionally remove whole gene groups, controlled by the boolean flags from the
# configuration cell. Each branch passes one label column to qc.filter_genes.

# Remove mitochondrial genes
if filter_mito is True:
    print("Removing mitochrondrial genes:")
    qc.filter_genes(adata, "is_mito")

# Remove ribosomal genes
if filter_ribo is True:
    print("Removing ribosomal genes:")
    # Fixed: this previously called the undefined name `qc_filter_genes`,
    # which raised a NameError as soon as filter_ribo was enabled.
    qc.filter_genes(adata, "is_ribo")

# Remove gender genes
if filter_gender is True:
    print("Removing gender genes:")
    qc.filter_genes(adata, "is_gender")

-------------

## Save filtered adata
Store the final results

In [None]:
# Show the summary of the final, filtered anndata object before saving
adata

In [None]:
# Save the filtered anndata as the output of notebook 2
utils.saving_anndata(adata, current_notebook=2)