In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import sctoolbox
from sctoolbox.utils.jupyter import bgcolor

# QC and filtering
<hr style="border:2px solid black"> </hr>

**Quality control**

"We must ensure that all cellular barcode data correspond to viable cells.

Cell QC is commonly performed based on three QC covariates: the number of counts per barcode (count depth), the number of genes per barcode, and the fraction of counts from mitochondrial genes per barcode.

The distributions of these QC covariates are examined for outlier peaks that are filtered out by thresholding.

These outlier barcodes can correspond to dying cells, cells whose membranes are broken, or doublets".

Samples with a low count depth, few detected genes, and a high fraction of mitochondrial counts are indicative of cells whose cytoplasmic mRNA has leaked out through a broken membrane.

In contrast, cells with unexpectedly high counts and a large number of detected genes may represent doublets.

**DOI: 10.15252/msb.20188746**

<hr style="border:2px solid black"> </hr>
<h1><center>⬐ Fill in input data here ⬎</center></h1>

In [None]:
%bgcolor PowderBlue

# Species of the data (passed to the gene labelling step)
species = "zebrafish"

# Column in adata.obs containing the biological condition to evaluate
condition_column = "timepoint"

# Column in adata.var containing gene names (None: use the adata.var index)
gene_column = None

# Absolute minimum number of genes per cell; cells below this value are removed
# before doublet estimation and QC plotting
min_genes = 1

# Doublet handling: True estimates doublets and removes the predicted doublets,
# False skips the doublet step entirely
filter_doublets = True
threads = 4
doublet_threshold = 0.2

# Whether to try to predict the sex of samples from the expression of a female gene
# NOTE(review): "Xist" is a mammalian gene name while species is "zebrafish" -
# confirm that the gene exists in this annotation, or set predict_sex = False
predict_sex = True
female_gene = "Xist"  # name of the gene used for the assignment

# Threshold estimation: True estimates one global set of thresholds,
# False estimates thresholds individually per condition
global_threshold = True

# Removal of gene subsets after gene filtering (genes labelled is_mito / is_ribo / is_gender)
filter_mito = True
filter_ribo = True
filter_gender = False

<hr style="border:2px solid black"> </hr>

## Setup

In [None]:
import sctoolbox.utils as utils
import sctoolbox.tools.qc_filter as qc
import sctoolbox.tools.marker_genes as marker_genes
import sctoolbox.plotting as pl
from sctoolbox._settings import settings

import matplotlib.pyplot as plt

# Load the sctoolbox settings for this notebook from "config.yaml" (entry "02")
settings.settings_from_config("config.yaml", key="02")

-------

## Load anndata
Uses anndata object written by the previous notebook.

In [None]:
# Read the AnnData object from "anndata_1.h5ad" and show its summary
adata = utils.adata.load_h5ad("anndata_1.h5ad")
display(adata)

In [None]:
# Ensure that the condition column is categorical, since it is used as the grouping variable below
adata.obs[condition_column] = adata.obs[condition_column].astype("category")

---------

## Show STARsolo quality (optional)

If the data was mapped using STARsolo, set `quant_folder` in the cell below to the path of the STARsolo runs to plot quality measures across runs (leave it empty to skip this step). The path must be a folder, e.g. "path/to/starsolo_output", which contains one folder per condition, e.g. "cond1", "cond2", etc.

In [None]:
# Folder containing the STARsolo runs; an empty string skips the STARsolo quality plots
quant_folder = ""

In [None]:
# Only plot the STARsolo quality overview when a folder was given
if quant_folder != "":
    # Results are assigned to "_" so the return values are not echoed as cell output
    _ = pl.plot_starsolo_quality(quant_folder, save="starsolo_quality.pdf")
    _ = pl.plot_starsolo_UMI(quant_folder, ncol=3, save="starsolo_cell_selection.pdf")

---------

# Remove 20hpf sample

In [None]:
# Sample 20hpf is low quality and is excluded from all further steps.
# Subsetting an AnnData object returns a view; .copy() turns it into an
# independent object, so the in-place steps that follow (gene labelling,
# QC metrics, filtering) do not trigger an implicit copy of a view.
adata = adata[adata.obs['timepoint'] != '20hpf'].copy()
adata

## Label genes
Mark genes by their general association, e.g. mitochondrial or ribosomal genes.

In [None]:
# Label gene groups for the given species; the returned labels are used below
# as qc_vars for the QC metrics and to build the "pct_counts_<label>" column names
qc_vars = marker_genes.label_genes(adata, gene_column=gene_column, species=species)
qc_vars

In [None]:
# Show the per-cell annotation table
adata.obs

------

## Calculate QC metrics
Create quality control metrics to filter the data on.

In [None]:
# Calculate the QC metrics, including the metrics for the labelled gene groups in qc_vars
adata = qc.calculate_qc_metrics(adata, qc_vars=qc_vars)

----------

## Calculate doublet scores

In [None]:
# Remove cells below the minimum number of genes before doublets are calculated
n_cells_before = adata.n_obs
# .copy() materialises the subset: the doublet functions below are called on
# adata without re-assignment, and modifying a view would force an implicit copy
adata = adata[adata.obs["n_genes"] >= min_genes].copy()
n_cells_after = adata.n_obs
print(f"Filtered out {n_cells_before-n_cells_after} cells which had less than {min_genes} gene(s) expressed.")

In [None]:
if filter_doublets:

    # Estimate doublets per condition using the thread count and threshold from the input cell
    qc.estimate_doublets(adata, groupby=condition_column, threads=threads, threshold=doublet_threshold)

    # Remove the predicted doublets from adata
    qc.filter_cells(adata, "predicted_doublet", remove_bool=True)

------------

## Predict sex per sample

In [None]:
# Optional step: predict the sex per condition from the configured female gene
if predict_sex:
    qc.predict_sex(
        adata,
        groupby=condition_column,
        gene_column=gene_column,
        gene=female_gene,
        save="female_prediction.pdf",
    )

------------------

## Cell filtering

Low and high count depths indicate cells with low integrity and doublets, respectively (DOI: 10.15252/msb.20188746).

In [None]:
# Columns of adata.obs used for cell filtering: two general metrics plus one
# "pct_counts_<label>" column per labelled gene group (gender genes excluded)
obs_columns = ["n_genes", "log1p_total_counts"]
obs_columns.extend("pct_counts_" + label for label in qc_vars if label != "is_gender")

# Record the chosen metrics in adata.uns under "obs_metrics"
utils.adata.add_uns_info(adata, "obs_metrics", obs_columns, how="append")
obs_columns

### Estimate initial thresholds automatically

In [None]:
# Thresholds are estimated per condition only when global_threshold is explicitly False
if global_threshold is False:
    groupby = condition_column
else:
    groupby = None

initial_thresholds = qc.automatic_thresholds(
    adata,
    which="obs",
    groupby=groupby,
    columns=obs_columns,
)
qc.thresholds_as_table(initial_thresholds)  # display the estimated thresholds

### Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
#Plot violins and sliders
obs_figure, obs_slider_dict = pl.qc_filter.quality_violin(adata, obs_columns,
                                                        groupby=condition_column,
                                                        which="obs",
                                                        thresholds=initial_thresholds,
                                                        global_threshold=global_threshold,
                                                        title="Cell quality control (before)",
                                                        save="cell_filtering.png")
obs_figure

In [None]:
%matplotlib widget

#Plot violins and sliders
obs_figure, obs_slider_dict = pl.qc_filter.quality_violin(adata, obs_columns,
                                                        groupby=condition_column,
                                                        which="obs",
                                                        thresholds=initial_thresholds,
                                                        global_threshold=global_threshold,
                                                        title="Cell quality control (before)",
                                                        save="cell_filtering.png")
obs_figure

In [None]:
# Read the final thresholds from the current slider positions
final_thresholds = pl.qc_filter.get_slider_thresholds(obs_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds

In [None]:
# Manual override of the slider thresholds (fallback if the sliders are not working).
# Each entry maps an obs column to its (min, max) threshold.
manual_obs_thresholds = {
    'n_genes': (457.4, 6000.0),
    'log1p_total_counts': (8.016606, 11.0),
    'pct_counts_is_ribo': (5.0, 40.0),
    'pct_counts_is_mito': (0.0, 40.4),
}

for column, (lower, upper) in manual_obs_thresholds.items():
    final_thresholds[column]['min'] = lower
    final_thresholds[column]['max'] = upper

qc.thresholds_as_table(final_thresholds)  # display the adjusted thresholds

In [None]:
# Show pairwise comparisons of column values w/ thresholds (mean values in case thresholds are grouped)
%matplotlib inline
plt.close()  # close previous figure
if len(final_thresholds) > 1:
    mean_thresholds = qc.get_mean_thresholds(final_thresholds)
    _ = pl.general.pairwise_scatter(adata.obs, obs_columns, thresholds=mean_thresholds, save="cell_filtering_scatter.pdf")

### Apply final thresholds
Filter the anndata object based on the thresholds in the threshold table.

In [None]:
# Filter the cells with the final thresholds, using the same grouping as for the threshold estimation
qc.apply_qc_thresholds(adata, which="obs", thresholds=final_thresholds, groupby=groupby)

### Show data after filtering

In [None]:
%matplotlib inline 

#Plot violins and sliders
figure, slider_dict = pl.qc_filter.quality_violin(adata, obs_columns,
                                                 groupby=condition_column,
                                                 which="obs", ncols=3,
                                                 global_threshold = global_threshold,
                                                 title="Cell quality control (after)",
                                                 save="cell_filtering_final.png")
figure

-------------------

## Gene filtering

In [None]:
# Recalculate the quality measures for genes now that cells have been filtered
# (called without qc_vars here)
adata = qc.calculate_qc_metrics(adata)

In [None]:
# Remove genes that are not detected in any remaining cell
zero_bool = adata.var["n_cells_by_counts"] == 0
# .copy() turns the column subset into an independent AnnData object; the gene
# filtering steps below modify adata in place, which should not happen on a view
adata = adata[:, ~zero_bool].copy()

In [None]:
%bgcolor PowderBlue

# Columns of adata.var used for the gene quality control plots and thresholds
var_columns = ["n_cells_by_counts", "log1p_mean_counts"]

### Customize thresholds via sliders

 (Rerun cell if plot is not shown)

In [None]:
%matplotlib inline

#Plot violins and sliders
var_figure, var_slider_dict = pl.qc_filter.quality_violin(adata, var_columns,
                                                        which="var",
                                                        title="Gene quality control (before)",
                                                        save="gene_filtering.png")
var_figure

In [None]:
%matplotlib widget

#Plot violins and sliders
var_figure, var_slider_dict = pl.qc_filter.quality_violin(adata, var_columns,
                                                        which="var",
                                                        title="Gene quality control (before)",
                                                        save="gene_filtering.png")
var_figure

### Apply gene filtering

In [None]:
# Read the final gene thresholds from the current slider positions
# (re-uses the name final_thresholds, replacing the cell thresholds from above)
final_thresholds = pl.qc_filter.get_slider_thresholds(var_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds

In [None]:
# Manual override of the slider value for the minimum number of cells per gene
final_thresholds['n_cells_by_counts']['min'] = 100.  # filter out genes which are expressed in fewer than 100 cells

qc.thresholds_as_table(final_thresholds) # show thresholds

In [None]:
# Filter the genes with the final thresholds
qc.apply_qc_thresholds(adata, which="var", thresholds=final_thresholds)

### Show data after filtering

In [None]:
%matplotlib inline 

#Plot violins and sliders
figure, slider_dict = pl.qc_filter.quality_violin(adata, var_columns,
                                                which="var", ncols=3,
                                                title="Gene quality control (after)",
                                                save="gene_filtering_final.png")
figure

### Filter additional marked genes
Remove genes that are labeled as e.g. mitochondrial genes.

In [None]:
# Remove whole gene subsets as selected by the filter_* switches of the input cell.
# Each entry: (switch, name used in the printed message, gene label passed to qc.filter_genes)
gene_subset_filters = [
    (filter_mito, "mitochondrial", "is_mito"),
    (filter_ribo, "ribosomal", "is_ribo"),
    (filter_gender, "gender", "is_gender"),
]

for enabled, subset_name, gene_label in gene_subset_filters:
    # Identity check kept on purpose: only a literal True enables the removal
    if enabled is True:
        print(f"Removing {subset_name} genes:")
        qc.filter_genes(adata, gene_label)

-------------

## Save filtered adata
Store the final results

In [None]:
# Show the summary of the filtered AnnData object
adata

In [None]:
# Save the filtered AnnData object to "anndata_2.h5ad"
adata_output = "anndata_2.h5ad"
utils.adata.save_h5ad(adata, adata_output)

In [None]:
# Close the sctoolbox logfile
# NOTE(review): further cells (sorting by timepoint, final violin plot) still run
# after this call - confirm that they do not need to be logged
sctoolbox.settings.close_logfile()

### Sort by timepoint

In [None]:
# Order the cells, and the categories of 'timepoint', by the unit letters
# (descending) and then by the number (ascending) contained in each label.
# NOTE(review): this runs after anndata_2.h5ad was written above, so the new
# order is not part of the saved file - confirm that this is intended.
sort_keys = adata.obs[['timepoint']].copy()

# Raw strings avoid the invalid "\d" escape sequence of the former pattern;
# expand=False makes str.extract return a Series instead of a one-column DataFrame
sort_keys['number'] = sort_keys['timepoint'].str.extract(r'(\d+)', expand=False).astype(int)
sort_keys['unit'] = sort_keys['timepoint'].str.extract(r'([A-Za-z]+)', expand=False)
cell_ids = sort_keys.sort_values(by=['unit', 'number'], ascending=[False, True]).index

# .copy() turns the reordered view into an independent AnnData object before its
# obs table is modified
adata_sorted = adata[cell_ids].copy()

# The categories follow the order of first appearance in the sorted cells
timepoint_order = adata_sorted.obs['timepoint'].unique().to_list()
adata_sorted.obs['timepoint'] = adata_sorted.obs['timepoint'].cat.reorder_categories(timepoint_order)
adata = adata_sorted

In [None]:
%matplotlib inline 

#Plot violins and sliders
figure, slider_dict = pl.qc_filter.quality_violin(adata, obs_columns,
                                                 groupby=condition_column,
                                                 which="obs", ncols=3,
                                                 global_threshold = global_threshold,
                                                 title="Cell quality control (after)",
                                                 save="cell_filtering_final.png")
figure