# QC filtering

This notebook filters scATAC-seq data stored in an adata object to achieve a good-quality peak matrix.
Various QC filters are implemented for this task. These include filters related to:

- doublet score
- number of features per barcode
- mean insert size
- promoter enrichment


## Settings

In [142]:
# Path related settings (keep these identical to the previous notebook)
output_dir = '/mnt/workspace/jdetlef/processed_data'  # output directory of the processing; must be the same in every notebook
test = 'Esophagus_146_0.01'  # sample name; must be the same for all the notebooks

## QC Settings

In [143]:
# Column in adata.obs that contains the biological condition to evaluate
condition_column = "Sample_new"

# ---------------------------- data inspection ---------------------------------
# True: plot the insert size distribution.
# Set this to False if no insert size distribution is available.
plot_insertsize_dist = True

filter_xy = True     # True or False; filter out chrX and chrY
binarize_mtx = True  # True or False; convert the matrix to binary

############################# filters ##########################################

# False: estimate thresholds individually per condition; True: estimate them globally
global_threshold = False

# Filters to use:

# True: remove doublets using scrublet; False: skip the doublet calculation
filter_doublets = True
threads = 2                       # passed as `threads` to the doublet estimation
doublet_threshold = 0.2           # passed as `threshold` to the doublet estimation
use_condition_column = False      # True: group the doublet estimation by `condition_column`
condition_doublet_removal = None  # grouping used for the doublet estimation otherwise

# True or False; filter out cells whose number of features is outside the range defined below
n_features_filter = True
# True or False; filter out cells whose mean insert size is outside the range defined below
mean_insertsize_filter = True
# True or False; filter out cells whose promoter enrichment is outside the range defined below
filter_pct_fp = True
# True or False; filter out cells whose number of features on chromosome M is outside the defined range
filter_chrM_fragments = False
# True or False; filter out cells whose number of mapped fragments is outside the defined range
filter_uniquely_mapped_fragments = False


# If this is True, the thresholds below are ignored
only_automatic_thresholds = False  # True or False; use automatic thresholds only

############################# set default values #######################################
#
# These values are applied to all samples; the thresholds can still be changed
# manually once they are plotted.
# Thresholds that are None are set automatically.

# default values n_features
min_features = 100
max_features = 5000

# default mean_insertsize
upper_threshold_mis = None
lower_threshold_mis = None

# default promoter enrichment
upper_threshold_pct_fp = None
lower_threshold_pct_fp = None

# default number of fragments
upper_thr_fragments = None
lower_thr_fragments = None

# default number of fragments in chrM
upper_thr_chrM_fragments = 1000
lower_thr_chrM_fragments = 0

# default number of uniquely mapped fragments
upper_thr_um = 20000
lower_thr_um = 0



## Sum up settings

In [144]:
# One entry per available filter:
# (filter enabled?, adata.obs column, lower threshold, upper threshold)
threshold_specs = [
    (n_features_filter, 'n_features_by_counts', min_features, max_features),
    (mean_insertsize_filter, 'mean_insertsize', lower_threshold_mis, upper_threshold_mis),
    (filter_pct_fp, 'pct_fragments_in_promoters', lower_threshold_pct_fp, upper_threshold_pct_fp),
    (filter_chrM_fragments, 'CM', lower_thr_chrM_fragments, upper_thr_chrM_fragments),
    (filter_uniquely_mapped_fragments, 'UM', lower_thr_um, upper_thr_um),
]

# Manual min/max thresholds of every enabled filter, keyed by the obs column
manual_thresholds = {
    column: {'min': lower, 'max': upper}
    for enabled, column, lower, upper in threshold_specs
    if enabled
}

# Names of the obs columns that have a threshold entry
obs_columns = list(manual_thresholds)

## Loading packages and setup

In [145]:
# sctoolbox modules
import sctoolbox.atac_tree as sub_tree
import sctoolbox.creators as cr
import sctoolbox.annotation as an
import sctoolbox.qc_filter as qc
import sctoolbox.plotting as pl
import sctoolbox.atac_utils as atac_utils
import sctoolbox.atac as atac
# import episcanpy
import scanpy as sc
import episcanpy as epi
import numpy as np

## Setup path handling object 

In [146]:
# Create the path handling object that provides the input/output locations
# used throughout this notebook (tree.assembled_anndata, tree.qc_plots,
# tree.qc_anndata, tree.qc_dir).
tree = sub_tree.ATAC_tree()
# set processing/output directory
tree.processing_dir = output_dir
# set sample/experiment name
# NOTE(review): the recorded output prints "all directories existing" twice,
# so these assignments presumably check/create the directory structure —
# keep the assignment order as is.
tree.run = test

all directories existing
all directories existing


## Load anndata 

In [147]:
# Read the assembled AnnData object from the location provided by the path
# handling object and display its summary.
# NOTE(review): the original comment said this is "probably updated in future" —
# confirm whether the way the input path is obtained is still expected to change.
assembling_output = tree.assembled_anndata
adata = epi.read_h5ad(assembling_output)
adata

AnnData object with n_obs × n_vars = 73652 × 174262
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample', 'Sample_new', 'n_features_by_counts', 'log1p_n_features_by_counts', 'total_counts', 'log1p_total_counts', 'insertsize_count', 'mean_insertsize', 'n_fragments_in_promoters', 'n_total_fragments', 'pct_fragments_in_promoters'
    var: 'peak_chr', 'peak_start', 'peak_end', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'color_set', 'infoprocess', 'insertsize_distribution', 'legend'

In [148]:
# Inspect the feature (peak) annotation table of the loaded object
adata.var

Unnamed: 0,peak_chr,peak_start,peak_end,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts
chr1:10005-10731,chr1,10005,10731,410,0.005811,0.005794,99.443328,428.0,6.061457
chr1:11169-11386,chr1,11169,11386,0,0.000000,0.000000,100.000000,0.0,0.000000
chr1:28730-29439,chr1,28730,29439,22,0.000299,0.000299,99.970130,22.0,3.135494
chr1:29492-29686,chr1,29492,29686,0,0.000000,0.000000,100.000000,0.0,0.000000
chr1:34644-35030,chr1,34644,35030,0,0.000000,0.000000,100.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...
chrY:57203298-57203493,chrY,57203298,57203493,0,0.000000,0.000000,100.000000,0.0,0.000000
chrY:57206407-57206560,chrY,57206407,57206560,0,0.000000,0.000000,100.000000,0.0,0.000000
chrY:57215319-57215425,chrY,57215319,57215425,0,0.000000,0.000000,100.000000,0.0,0.000000
chrY:57215519-57215848,chrY,57215519,57215848,0,0.000000,0.000000,100.000000,0.0,0.000000


In [149]:
# Show the processing information stored in the AnnData object
adata.uns['infoprocess']

{'Anndata_path': '/mnt/workspace/jdetlef/processed_data',
 'Input_for_assembling': '/mnt/workspace/jdetlef/data/anndata/esophagus_muscularis_146_0.01.h5ad',
 'Strategy': 'Read from h5ad',
 'Test_number': 'Esophagus_146_0.01'}

In [150]:
# Plot the insert size distribution only if requested in the QC settings
if plot_insertsize_dist:
    atac.plot_insertsize(adata)

# QC

## 1. filtering out chrX, chrY and chrM

In [151]:
# Features whose name starts with 'chrM' are always removed
is_chrM = adata.var_names.str.startswith('chrM')
non_m = list(adata.var_names[~is_chrM])
adata = adata[:, non_m]

In [152]:
# Optionally remove all features located on the sex chromosomes.
if filter_xy:
    # str.startswith accepts a tuple of prefixes. This replaces the former
    # `not a | b` expression, which only gave the intended result because the
    # bitwise `|` binds tighter than `not`.
    non_xy = [name for name in adata.var_names if not name.startswith(('chrX', 'chrY'))]
    adata = adata[:, non_xy]

In [153]:
# Show the object after the chromosome filtering (the feature count should have dropped)
display(adata)

View of AnnData object with n_obs × n_vars = 73652 × 169256
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample', 'Sample_new', 'n_features_by_counts', 'log1p_n_features_by_counts', 'total_counts', 'log1p_total_counts', 'insertsize_count', 'mean_insertsize', 'n_fragments_in_promoters', 'n_total_fragments', 'pct_fragments_in_promoters'
    var: 'peak_chr', 'peak_start', 'peak_end', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts'
    uns: 'color_set', 'infoprocess', 'insertsize_distribution', 'legend'

## 2. removing empty cells

In [154]:
# Keep only cells with at least one feature and features present in at least one cell
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
# log-convert number of features; 'nb_features' is present in adata.obs after
# the filtering above (see the recorded cell output). np.log10 is applied to
# the whole column at once instead of element by element.
adata.obs['log_nb_features'] = np.log10(adata.obs['nb_features'])
display(adata)



AnnData object with n_obs × n_vars = 71715 × 166856
    obs: 'TN', 'UM', 'PP', 'UQ', 'CM', 'file', 'sample', 'Sample_new', 'n_features_by_counts', 'log1p_n_features_by_counts', 'total_counts', 'log1p_total_counts', 'insertsize_count', 'mean_insertsize', 'n_fragments_in_promoters', 'n_total_fragments', 'pct_fragments_in_promoters', 'nb_features', 'log_nb_features'
    var: 'peak_chr', 'peak_start', 'peak_end', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'n_cells'
    uns: 'color_set', 'infoprocess', 'insertsize_distribution', 'legend'

## 3. binarize

In [155]:
# Store the current (not yet binarized) object in adata.raw before the matrix
# is modified; the order of these statements matters.
adata.raw = adata
if binarize_mtx:
    # Binarize the matrix and keep a copy of the result in the 'binary' layer
    epi.pp.binarize(adata)
    adata.layers['binary'] = adata.X.copy()
# NOTE(review): the same statements are executed again in section 5, and the
# doublet estimation in section 4 therefore runs on the binarized matrix —
# confirm that this order is intended.

## 4. doublet removal

In [156]:
# Start with an empty list of metrics to plot later on; entries are appended
# by the doublet removal step.
adata.uns["infoprocess"]["plot_metrics"] = []

In [157]:
 if filter_doublets:

     # Group the doublet estimation by the condition column if requested,
     # otherwise by the configured `condition_doublet_removal` (may be None).
     # A local name is used so the configuration variable is not overwritten,
     # which keeps this cell re-runnable after changing `use_condition_column`.
     doublet_groupby = condition_column if use_condition_column else condition_doublet_removal

     # NOTE(review): the recorded output of this cell is
     # "ValueError: Found array with 0 feature(s) ... required by PCA".
     # adata.X may already be binarized at this point (step 3) — verify that
     # the doublet estimation receives a matrix it can select features from.
     qc.estimate_doublets(adata, groupby=doublet_groupby, threads=threads, threshold=doublet_threshold)

     # Remove the predicted doublets from adata
     qc.filter_cells(adata, "predicted_doublet", remove_bool=True)

     # Save to adata to plot later on
     adata.uns["infoprocess"]["plot_metrics"].append("doublet_score")



ValueError: Found array with 0 feature(s) (shape=(62894, 0)) while a minimum of 1 is required by PCA.

## 5. visualize adata

In [None]:
# Inspect the feature annotation table again after the previous filtering steps
adata.var

In [None]:
# Remove features and cells that are empty after the previous filtering steps
epi.pp.filter_features(adata, min_cells=1)
epi.pp.filter_cells(adata, min_features=1)

# Plot the coverage of cells first and of features second,
# each once with log=False and once with log=10.
for plot_coverage in (epi.pp.coverage_cells, epi.pp.coverage_features):
    for log_setting in (False, 10):
        plot_coverage(adata, binary=True, log=log_setting, bins=50)

In [None]:
# adata.X may already be binarized at this point (step 3 runs the same
# statements). Overwriting adata.raw unconditionally would then replace the
# matrix stored before binarization with the binarized one, so adata.raw is
# only set if nothing has been stored yet.
if adata.raw is None:
    adata.raw = adata
if binarize_mtx:
    # Binarize the matrix and keep a copy of the result in the 'binary' layer
    epi.pp.binarize(adata)
    adata.layers['binary'] = adata.X.copy()

## 6. Cells Filtering

### 6.1 get thresholds dict

In [None]:
# Thresholds are determined per condition unless global thresholds are requested
if global_threshold is False:
    groupby = condition_column
else:
    groupby = None

# Combine the manual thresholds with the automatic ones and show them as a table
thresholds = atac_utils.get_thresholds_atac_wrapper(
    adata,
    manual_thresholds,
    only_automatic_thresholds,
    groupby=groupby,
)
qc.thresholds_as_table(thresholds)

### 6.2 plot thresholds

In [None]:
# Switch to the interactive matplotlib backend so the figure below can be adjusted
%matplotlib widget
# NOTE(review): %bgcolor is not a built-in IPython magic; presumably it is
# registered by one of the imported packages — confirm.
%bgcolor PowderBlue

#Plot violins and sliders
# One violin per column in obs_columns, grouped by the condition column; the
# call returns the figure and a dictionary that is used to read the final
# thresholds in the next cell. The figure is also saved below tree.qc_plots.
obs_figure, obs_slider_dict = qc.quality_violin(adata, columns=obs_columns,
                                            groupby=condition_column,
                                            which="obs",
                                            thresholds=thresholds,
                                            global_threshold=global_threshold,
                                            title="Cell quality control (before)",
                                            save=tree.qc_plots + "cell_filtering.png")
obs_figure

In [None]:
# Get final thresholds from the slider dictionary returned by qc.quality_violin
# NOTE(review): presumably these reflect manual slider adjustments, which would
# make the result depend on interactive state — confirm and record the values.
final_thresholds = qc.get_slider_thresholds(obs_slider_dict)
qc.thresholds_as_table(final_thresholds) # show thresholds

### 6.3 filter adata

In [None]:
# Apply the final thresholds to adata, using the same grouping as for the
# threshold estimation
qc.apply_qc_thresholds(adata, final_thresholds, groupby=groupby)
# remove empty features
epi.pp.filter_features(adata, min_cells=1)

In [None]:
# log-convert number of features; np.log10 is applied to the whole
# 'nb_features' column at once instead of element by element
adata.obs['log_nb_features'] = np.log10(adata.obs['nb_features'])
display(adata)

## 7. save plots

In [None]:
# Switch back to the static inline backend for the final figure
%matplotlib inline 

#Plot violins and sliders
# Same columns and grouping as before the filtering, but without passing
# thresholds; the figure is saved below tree.qc_plots.
figure, slider_dict = qc.quality_violin(adata, columns=obs_columns,
                                     groupby=condition_column,
                                     which="obs", ncols=3,
                                     global_threshold = global_threshold,
                                     title="Cell quality control (after)",
                                     save=tree.qc_plots + "cell_filtering_final.png")
figure 

## 8. save anndata

In [None]:
# Write the filtered AnnData object to the location provided by the path
# handling object
adata_output = tree.qc_anndata
adata.write(filename=adata_output)

In [None]:
# Show the QC directory; the copy of this notebook is written there in the next cell
tree.qc_dir

In [None]:
import os
import shutil

# Archive a copy of this notebook in the QC directory. The notebook is
# looked up by its file name in the current working directory.
notebook_name = '2_QC.ipynb'
repo_path = os.getcwd()
notebook_path, notebook_copy = (
    os.path.join(folder, notebook_name) for folder in (repo_path, tree.qc_dir)
)
shutil.copyfile(notebook_path, notebook_copy)