<h1 align="center">QC and filtering</h1> 
<hr style="border:2px solid black"> </hr>

# A: loading packages and setup

In [1]:
##################### IMPORTING PACKAGES AND SETUP ################################
# Index of this notebook within the analysis series; passed to checker.load_anndata below
notebook=2 #Here is the number of present notebook
import sctoolbox.checker as checker
import scanpy as sc
import os
from os import path
from scipy import sparse
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import sctoolbox
# NOTE(review): wildcard import. quality_violin and message_filt are called later in this
# notebook without being defined here, so they presumably come from this module — confirm,
# and consider importing them by name.
from sctoolbox.qc_filter import *

# B: loading the anndata

In [2]:
# Resolve the input file for this notebook step, read it, and show the AnnData summary
# (number of cells x number of genes plus the available annotations)
adata_path = checker.load_anndata("Output_path", notebook)
adata = sc.read_h5ad(filename=adata_path)
display(adata)

AnnData object with n_obs × n_vars = 20202 × 55359
    obs: 'condition'
    var: 'gene_ids', 'feature_types'
    uns: 'infoprocess'
    layers: 'ambiguous', 'spliced', 'unspliced'

# C: set up parameters. THIS WILL BE DELETED AFTER FINISHING THE NOTEBOOK

In [None]:
##################### DEFINING TEST NUMBER ##############################
# Label of this run; it is embedded in the output file name (anndata_2_<test>.h5ad)
test="Test1"

########################## DEFINING INPUTS ##############################
path_imput="/mnt/agnerds/loosolab_SC_RNA_framework/raw_data" #Directory where the preprocessed data is located (also reused as the results directory further down)

################# DEFINING THE STAGE OF ANALYSIS #######################
is_it_the_final_run="No" #Set to "Yes" if this is the final run before sending to collaborators

######## DEFINING THE EXPERIMENTAL CONDITION TO BE EVALUATED ###########
# NOTE(review): the AnnData summary displayed after loading lists only 'condition' in obs —
# confirm that a 'timepoint' column exists, otherwise the grouping/filtering cells will fail.
data_to_evaluate='timepoint' #Name of the adata.obs column used to group cells for the analysis

#################### CELL FILTERING ########################

#Filter cells by the minimum number of expressed genes required for a cell to pass filtering?
#Type "Yes" or "No", followed by the QUANTILE (0 to 1) of n_genes_by_counts used as cutoff. E.g. "Yes:0.2"
#The quantile is computed per condition and the smallest of these values is used as the cutoff.
filt_min_gen="Yes:0.2"

#Filter cells by mitochondrial content?
#Type "Yes" or "No", followed by the MAXIMUM mitochondrial fraction allowed (from 0 to 1). E.g. "Yes:0.03" keeps cells below 3%
filt_mito_content="Yes:0.035"

#Filter genes by the number of cells they are expressed in?
#Type "Yes" or "No", followed by the minimum FRACTION of cells (from 0 to 1). E.g. "Yes:0.001". 0.01 means 1% of the cells
filt_min_cells="Yes:0.01"

#Predict doublets (when different cells are mistakenly considered as a single cell)?
test_doublets="Yes"

################## GENE FILTERING ######################

remove_mito = False     #Remove mitochondrial genes from the anndata object (False: keep, True: remove)
remove_ribo = True     #Remove ribosomal genes from the anndata object (False: keep, True: remove)
remove_gender = False   #Remove gender-specific genes from the anndata object (False: keep, True: remove)
remove_custom = []      #A custom list of genes (names) to remove from the anndata object

################### DEFINING COLORS ####################
# Named colors passed to quality_violin as its color_list argument
color_list=['green', 'red', 'blue', 'pink', 'chartreuse', 'gray', 'yellow', 'brown', 'purple', 'orange', 'wheat', 'lightseagreen', 'cyan', 'khaki', 'cornflowerblue', 'olive', 'gainsboro', 'darkmagenta', 'slategray', 'ivory', 'darkorchid', 'papayawhip', 'paleturquoise', 'oldlace', 'orangered', 'lavenderblush', 'gold', 'seagreen', 'deepskyblue', 'lavender', 'peru', 'silver', 'midnightblue', 'antiquewhite', 'blanchedalmond', 'firebrick', 'greenyellow', 'thistle', 'powderblue', 'darkseagreen', 'darkolivegreen', 'moccasin', 'olivedrab', 'mediumseagreen', 'lightgray', 'darkgreen', 'tan', 'yellowgreen', 'peachpuff', 'cornsilk', 'darkblue', 'violet', 'cadetblue', 'palegoldenrod', 'darkturquoise', 'sienna', 'mediumorchid', 'springgreen', 'darkgoldenrod', 'magenta', 'steelblue', 'navy', 'lightgoldenrodyellow', 'saddlebrown', 'aliceblue', 'beige', 'hotpink', 'aquamarine', 'tomato', 'darksalmon', 'navajowhite', 'lawngreen', 'lightsteelblue', 'crimson', 'mediumturquoise', 'mistyrose', 'lightcoral', 'mediumaquamarine', 'mediumblue', 'darkred', 'lightskyblue', 'mediumspringgreen', 'darkviolet', 'royalblue', 'seashell', 'azure', 'lightgreen', 'fuchsia', 'floralwhite', 'mintcream', 'lightcyan', 'bisque', 'deeppink', 'limegreen', 'lightblue', 'darkkhaki', 'maroon', 'aqua', 'lightyellow', 'plum', 'indianred', 'linen', 'honeydew', 'burlywood', 'goldenrod', 'mediumslateblue', 'lime', 'lightslategray', 'forestgreen', 'dimgray', 'lemonchiffon', 'darkgray', 'dodgerblue', 'darkcyan', 'orchid', 'blueviolet', 'mediumpurple', 'darkslategray', 'turquoise', 'salmon', 'lightsalmon', 'coral', 'lightpink', 'slateblue', 'darkslateblue', 'white', 'sandybrown', 'chocolate', 'teal', 'mediumvioletred', 'skyblue', 'snow', 'palegreen', 'ghostwhite', 'indigo', 'rosybrown', 'palevioletred', 'darkorange', 'whitesmoke']

# D: Cell QC and filtering

In [None]:
# Initial filter: drop cells in which not a single gene was detected
min_genes_nonempty = 1
sc.pp.filter_cells(adata, min_genes=min_genes_nonempty)

In [None]:
# Flag mitochondrial genes.
# The prefix includes the trailing hyphen ("MT-", "mt-") so that nuclear genes that merely
# start with the same letters (e.g. MTOR, MTHFR, Mtor, metallothioneins MT1A/Mt1) are not
# counted as mitochondrial and do not inflate pct_counts_is_mito.
# NOTE(review): assumes gene symbols follow the "MT-"/"mt-" naming convention — confirm
# for the annotation used to build this AnnData.
is_mito = adata.var_names.str.startswith(('mt-', 'Mt-', 'MT-'))
adata.var['is_mito'] = is_mito

# Flag ribosomal genes (cytosolic RPS/RPL and mitochondrial MRPS/MRPL protein genes,
# in upper-case and capitalised spelling).
# NOTE(review): a plain prefix match also catches genes such as RPS6KA1 (a kinase);
# verify whether that is acceptable when remove_ribo is enabled.
is_ribo = adata.var_names.str.startswith(("RPS","RPL","MRPS","MRPL","Mrps","Mrpl","Rps","Rpl"))
adata.var['is_ribo'] = is_ribo

In [None]:
# Compute the per-cell (obs) and per-gene (var) QC tables without touching adata itself.
# Passing the is_mito / is_ribo flags as qc_vars yields the pct_counts_is_* columns used below.
qc_tables = sc.pp.calculate_qc_metrics(
    adata=adata,
    qc_vars=['is_mito', 'is_ribo'],
    inplace=False,
)
obs_qc, var_qc = qc_tables

# Only these per-cell metrics are carried over into adata.obs
obs_metrics = [
    'n_genes_by_counts',
    'total_counts',
    'log1p_total_counts',
    'pct_counts_is_mito',
    'pct_counts_is_ribo',
]
adata.obs[obs_metrics] = obs_qc[obs_metrics]

In [None]:
# Violin overview of the per-cell quality measures, grouped by the chosen condition,
# before any filter has been applied
filename = "cell_quality_before_filtering.png"
violin_columns = ['n_genes_by_counts', 'log1p_total_counts', 'pct_counts_is_mito', 'pct_counts_is_ribo']
violin_headers = ['Genes expressed (count)', 'Total counts (log)', 'Mitochondrial content (%)', 'Ribosomal content (%)']
quality_violin(
    adata,
    groupby=data_to_evaluate,
    columns=violin_columns,
    header=violin_headers,
    color_list=color_list,
    title="Quality before filtering",
    save=filename,
)

## Apply individual filters

In [None]:
# Helper used to establish the per-condition filtering cutoffs
def filtering_steps(QUANTILE, COND, IDX):
    """Return the QUANTILE of the adata.obs column COND within one condition group.

    Reads the notebook-level globals ``adata``, ``data_to_evaluate`` and ``conditions``.

    Parameters
    ----------
    QUANTILE : float
        Quantile passed to ``np.quantile``.
    COND : str
        Name of the ``adata.obs`` column whose quantile is computed.
    IDX :
        Group label; it is looked up in ``conditions`` (ValueError if absent) and
        cells whose ``data_to_evaluate`` value equals it are selected.
    """
    group_label = conditions[conditions.index(IDX)]
    in_group = adata.obs[data_to_evaluate] == group_label
    group_values = adata.obs[in_group][COND]
    return np.quantile(group_values, QUANTILE)

### Minimum number of genes

In [None]:
# Filter cells by the minimum number of expressed genes.
# The requested quantile of n_genes_by_counts is computed for every condition and the
# smallest of these values becomes the single cutoff applied to all cells.
conditions = adata.obs[data_to_evaluate].unique().tolist()
raw_total_genes_content = []

min_gen_setting = filt_min_gen.split(":")
if min_gen_setting[0] != "Yes":
    message_filt("It was not choose to filter by the minimum number of genes per cell.")
else:
    raw_total_genes_content.extend(
        filtering_steps(float(min_gen_setting[1]), "n_genes_by_counts", cond)
        for cond in conditions
    )
    min_genes_cutoff = min(raw_total_genes_content)
    sc.pp.filter_cells(adata, min_genes=min_genes_cutoff)  # cells below the cutoff are dropped in place
    print(str(adata.shape) + " Shape after filter by minimun number of expressed genes.")
    print("\tExcluded cells with less than " + str(int(min_genes_cutoff)) + " genes expressed.")

### Maximum mitochondrial content

In [None]:
# Filter cells by mitochondrial content.
# filt_mito_content is "<Yes|No>:<fraction>"; pct_counts_is_mito is a percentage, so the
# fraction is scaled by 100 before comparing.
mito_setting = filt_mito_content.split(":")
if mito_setting[0] == "Yes":
    max_mito_pct = float(mito_setting[1]) * 100
    keep_cells = adata.obs['pct_counts_is_mito'] < max_mito_pct
    # .copy() turns the subset into a real AnnData object instead of a view, so the
    # in-place scanpy calls that follow operate on actual data rather than triggering
    # an implicit copy of the view.
    adata = adata[keep_cells].copy()
    print(str(adata.shape) + " Shape after filter mitochondrial content")
    # ":g" avoids float noise such as 3.5000000000000004; the comparison above is strict,
    # so cells at or above the threshold are the ones removed.
    print(f"\tExcluded cells with {max_mito_pct:g}% or more mitochondrial counts.")
else:
    message_filt("It was not choose to filter by the mitochondrial content.")

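### Doublet filtering

*TODO: doublet prediction is not implemented in this notebook yet; the `test_doublets` parameter currently has no effect.*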

## Final violin plot

In [None]:
# Same violin overview as before filtering, now on the cells that passed all cell filters
filename = "cell_quality_after_filtering.png"

# obs column -> panel header
quality_panels = {
    'n_genes_by_counts': 'Genes expressed (count)',
    'log1p_total_counts': 'Total counts (log)',
    'pct_counts_is_mito': 'Mitochondrial content (%)',
    'pct_counts_is_ribo': 'Ribosomal content (%)',
}

quality_violin(
    adata,
    groupby=data_to_evaluate,
    columns=list(quality_panels.keys()),
    header=list(quality_panels.values()),
    color_list=color_list,
    title="Quality after filtering",
    save=filename,
)

---------------

# E: Gene QC and filtering

In [None]:
# Plot the 20 most highly expressed genes of the filtered dataset.
# show=False is passed through to scanpy; presumably the plot object is returned
# instead of being shown immediately — see the scanpy docs.
sc.pl.highest_expr_genes(adata, n_top=20, show=False)

In [None]:
# Recompute the QC tables on the cell-filtered object; only the per-gene table
# (second element of the returned pair) is needed here
var_qc = sc.pp.calculate_qc_metrics(adata=adata, inplace=False)[1]

# Attach the per-gene metrics used for gene filtering to adata.var
var_metrics = ['n_cells_by_counts', 'mean_counts', 'log1p_mean_counts']
for metric in var_metrics:
    adata.var[metric] = var_qc[metric]

### Filtering genes concerning expression in a certain number of cells

In [None]:
# Filter genes by the number of cells they are expressed in.
# filt_min_cells is "<Yes|No>:<fraction>", where the fraction (0 to 1) refers to the
# number of cells remaining after cell filtering.
min_cells_setting = filt_min_cells.split(":")
if min_cells_setting[0] == "Yes":
    min_cell_fraction = float(min_cells_setting[1])
    # Minimum number of cells a gene must be detected in to pass the filter
    X_cells = round(adata.shape[0] * min_cell_fraction)
    sc.pp.filter_genes(adata, min_cells = X_cells)
    print("\tExcluded genes expressed in less than " + str(X_cells) + " cells.")
    print(str(adata.shape) + " Final shape")
else:
    # Report the skipped step explicitly, as the cell-level filters do
    message_filt("It was not choose to filter genes by the minimum number of cells.")

### Remove ribosomal/mitochondrial/gender genes

In [None]:
# Remove whole classes of genes according to the remove_* switches from the parameter cell.
# Each step replaces adata with the object returned by sctoolbox.qc_filter.filter_genes.

# Remove ribosomal genes (flagged in adata.var['is_ribo'])
if remove_ribo:
    ribo_genes = adata.var_names[adata.var['is_ribo']]
    print("Ribosomal filtering:")
    adata = sctoolbox.qc_filter.filter_genes(adata, ribo_genes)

# Remove mitochondrial genes (flagged in adata.var['is_mito'])
if remove_mito:
    mito_genes = adata.var_names[adata.var['is_mito']]
    print("Mitochondrial filtering:")
    adata = sctoolbox.qc_filter.filter_genes(adata, mito_genes)

# Remove gender genes
if remove_gender:
    # Not implemented yet: say so instead of silently ignoring the user's choice.
    print("Gender filtering: not implemented yet - no genes were removed.")
    #TODO
    #get gender genes
    #adata = sctoolbox.qc_filter.filter_genes(adata, gender_genes)

# Remove custom genes from list
if len(remove_custom) > 0:
    print("Custom gene filtering:")
    adata = sctoolbox.qc_filter.filter_genes(adata, remove_custom)

-----------------------------

# F: Save the final adata

In [None]:
# Results (filtered AnnData and information.txt) are written into the input directory.
# NOTE(review): this mixes outputs with the raw data — confirm this is intended.
path_results=path_imput

In [None]:
#Saving the anndata
print("Saving the anndata.")
adata_output= path_results + "/anndata_2_" + test +".h5ad"
adata.write(filename=adata_output)

# G: Saving information

In [None]:
# Show the final AnnData summary and append the same summary to the run log
display(adata)
displayoutput=str(adata)
information="\n2_QC_filtering\n" + "\nOutput_2:anndata_2_" + test + ".h5ad" + "\nAnndata:\n" + displayoutput

# Use a context manager so the log file is flushed and closed deterministically;
# the previous inline open(...) left the handle open.
with open(path_results + "/information.txt", "a") as info_file:
    print(information, file=info_file)