# scAutoQC

In [None]:
import scanpy as sc
import sctk

# Set scanpy's global figure parameters (dpi=80) for the plots drawn in this notebook.
sc.settings.set_figure_params(dpi=80)

In [None]:
# this cell is tagged 'parameters' to use papermill
# NOTE(review): the default input path names sample CE_SC_5FU_Conti_5 while the
# default output path names CE_SC_5FU_Conti_2 -- presumably both are overridden
# per sample when the notebook is run through papermill; confirm.
input_file = "/data/cephfs-1/home/users/cemo10_c/work/scRNA/scRNA_preprocessing_pipeline/results/per_sample/CE_SC_5FU_Conti_5/adata.h5ad"  # .h5ad file to read
count_layer = "soupX_counts"  # key in adata.layers copied into adata.X
output_file = "results/per_sample/CE_SC_5FU_Conti_2/adata_ready_for_merge_soupX_counts_scAutoQC.h5ad"  # .h5ad file to write
qc_method = "theislab_tutorial_then_scAutoQC"  # this value enables the outlier pre-filter cell

In [None]:
# Read the AnnData object from the path given in the parameters cell.
adata = sc.read_h5ad(input_file)
# Use a copy of the selected count layer as the active matrix, so later
# changes to adata.X do not touch the stored layer.
adata.X = adata.layers[count_layer].copy()
# De-duplicate variable names in place.
adata.var_names_make_unique()
# Show the object summary as the cell output.
adata

In [None]:
# When the combined method is selected, first drop every cell flagged in
# either of the `outlier` / `mt_outlier` columns of adata.obs.
# NOTE(review): these columns are presumably written by an upstream QC step;
# they are not created in this notebook.
if qc_method == "theislab_tutorial_then_scAutoQC":
    print(f"Total number of cells: {adata.n_obs}")
    # Keep a cell only if it is flagged in neither column.
    adata = adata[~(adata.obs["outlier"] | adata.obs["mt_outlier"])].copy()
    print(f"Number of cells after filtering of low quality cells: {adata.n_obs}")

In [None]:
# Run sctk's QC metric calculation on the object.
# NOTE(review): presumably this adds the obs columns used further down
# (n_counts, n_genes, percent_mito, ...) -- confirm against the sctk docs.
sctk.calculate_qc(adata)
# Show the updated object summary as the cell output.
adata

In [None]:
# Display sctk's default metric parameter table for reference.
sctk.default_metric_params_df

In [None]:
import pandas as pd

# Start from sctk's default parameters, restricted to the metrics used here.
# An explicit copy makes `metrics` an independent table, so the row added
# below is never written through a selection of the package-level defaults
# (and pandas has no reason to emit a chained-assignment warning).
metrics = sctk.default_metric_params_df.loc[
    ["n_counts", "n_genes", "percent_mito", "percent_ribo", "percent_hb"], :
].copy()

# Add a new row (not a column) for the doublet score.
# NOTE(review): the values must follow the column order of the default table,
# presumably (min, max, scale, side, min_pass_rate) -- confirm against sctk.
metrics.loc["scDblFinder_score"] = [pd.NA, 1, "linear", "max_only", 0]
# Show the resulting parameter table as the cell output.
metrics

In [None]:
# Run sctk's per-cell QC with the customised metric table built above.
# NOTE(review): presumably this writes adata.obs['cell_passed_qc'] and
# adata.uns['scautoqc_ranges'], which the following cells read -- confirm.
sctk.cellwise_qc(adata, metrics=metrics)
# Show the updated object summary as the cell output.
adata

In [None]:
# Sum of the 'cell_passed_qc' column (the number of passing cells if the column is boolean).
adata.obs['cell_passed_qc'].sum()

In [None]:
# Display the 'scautoqc_ranges' entry stored in adata.uns.
adata.uns['scautoqc_ranges']

In [None]:
# Metrics used for the QC clustering; each one is expected to be present as a
# column in adata.obs.
metrics_list = [
    "log1p_n_counts",
    "log1p_n_genes",
    "percent_mito",
    "percent_hb",
    "percent_ribo",
    "scDblFinder_score",
    "percent_top50",
]
# Build the QC clusters from those metrics.
sctk.generate_qc_clusters(adata, metrics=metrics_list)
# Plot the "X_umap_qc" embedding once per metric.
sc.pl.embedding(
    adata,
    "X_umap_qc",
    color=metrics_list,
    color_map="OrRd",
)

In [None]:
# Run sctk's cluster-level QC.
# NOTE(review): presumably this adds adata.obs['cluster_passed_qc'], which is
# plotted on the next line -- confirm against the sctk docs.
sctk.clusterwise_qc(adata)
# Compare the per-cell and per-cluster QC verdicts on the QC embedding.
sc.pl.embedding(adata, "X_umap_qc", color=["cell_passed_qc", "cluster_passed_qc"])

In [None]:
# Run sctk's multi-resolution cluster QC on the same metrics.
# NOTE(review): presumably this adds the 'consensus_fraction' and
# 'consensus_passed_qc' obs columns plotted below -- confirm.
sctk.multi_resolution_cluster_qc(adata, metrics=metrics_list)
# Show all QC verdicts side by side on the QC embedding.
sc.pl.embedding(
    adata,
    "X_umap_qc",
    color=[
        "cell_passed_qc",
        "cluster_passed_qc",
        "consensus_fraction",
        "consensus_passed_qc",
    ],
)

In [None]:
# Diagnostic plots of the object before the QC filter in the next cell is applied.
import seaborn as sns
# Histogram of adata.obs["n_counts"] with 100 bins.
p1 = sns.displot(adata.obs["n_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
# Violin plot of the "percent_mito" values.
p2 = sc.pl.violin(adata, "percent_mito")
# Scatter of "n_counts" against "n_genes", coloured by "percent_mito".
p3 = sc.pl.scatter(adata, "n_counts", "n_genes", color="percent_mito")

In [None]:
# filter out cells that did not pass qc
# Take an explicit copy: plain indexing returns a view, and the normalization
# cell below assigns a new layer to this object. Copying here keeps the object
# self-contained and matches the `.copy()` used in the earlier outlier filter.
# NOTE(review): the filter uses 'cell_passed_qc', not the 'consensus_passed_qc'
# column plotted above -- confirm this is the intended criterion.
adata = adata[adata.obs['cell_passed_qc'], :].copy()

In [None]:
# Same diagnostic plots as before the filter, now on the cells that passed QC.
# seaborn is re-imported so this cell can also be run on its own.
import seaborn as sns
# Histogram of adata.obs["n_counts"] with 100 bins.
p1 = sns.displot(adata.obs["n_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
# Violin plot of the "percent_mito" values.
p2 = sc.pl.violin(adata, "percent_mito")
# Scatter of "n_counts" against "n_genes", coloured by "percent_mito".
p3 = sc.pl.scatter(adata, "n_counts", "n_genes", color="percent_mito")

# Normalization (per sample, required later)

In [None]:
# name for new layer
layer_name = f"log1p_norm_of_{count_layer}"
# Normalize without touching adata itself: with inplace=False the result is
# returned, and its "X" entry is used below.
scales_counts = sc.pp.normalize_total(
    adata,
    target_sum=None,
    inplace=False,
)
# log1p transform of the normalized matrix, stored as a separate layer
adata.layers[layer_name] = sc.pp.log1p(scales_counts["X"], copy=True)

In [None]:
# Store the scAutoQC ranges as a plain dict before writing the file.
# NOTE(review): presumably the original table cannot be serialized to .h5ad
# as-is -- confirm.
# The guard makes the cell safe to re-run: after the first run the entry is
# already a dict, and calling .to_dict() on it again would raise AttributeError.
if not isinstance(adata.uns['scautoqc_ranges'], dict):
    adata.uns['scautoqc_ranges'] = adata.uns['scautoqc_ranges'].to_dict()
# Write the filtered and normalized object to the configured output path.
adata.write(output_file)

In [None]:
import matplotlib.pyplot as plt

# Two panels side by side: the counts column next to per-cell sums of the
# log1p-normalized layer. `sns` comes from the earlier plotting cells.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# NOTE(review): this reads obs["total_counts"], while the other cells in this
# notebook use "n_counts" -- presumably the column comes from an upstream step;
# confirm it exists and reflects the selected count layer.
p1 = sns.histplot(adata.obs["total_counts"], bins=100, kde=False, ax=axes[0])
p2 = sns.histplot(adata.layers[layer_name].sum(1), bins=100, kde=False, ax=axes[1])

axes[0].set_title("Total counts")
axes[1].set_title("Shifted logarithm")
plt.show()

In [None]:
# are the counts still integers? (They should be)
# Print the top-left 5x5 corner of the active matrix.
# NOTE(review): .todense() assumes adata.X is a sparse matrix -- confirm.
print(adata.X[0:5, 0:5].todense())
# Same corner of the 'counts' layer.
# NOTE(review): assumes a 'counts' layer exists in the input file -- confirm.
print(adata.layers['counts'][0:5, 0:5].todense())
# Same corner of adata.raw, shown as the cell output.
# NOTE(review): assumes adata.raw is set; indexing fails if it is None.
adata.raw[0:5, 0:5]