# Lab 8: Quality Control Metrics

**Module:** 8 - Quality Control Metrics  
**Duration:** 60 minutes

## Objectives
- Calculate per-cell QC metrics
- Visualize QC distributions
- Identify outliers and problematic cells


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

# Logging level for scanpy.
# NOTE(review): per the scanpy docs, level 3 adds "hints" on top of
# errors, warnings and info messages — confirm for the installed version.
sc.settings.verbosity = 3
# Figure defaults applied to the plots below: 100 dpi, white figure background.
sc.set_figure_params(dpi=100, facecolor='white')


In [None]:
# Load scanpy's PBMC 3k example dataset and report its dimensions
adata = sc.datasets.pbmc3k()
print(f"Loaded {adata.n_obs} cells and {adata.n_vars} genes")

# Genes whose name starts with 'MT-' are treated as mitochondrial;
# the boolean mask is stored per gene in adata.var['mt']
is_mito = adata.var_names.str.startswith('MT-')
adata.var['mt'] = is_mito
print(f"Mitochondrial genes: {adata.var['mt'].sum()}")


## 1. Calculate QC Metrics


In [None]:
# Compute QC metrics and store them on `adata` itself (inplace=True).
# Passing qc_vars=['mt'] additionally reports the percentage of counts
# coming from the genes flagged in adata.var['mt'].
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, inplace=True)

# Summarize the three per-cell columns used in the rest of this lab
qc_cols = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
adata.obs.loc[:, qc_cols].describe()


## 2. Violin Plots


In [None]:
# One violin panel per QC metric: (column in adata.obs, panel title)
violin_panels = [
    ('n_genes_by_counts', 'Genes per Cell'),
    ('total_counts', 'UMIs per Cell'),
    ('pct_counts_mt', 'Mitochondrial %'),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# Draw each metric on its own axis; show=False defers rendering until
# the whole figure has been laid out.
for ax, (metric, title) in zip(axes, violin_panels):
    sc.pl.violin(adata, metric, ax=ax, show=False)
    ax.set_title(title)

plt.tight_layout()
plt.show()


## 3. Scatter Plots


In [None]:
# Both panels share total_counts on the x-axis: (y column, panel title)
scatter_panels = [
    ('n_genes_by_counts', 'UMIs vs Genes Detected'),
    ('pct_counts_mt', 'UMIs vs Mitochondrial %'),
]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# show=False defers rendering so both panels appear in one figure.
for ax, (y_metric, title) in zip(axes, scatter_panels):
    sc.pl.scatter(adata, x='total_counts', y=y_metric, ax=ax, show=False)
    ax.set_title(title)

plt.tight_layout()
plt.show()


## 4. Identifying Outliers with MAD (Median Absolute Deviation)


In [None]:
def mad_outlier(data, n_mads=5):
    """Compute outlier bounds using the Median Absolute Deviation (MAD).

    The bounds are ``median ± n_mads * 1.4826 * MAD``, where
    ``MAD = median(|x - median(x)|)``. The factor 1.4826 rescales the MAD
    so that it estimates the standard deviation for normally distributed
    data, which makes ``n_mads`` read like "number of standard deviations".

    NaN entries are ignored. Without this, a single NaN (e.g. a ratio
    metric that is undefined for an empty cell) would turn both bounds
    into NaN, and every later ``<`` / ``>`` comparison against them would
    be False, so no cell would be flagged.

    Parameters
    ----------
    data : array-like of numbers
        Values to derive the bounds from (list, ndarray, pandas Series, ...).
    n_mads : float, default 5
        Half-width of the accepted interval in scaled-MAD units.

    Returns
    -------
    (lower, upper) : tuple of float
        Values outside ``[lower, upper]`` are considered outliers. Both are
        NaN if ``data`` is empty or contains only NaN.

    Notes
    -----
    If more than half of the values are identical the MAD is 0 and the
    interval collapses to the median itself.
    """
    values = np.asarray(data, dtype=float)
    center = np.nanmedian(values)
    mad = np.nanmedian(np.abs(values - center))
    # Same arithmetic as before, computed once instead of twice.
    half_width = n_mads * mad * 1.4826
    return center - half_width, center + half_width

# MAD-based bounds for the number of detected genes (default n_mads=5)
genes_lower, genes_upper = mad_outlier(adata.obs['n_genes_by_counts'])
print(f"Gene bounds: {genes_lower:.0f} - {genes_upper:.0f}")

# A cell is flagged when its gene count falls outside the MAD bounds
# or when its pct_counts_mt value exceeds the fixed cutoff of 20.
adata.obs['is_outlier'] = (
    adata.obs['n_genes_by_counts'].lt(genes_lower)
    | adata.obs['n_genes_by_counts'].gt(genes_upper)
    | adata.obs['pct_counts_mt'].gt(20)
)
print(f"Outliers: {adata.obs['is_outlier'].sum()} / {adata.n_obs} cells")


## Exercise Questions

1. What does a high mitochondrial percentage indicate about a cell?
2. Why might you see cells with very high gene counts?
3. Should you always use the same QC thresholds for different tissues?
4. What is the advantage of MAD-based thresholds over fixed thresholds?


In [None]:
# Your answers here

