# Lab 9: Filtering & Doublet Detection

**Module:** 9 - Filtering Strategies  
**Duration:** 60-75 minutes

## Objectives
- Apply QC filters to remove low-quality cells
- Run doublet detection with Scrublet
- Compare before/after filtering statistics


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

# Scrublet is an optional dependency. Record whether the import succeeded so
# the doublet-detection cell can be skipped cleanly when it is missing.
try:
    import scrublet as scr
except ImportError:
    SCRUBLET_AVAILABLE = False
    print("Scrublet not installed. Run: pip install scrublet")
else:
    SCRUBLET_AVAILABLE = True

# Notebook-wide scanpy configuration: logging verbosity and figure defaults.
sc.settings.verbosity = 3
sc.set_figure_params(dpi=100, facecolor='white')


## 1. Load and Prepare Data


In [None]:
# Fetch the PBMC 3k dataset through scanpy's built-in dataset helper
adata = sc.datasets.pbmc3k()

# Flag genes whose names start with 'MT-' and compute QC metrics, including
# the per-cell metrics for that flagged gene set (used by the mito filter later)
is_mito_gene = adata.var_names.str.startswith('MT-')
adata.var['mt'] = is_mito_gene
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Remember the unfiltered dimensions for the before/after summary table
n_cells_original, n_genes_original = adata.n_obs, adata.n_vars
print(f"Original: {n_cells_original} cells, {n_genes_original} genes")


## 2. Define Filtering Thresholds


In [None]:
# QC thresholds used by the filtering cell below
MIN_GENES = 200          # minimum detected genes for a cell to be kept
MAX_GENES = 2500         # cells are kept only if their gene count is below this
MAX_MITO_PCT = 20        # cells are kept only if their mito % is below this
MIN_CELLS_PER_GENE = 3   # minimum number of cells a gene must be detected in

# Echo the chosen values so they are recorded in the notebook output
print("Filtering thresholds:")
for label, value in (
    ("Min genes per cell", MIN_GENES),
    ("Max genes per cell", MAX_GENES),
    ("Max mito %", MAX_MITO_PCT),
    ("Min cells per gene", MIN_CELLS_PER_GENE),
):
    print(f"  {label}: {value}")


## 3. Apply Filters


In [None]:
# Filter cells
print("Filtering cells...")
# Drop cells with fewer than MIN_GENES detected genes (modifies adata in place)
sc.pp.filter_cells(adata, min_genes=MIN_GENES)
print(f"  After min_genes: {adata.n_obs} cells")

# Boolean-mask indexing returns a lightweight *view* of the parent AnnData;
# that is fine for chaining read-only subsetting steps.
adata = adata[adata.obs['n_genes_by_counts'] < MAX_GENES, :]
print(f"  After max_genes: {adata.n_obs} cells")

# Materialise the final cell subset with .copy(): the gene filter below
# modifies the object in place, and doing that on a view makes AnnData
# copy implicitly and emit an ImplicitModificationWarning.
adata = adata[adata.obs['pct_counts_mt'] < MAX_MITO_PCT, :].copy()
print(f"  After mito filter: {adata.n_obs} cells")

# Filter genes
print("\nFiltering genes...")
# Drop genes detected in fewer than MIN_CELLS_PER_GENE of the remaining cells
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)
print(f"  After min_cells: {adata.n_vars} genes")


## 4. Doublet Detection with Scrublet


In [None]:
if SCRUBLET_AVAILABLE:
    # Run Scrublet on the expression matrix of the QC-filtered cells
    # (no normalisation has been applied to adata.X in the cells above).
    scrub = scr.Scrublet(adata.X, expected_doublet_rate=0.06)
    doublet_scores, predicted_doublets = scrub.scrub_doublets(
        min_counts=2, min_cells=3, min_gene_variability_pctl=85
    )

    # Keep the per-cell scores alongside the other cell annotations
    adata.obs['doublet_score'] = doublet_scores

    print("\nDoublet detection:")
    if predicted_doublets is None:
        # Scrublet returns None for the doublet calls when it cannot pick a
        # score threshold automatically; leave the cells untouched rather
        # than crashing on `.sum()` / boolean indexing.
        print("  Could not determine a doublet score threshold automatically; "
              "no cells removed.")
        print("  Set one manually with scrub.call_doublets(threshold=...)")
    else:
        adata.obs['predicted_doublet'] = predicted_doublets
        print(f"  Predicted doublets: {predicted_doublets.sum()}")
        print(f"  Doublet rate: {100 * predicted_doublets.mean():.1f}%")

        # Remove doublets; .copy() turns the subset view into a standalone
        # AnnData so later in-place operations do not act on a view.
        adata = adata[~adata.obs['predicted_doublet'], :].copy()
        print(f"  Final cells after doublet removal: {adata.n_obs}")
else:
    print("Scrublet not available - skipping doublet detection")


## 5. Summary


In [None]:
# Before/after table comparing the original dataset with the filtered one
banner = "=" * 50
divider = "-" * 50
# Shared row layout: a 20-wide label followed by three 12-wide, left-aligned columns
format_row = "{:<20} {:<12} {:<12} {:<12}".format

cells_after = adata.n_obs
genes_after = adata.n_vars
pct_cells_retained = 100 * cells_after / n_cells_original

print(banner)
print("FILTERING SUMMARY")
print(banner)
print(format_row('Metric', 'Before', 'After', 'Removed'))
print(divider)
print(format_row('Cells', n_cells_original, cells_after, n_cells_original - cells_after))
print(format_row('Genes', n_genes_original, genes_after, n_genes_original - genes_after))
# The retention row has only two value columns and a percent-formatted figure
print(f"{'% Cells retained':<20} {'100%':<12} {pct_cells_retained:.1f}%")


## Exercise Questions

1. What percentage of cells were removed by each filter?
2. How does the predicted doublet rate compare to the expected rate for this technology (set to 6% via `expected_doublet_rate=0.06`)?
3. What would happen if you set MAX_GENES too low?
4. Would you use the same thresholds for a tumor sample vs healthy tissue?


In [None]:
# Your answers here

