# Lab 2: Principal Component Analysis

**Module 2** - Understanding PCA for scRNA-seq

## Objectives
- Understand the variance explained by each PC
- Select an appropriate number of PCs
- Interpret PC loadings
- Distinguish technical from biological variation


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt

# Scanpy session configuration: logging verbosity level 3, and figures
# rendered at 100 dpi on a white background.
sc.settings.verbosity = 3
sc.settings.set_figure_params(facecolor='white', dpi=100)

# Fetch the PBMC 3k dataset bundled with scanpy and report its dimensions.
adata = sc.datasets.pbmc3k()
print(f"Cells: {adata.n_obs}, Genes: {adata.n_vars}")


In [None]:
# Preprocessing
# Drop cells with fewer than 200 detected genes and genes seen in fewer
# than 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)

# Calculate QC metrics; genes whose names start with 'MT-' are flagged as
# mitochondrial so that per-cell pct_counts_mt is available in adata.obs.
adata.var['mt'] = adata.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)

# Filter: keep cells with < 2500 detected genes and < 5% mitochondrial counts.
# Both criteria are per-cell values computed above, so they combine into a
# single mask. The explicit .copy() matters: slicing an AnnData returns a
# view, and the in-place normalization below should operate on a standalone
# object rather than trigger an implicit copy (ImplicitModificationWarning).
keep_cells = (adata.obs['n_genes_by_counts'] < 2500) & (adata.obs['pct_counts_mt'] < 5)
adata = adata[keep_cells, :].copy()

# Normalize every cell to a total of 10,000 counts, then log1p-transform.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

print(f"After preprocessing: {adata.n_obs} cells")


In [None]:
# Flag the 2000 most variable genes; the boolean result is stored in
# adata.var['highly_variable'].
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
n_hvg = adata.var['highly_variable'].sum()
print(f"HVGs selected: {n_hvg}")

# Diagnostic plot of the HVG selection
sc.pl.highly_variable_genes(adata)


In [None]:
# Keep the current (all-gene, normalized) data reachable through adata.raw
# before the matrix is restricted to HVGs and scaled.
adata.raw = adata

# Restrict to highly variable genes. .copy() converts the view produced by
# slicing into a standalone AnnData, so sc.pp.scale modifies real data
# instead of relying on an implicit copy of a view (which emits an
# ImplicitModificationWarning).
adata = adata[:, adata.var.highly_variable].copy()

# Scale the genes (values clipped at max_value=10), then run PCA with 50
# components.
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50)

# Check variance explained
print("Variance ratios for first 10 PCs:")
print(adata.uns['pca']['variance_ratio'][:10])


In [None]:
# Elbow plot: variance ratio of each of the 50 PCs on a log scale, used to
# decide how many PCs to keep.
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50)

# Rule of thumb: keep PCs up to the point where the curve bends ("elbows"),
# which is often somewhere between 10 and 30 PCs.


In [None]:
# Examine PC loadings - which genes contribute most to each PC.
# Rows of adata.varm['PCs'] line up with adata.var_names; column 0 is PC1.
loadings = adata.varm['PCs']

# Top genes for PC1, ranked by absolute loading with the strongest first.
# The integer positions are kept so each gene's loading can be read directly,
# instead of searching the gene-name list once per gene (slow, and it would
# return the first match if names were ever duplicated).
pc1_loadings = loadings[:, 0]
top_idx_pc1 = np.argsort(np.abs(pc1_loadings))[::-1][:10]
top_genes_pc1 = adata.var_names[top_idx_pc1]
print("Top 10 genes contributing to PC1:")
for gene, loading in zip(top_genes_pc1, pc1_loadings[top_idx_pc1]):
    print(f"  {gene}: {loading:.4f}")


In [None]:
# Does PC1 track a technical covariate? Compare the PC1 score of every cell
# with its total counts.
from scipy.stats import pearsonr

pc1 = adata.obsm['X_pca'][:, 0]
total_counts = adata.obs['total_counts'].to_numpy()

pearson_result = pearsonr(pc1, total_counts)
corr, pval = pearson_result[0], pearson_result[1]
print(f"PC1 vs total_counts correlation: r={corr:.3f}, p={pval:.2e}")

# A strong correlation (|r| > 0.5) suggests PC1 may be capturing technical
# rather than biological variation; the scatter plot shows the relationship.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(total_counts, pc1, alpha=0.3, s=5)
ax.set_xlabel('Total counts')
ax.set_ylabel('PC1')
ax.set_title(f'PC1 vs Total Counts (r={corr:.3f})')
fig.tight_layout()
plt.show()


In [None]:
# PCA scatter plot, one panel per marker gene:
# CST3 (monocytes), NKG7 (NK cells), MS4A1 (B cells).
marker_genes = ['CST3', 'NKG7', 'MS4A1']
sc.pl.pca(adata, color=marker_genes)

# Cells of different types should occupy different regions of PC space.
