# Lab 2: Diagnosing Batch Effects

**Module 2** - Identifying and Quantifying Batch Effects

## Objectives
- Visualize batch effects
- Compute batch mixing metrics (LISI)
- Determine if integration is needed


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Logging level 3 makes scanpy print errors, warnings, info messages and hints.
sc.settings.verbosity = 3
# Render figures at 100 dpi on a white background.
sc.settings.set_figure_params(dpi=100, facecolor='white')

# For this lab, we'll simulate batches using PBMC data
# In practice, use real multi-batch datasets
# The later cells read the PCA (`obsm['X_pca']`), the UMAP embedding and the
# `louvain` clusters from this object.
adata = sc.datasets.pbmc3k_processed()

# Simulate two "batches" by splitting the data
# Each cell is assigned to one of the two batches uniformly at random,
# independent of its expression profile, so no real batch effect is
# introduced: expect the batches to look well mixed in the diagnostics below.
# The fixed seed keeps the assignment reproducible between runs.
np.random.seed(42)
adata.obs['batch'] = np.random.choice(['batch_1', 'batch_2'], size=adata.n_obs)
print(f"Batch distribution:\n{adata.obs['batch'].value_counts()}")


In [None]:
# Visual inspection: the same UMAP embedding shown twice, side by side,
# colored once by batch and once by cluster label.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# (obs column to color by, panel title), in left-to-right order
panels = (
    ('batch', 'Colored by Batch'),
    ('louvain', 'Colored by Cell Type'),
)
for ax, (obs_key, panel_title) in zip(axes, panels):
    # show=False defers rendering until both panels have been drawn
    sc.pl.umap(adata, color=obs_key, ax=ax, show=False, title=panel_title)

plt.tight_layout()
plt.show()

# How to read the plot:
# - batches occupy clearly different regions -> you need integration
# - batches mix well within each cell type   -> integration may not be needed


In [None]:
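# Compute LISI (Local Inverse Simpson's Index): the effective number of
# distinct labels found in each cell's local neighborhood.
# iLISI (batch labels): from 1 to the number of batches; higher = better batch mixing
# cLISI (cell type labels): from 1 to the number of cell types; lower = better cell type separation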

def compute_lisi(adata, batch_key='batch', label_key='louvain', perplexity=30):
    """Compute simplified iLISI and cLISI scores from nearest neighbors in PCA space.

    For every cell, the inverse Simpson's index ``1 / sum(p_c ** 2)`` is
    computed from the label proportions ``p_c`` among its nearest neighbors,
    and the per-cell values are averaged. A value of 1 means a neighborhood
    contains a single label; the maximum equals the number of distinct labels.

    This is a simplification of the published LISI: all neighbors are weighted
    equally instead of through a perplexity-calibrated Gaussian kernel, so
    ``perplexity`` is used directly as the neighborhood size.

    Parameters
    ----------
    adata
        Object providing ``obsm['X_pca']`` and the two ``obs`` columns named
        by ``batch_key`` and ``label_key``.
    batch_key
        ``obs`` column with batch labels (scored as iLISI).
    label_key
        ``obs`` column with cell type / cluster labels (scored as cLISI).
    perplexity
        Number of nearest neighbors per cell, not counting the cell itself.

    Returns
    -------
    tuple
        ``(mean iLISI, mean cLISI)`` over all cells.
    """
    from sklearn.neighbors import NearestNeighbors

    # Neighbors are searched in (at most) the first 30 columns of X_pca.
    X = adata.obsm['X_pca'][:, :30]
    nn = NearestNeighbors(n_neighbors=perplexity)
    nn.fit(X)
    # Querying without an argument returns the neighbors of the fitted points
    # and leaves each point out of its own neighbor list. Passing X again
    # would make every cell its own first neighbor and bias both scores.
    _, indices = nn.kneighbors()

    ilisi = _mean_inverse_simpson(adata.obs[batch_key], indices)
    clisi = _mean_inverse_simpson(adata.obs[label_key], indices)
    return ilisi, clisi


def _mean_inverse_simpson(labels, neighbor_idx):
    """Average the inverse Simpson's index of ``labels`` over the rows of ``neighbor_idx``."""
    # Integer codes allow counting with numpy instead of one pandas
    # value_counts() call per cell. Missing labels are coded as -1.
    codes, uniques = pd.factorize(labels)
    n_cells = neighbor_idx.shape[0]
    n_labels = len(uniques)

    neighbor_codes = codes[neighbor_idx]  # (n_cells, n_neighbors)
    # Leave missing labels out of the counts, as value_counts() would.
    labelled = neighbor_codes >= 0
    row_ids = np.broadcast_to(np.arange(n_cells)[:, None], neighbor_codes.shape)

    # One bincount over "row * n_labels + code" yields the per-cell label counts.
    counts = np.bincount(
        row_ids[labelled] * n_labels + neighbor_codes[labelled],
        minlength=n_cells * n_labels,
    ).reshape(n_cells, n_labels)

    props = counts / counts.sum(axis=1, keepdims=True)
    return np.mean(1.0 / (props ** 2).sum(axis=1))

# Score the simulated batches. Both values are inverse Simpson's indices, so
# iLISI lies between 1 and the number of batches (2 here) and cLISI between
# 1 and the number of louvain clusters.
iLISI, cLISI = compute_lisi(adata)
print(f"iLISI (batch mixing): {iLISI:.3f} (higher=better mixing)")
print(f"cLISI (cell type): {cLISI:.3f} (lower=better separation)")
