# Lab 8: Integration Evaluation

**Module 8** - Comprehensive Quality Assessment

## Objectives
- Calculate batch mixing metrics
- Assess biological conservation
- Compare multiple methods


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Optional dependency: scib is needed for the full evaluation.
# Install it with: pip install scib
try:
    import scib
except ImportError:
    # Record the absence so later cells can check the flag before using scib.
    SCIB_AVAILABLE = False
    print("scib not installed. Run: pip install scib")
else:
    SCIB_AVAILABLE = True

# Shared scanpy figure defaults for every plot in this notebook.
sc.settings.set_figure_params(facecolor='white', dpi=100)


In [None]:
# Simple evaluation metrics implementation

from sklearn.metrics import silhouette_score, adjusted_rand_score

def _label_silhouette(X, labels):
    """Return the silhouette score of ``X`` grouped by ``labels``.

    Returns NaN instead of raising when the score is undefined, i.e. when
    there are fewer than two groups or every sample is its own group.
    """
    # Integer codes per category; missing labels get code -1 and are therefore
    # scored as a group of their own.
    codes = labels.astype('category').cat.codes
    n_groups = codes.nunique()
    # silhouette_score raises ValueError unless 2 <= n_groups <= n_samples - 1.
    if not 1 < n_groups < len(codes):
        return float('nan')
    return silhouette_score(X, codes)


def evaluate_integration(adata, batch_key='batch', label_key='louvain', embed_key='X_pca',
                         n_comps=30, cluster_key='leiden'):
    """Compute basic integration metrics.

    Parameters
    ----------
    adata
        AnnData-like object; ``.obs``, ``.obsm`` and ``.X`` are read.
    batch_key
        Column of ``adata.obs`` holding the batch assignment (required).
    label_key
        Column of ``adata.obs`` holding the reference cell-type labels.
        Label-based metrics are skipped when the column is absent.
    embed_key
        Key of ``adata.obsm`` with the embedding to score. When the key is
        missing, ``adata.X`` is scored instead.
    n_comps
        Number of leading embedding columns to use; ``None`` uses all of
        them. Ignored when falling back to ``adata.X``.
    cluster_key
        Column of ``adata.obs`` holding cluster assignments to compare with
        ``label_key`` via the adjusted Rand index.

    Returns
    -------
    dict
        ``'batch_silhouette'`` (lower means better batch mixing), plus
        ``'celltype_silhouette'`` (higher means better separation) when
        ``label_key`` exists, plus ``'ARI'`` when both ``label_key`` and a
        distinct ``cluster_key`` exist. A silhouette entry is NaN when its
        labels form fewer than two groups or one group per sample.

    Raises
    ------
    KeyError
        If ``batch_key`` is not a column of ``adata.obs``.
    """
    if embed_key in adata.obsm:
        X = adata.obsm[embed_key]
        if n_comps is not None:
            X = X[:, :n_comps]
    else:
        # No such embedding stored: score the main data matrix instead.
        X = adata.X

    has_labels = label_key in adata.obs
    results = {}

    # Silhouette score for batch (should be LOW = good mixing)
    results['batch_silhouette'] = _label_silhouette(X, adata.obs[batch_key])

    # Silhouette score for cell types (should be HIGH = good separation)
    if has_labels:
        results['celltype_silhouette'] = _label_silhouette(X, adata.obs[label_key])

    # ARI between reference labels and cluster assignments. Comparing a column
    # with itself would always give a meaningless 1.0, so that case is skipped.
    if has_labels and cluster_key in adata.obs and cluster_key != label_key:
        results['ARI'] = adjusted_rand_score(
            adata.obs[label_key],
            adata.obs[cluster_key],
        )

    return results

# Example usage: score scanpy's pbmc3k_processed dataset after attaching a
# randomly assigned two-level batch column.
adata = sc.datasets.pbmc3k_processed()
np.random.seed(42)  # fixed seed so the random batch assignment is repeatable
batch_names = ['b1', 'b2']
adata.obs['batch'] = np.random.choice(batch_names, size=adata.n_obs)

metrics = evaluate_integration(adata, label_key='louvain', batch_key='batch')
print("Integration metrics:")
for metric_name in metrics:
    score = metrics[metric_name]
    print(f"  {metric_name}: {score:.4f}")
