# Lab 1: Explore High-Dimensional Gene Expression

## Objectives
- Build intuition for high-dimensional data
- See why distances become less informative (“distance concentration”)
- Understand why we select highly variable genes (HVGs) and run PCA before clustering

## Outputs
- Save notes to `../results/lab01_highdim_notes.md`

---


In [None]:
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Set scanpy's figure defaults (resolution and background colour)
sc.settings.set_figure_params(dpi=110, facecolor='white')

# Load scanpy's PBMC 3k example dataset
adata = sc.datasets.pbmc3k()
# Basic QC: keep cells with at least 200 detected genes,
# and genes detected in at least 3 cells
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Normalize each cell to a total of 10,000 counts, then apply log(1 + x);
# the steps below operate on these log-normalized values
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Print a summary of the filtered, normalized object
print(adata)


In [None]:
# Distance concentration demo
# Contrast pairwise cell-cell distances in full gene space with the same
# distances in PCA space, using one shared random subset of cells.

from sklearn.metrics import pairwise_distances

# Fixed seed so the same cells are sampled on every run
np.random.seed(0)
n_sample = min(400, adata.n_obs)
idx = np.random.choice(adata.n_obs, size=n_sample, replace=False)

# Densify the sampled rows when the expression matrix exposes `toarray`
X = adata.X[idx]
if hasattr(X, 'toarray'):
    X = X.toarray()

# High-dimensional case: one coordinate per gene
D_gene = pairwise_distances(X, metric='euclidean')

# Low-dimensional case: top 2000 HVGs -> scale (values capped at 10) -> 30 PCs
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
hvg_mask = adata.var.highly_variable
adata_hvg = adata[:, hvg_mask].copy()
sc.pp.scale(adata_hvg, max_value=10)
sc.tl.pca(adata_hvg, n_comps=30)
X_pca = adata_hvg.obsm['X_pca'][idx]
D_pca = pairwise_distances(X_pca, metric='euclidean')

# Compare distance distributions
# Each unordered pair of cells is counted once: only the strict upper
# triangle of the distance matrix is used (k=1 skips the zero diagonal).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# '0.7' is Matplotlib's grayscale notation (70% white). R-style names such as
# 'grey70' are not valid Matplotlib colours and raise ValueError.
axes[0].hist(D_gene[np.triu_indices_from(D_gene, k=1)], bins=50, color='0.7')
axes[0].set_title('Gene space distances')
axes[0].set_xlabel('Euclidean distance')
axes[0].set_ylabel('Number of cell pairs')

axes[1].hist(D_pca[np.triu_indices_from(D_pca, k=1)], bins=50, color='steelblue')
axes[1].set_title('PCA space distances (30 PCs)')
axes[1].set_xlabel('Euclidean distance')
axes[1].set_ylabel('Number of cell pairs')

plt.tight_layout()
plt.show()

# Save notes
# Write the lab takeaways as a small Markdown file next to the other results,
# creating the results directory first if it does not exist yet.
notes_path = Path('../results/lab01_highdim_notes.md')
notes_path.parent.mkdir(exist_ok=True)

note_lines = [
    "# Lab 1 Notes\n",
    "\n",
    "- Gene-space distances often concentrate in high dimensions.\n",
    "- PCA space gives a more useful geometry for kNN graphs.\n",
]
notes_path.write_text("".join(note_lines))
print('Wrote ../results/lab01_highdim_notes.md')
