In [2]:
# cluster_and_plot_features.py
"""
Leiden clustering + UMAP + spatial plots for CLIP-GO features

Inputs:
  • AnnData with x_centroid / y_centroid
  • NPZ saved as: cell_id, vision_raw, vision_proj, text_raw, text_proj

Outputs (under CFG.out_dir):
  • clusters_<tag>.csv        (leiden labels per embedding)
  • umap_<tag>.png            (UMAP colored by leiden)
  • spatial_<tag>.png         (x/y centroid scatter colored by same palette)
  • adata_with_leiden_<cancer>.h5ad (adata with .obsm embeddings and leiden labels)
"""

from __future__ import annotations
import os, numpy as np, pandas as pd, scanpy as sc, matplotlib.pyplot as plt

# -----------------------
# Config (match your style)
# -----------------------
class CFG:
    """Static configuration namespace for the clustering / plotting script.

    All values are class attributes evaluated once at class-creation time;
    the class is never instantiated. Paths that depend on the dataset are
    derived from ``cancer`` so switching datasets only requires editing
    that single attribute.
    """

    # dataset selection
    cancer = "lung"            # key into xenium_sample_dict
    ground_truth = "refined"   # used in the preprocessed AnnData folder/file name

    # paths
    adata_root = "/rsrch9/home/plm/idso_fa1_pathology/TIER1/paul-xenium/public_data/10x_genomics"
    # cancer key -> Xenium output folder name
    xenium_sample_dict = {
        "lung": "Xenium_Prime_Human_Lung_Cancer_FFPE_outs",
        "breast": "Xenium_Prime_Breast_Cancer_FFPE_outs",
        "lymph_node": "Xenium_Prime_Human_Lymph_Node_Reactive_FFPE_outs",
        "prostate": "Xenium_Prime_Human_Prostate_FFPE_outs",
        "skin": "Xenium_Prime_Human_Skin_FFPE_outs",
        "ovarian": "Xenium_Prime_Ovarian_Cancer_FFPE_outs",
        "cervical": "Xenium_Prime_Cervical_Cancer_FFPE_outs",
    }
    xenium_sample = xenium_sample_dict[cancer]
    # NPZ created by your extractor.
    # The file name follows the selected cancer instead of being pinned to
    # "lung", so it stays in sync with xenium_sample.
    # NOTE(review): assumes the extractor names its output
    # features_<cancer>.npz for every dataset -- confirm against the saver.
    npz_path = (
        "/rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/"
        f"public_data/{xenium_sample}/GoCLIP/features_{cancer}.npz"
    )

    # outputs (sibling folder of the NPZ file)
    out_dir = os.path.join(os.path.dirname(npz_path), "leiden_eval")

    # neighbors / leiden / umap
    n_neighbors = 15
    resolution = 0.5
    umap_min_dist = 0.5
    umap_spread = 1.0
    umap_random_state = 0

    # plotting
    dot_size_umap = 8.0
    dot_size_spatial = 1.5
    alpha = 0.9
    invert_y = True   # image coords usually increase downward

# -----------------------
# Resolve AnnData path
# -----------------------
# Folder name of the selected Xenium run.
sample = CFG.xenium_sample_dict[CFG.cancer]

# Build the path in two steps: first the preprocessed directory for the
# chosen ground-truth variant, then the annotated h5ad file inside it.
adata_path = os.path.join(
    CFG.adata_root, sample, "preprocessed", f"fine_tune_{CFG.ground_truth}_v2"
)
adata_path = os.path.join(
    adata_path,
    f"processed_xenium_data_fine_tune_{CFG.ground_truth}_v2_annotated.h5ad",
)

# Make sure the output directory exists before anything is written to it.
os.makedirs(CFG.out_dir, exist_ok=True)

# -----------------------
# 1) Load AnnData + NPZ, align by cell_id
# -----------------------
adata = sc.read_h5ad(adata_path)
# The spatial plots need both centroid columns; fail early if either is absent.
need = {"x_centroid", "y_centroid"}
if not need.issubset(adata.obs.columns):
    raise ValueError(f"AnnData missing columns: {need - set(adata.obs.columns)}")

# np.load on an .npz returns a lazy NpzFile that keeps the archive open.
# Read every array inside the `with` block so the file handle is released
# as soon as the data is in memory.
with np.load(CFG.npz_path) as Z:
    ids_np = pd.Index(Z["cell_id"].astype(str))

    # exact keys from your saver
    feat_map = {
        "X_v_raw":  Z["vision_raw"],   # (N, 1536)
        "X_v_proj": Z["vision_proj"],  # (N, 256)
        "X_t_raw":  Z["text_raw"],     # (N, D_text)
        "X_t_proj": Z["text_proj"],    # (N, 256)
    }

# get_indexer below requires unique labels; report the problem explicitly
# rather than letting pandas raise an opaque indexing error.
if not ids_np.is_unique:
    raise ValueError("NPZ cell_id contains duplicate IDs; cannot align features to adata.")

# Align to intersection (and reorder to match adata)
common = adata.obs.index.intersection(ids_np)
if len(common) == 0:
    raise ValueError("No overlapping cell IDs between adata and NPZ features.")
order_in_npz = ids_np.get_indexer(common)  # NPZ row positions for adata[common]
adata = adata[common].copy()

for key, arr in feat_map.items():
    # Select the aligned rows first, then cast: this avoids materialising a
    # float32 copy of the full (unaligned) feature matrix.
    adata.obsm[key] = np.asarray(arr)[order_in_npz].astype(np.float32, copy=False)

print(f"Aligned cells: {adata.n_obs}")
print({k: adata.obsm[k].shape for k in feat_map.keys()})

# -----------------------
# 2) Neighbors + Leiden + UMAP per embedding
# -----------------------
# cosine for projected (they're L2-normalized); euclidean for raw
# One entry per (modality, variant) pair, keyed "<modality>_<variant>".
# The metric depends only on the variant: cosine for the projected
# embeddings, euclidean for the raw ones. Iteration order is
# v_proj, t_proj, v_raw, t_raw.
configs = {
    f"{modality}_{variant}": {
        "use_rep": f"X_{modality}_{variant}",
        "metric": metric,
    }
    for variant, metric in (("proj", "cosine"), ("raw", "euclidean"))
    for modality in ("v", "t")
}

def umap_and_leiden(adata, tag, use_rep, metric):
    """Build a neighbor graph on one embedding, then run UMAP and Leiden on it.

    Parameters
    ----------
    adata
        AnnData object; modified in place.
    tag
        Short label for the embedding; used to name the neighbors key
        (``neighbors_<tag>``), the Leiden column (``leiden_<tag>``), the
        per-tag UMAP copy (``X_umap_<tag>``) and the CSV file.
    use_rep
        Key in ``adata.obsm`` holding the embedding to cluster.
    metric
        Distance metric passed to ``sc.pp.neighbors``.

    Returns
    -------
    str
        Name of the ``adata.obs`` column that holds the Leiden labels.

    Side effects
    ------------
    Writes ``clusters_<tag>.csv`` (the Leiden column, indexed by cell) under
    ``CFG.out_dir``.
    """
    neigh_key = f"neighbors_{tag}"
    leiden_key = f"leiden_{tag}"
    sc.pp.neighbors(
        adata,
        use_rep=use_rep,
        n_neighbors=CFG.n_neighbors,
        metric=metric,
        key_added=neigh_key,
        method="umap",  # uses pynndescent if available → faster on big N
    )
    sc.tl.umap(
        adata,
        neighbors_key=neigh_key,
        min_dist=CFG.umap_min_dist,
        spread=CFG.umap_spread,
        random_state=CFG.umap_random_state,
    )
    # sc.tl.umap stores its layout in adata.obsm["X_umap"], which the next
    # call for another embedding overwrites. Keep a per-tag copy so the
    # saved AnnData retains the layout for every embedding, not just the
    # last one processed. "X_umap" itself is left in place for plotting.
    adata.obsm[f"X_umap_{tag}"] = adata.obsm["X_umap"].copy()
    sc.tl.leiden(
        adata,
        neighbors_key=neigh_key,
        key_added=leiden_key,
        resolution=CFG.resolution,
    )
    # save labels
    adata.obs[[leiden_key]].to_csv(os.path.join(CFG.out_dir, f"clusters_{tag}.csv"))
    return leiden_key

def save_umap(adata, color_key, fname_png):
    """Render scanpy's UMAP scatter coloured by ``color_key`` and save it as PNG.

    The figure is produced off-screen (``show=False``), written to
    ``fname_png`` at 200 dpi with a tight bounding box, then closed.
    """
    plot_options = {
        "color": color_key,
        "size": CFG.dot_size_umap,
        "legend_loc": "right margin",
        "frameon": False,
        "show": False,
        "return_fig": True,  # hand back the Figure so it can be saved and closed
    }
    figure = sc.pl.umap(adata, **plot_options)
    figure.savefig(fname_png, dpi=200, bbox_inches="tight")
    plt.close(figure)

def save_spatial(adata, color_key, fname_png):
    """Scatter cells at their x/y centroids, coloured by a categorical column.

    Parameters
    ----------
    adata
        AnnData with ``x_centroid`` / ``y_centroid`` in ``.obs``.
    color_key
        Name of the ``adata.obs`` column to colour by (cast to categorical).
    fname_png
        Output path; the figure is saved at 200 dpi with a tight bounding box.

    Colours are taken from ``adata.uns["<color_key>_colors"]`` when present
    and long enough to cover every category, so the plot matches the palette
    of the corresponding UMAP plot. Otherwise a cycling ``tab20`` palette is
    used.
    """
    cats = adata.obs[color_key].astype("category")
    categories = cats.cat.categories

    # Scanpy stores category colors under <key>_colors
    colors = adata.uns.get(f"{color_key}_colors", None)
    if colors is None or len(colors) < len(categories):
        # fallback palette.
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
        # pyplot.get_cmap accepts the same (name, lut) arguments.
        from matplotlib.colors import to_hex

        cmap = plt.get_cmap("tab20", 20)
        # Hex strings (rather than RGBA tuples) keep the mapped values
        # scalar, the same form as scanpy's stored palettes.
        colors = [to_hex(cmap(i % 20)) for i in range(len(categories))]
    colmap = dict(zip(categories, colors))
    # Map on an object Series so the result is a plain list of colour
    # strings, independent of how pandas re-categorises mapped values.
    point_colors = cats.astype(object).map(colmap).tolist()

    fig, ax = plt.subplots(figsize=(8, 8))
    try:
        ax.scatter(
            adata.obs["x_centroid"].values,
            adata.obs["y_centroid"].values,
            c=point_colors, s=CFG.dot_size_spatial, alpha=CFG.alpha,
            edgecolors="none", rasterized=True,
        )
        if CFG.invert_y:
            ax.invert_yaxis()
        ax.set_title(color_key)
        ax.set_xlabel("x_centroid")
        ax.set_ylabel("y_centroid")
        fig.savefig(fname_png, dpi=200, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

# For every configured embedding: cluster it, then write the UMAP and
# spatial plots using the resulting Leiden column.
for tag, cfg in configs.items():
    print(f"→ neighbors/UMAP/Leiden for {tag}")
    # cfg's keys (use_rep, metric) match umap_and_leiden's parameter names.
    leiden_key = umap_and_leiden(adata, tag, **cfg)

    umap_png = os.path.join(CFG.out_dir, f"umap_{tag}.png")
    spatial_png = os.path.join(CFG.out_dir, f"spatial_{tag}.png")

    print(f"   saving UMAP → {umap_png}")
    save_umap(adata, leiden_key, umap_png)

    print(f"   saving spatial → {spatial_png}")
    save_spatial(adata, leiden_key, spatial_png)

# -----------------------
# 3) Save updated AnnData with embeddings + clusters
# -----------------------
out_h5ad = os.path.join(
    CFG.out_dir,
    f"adata_with_leiden_{CFG.cancer}.h5ad",
)
adata.write(out_h5ad)
print(f"✓ wrote {out_h5ad}")


Aligned cells: 244659
{'X_v_raw': (244659, 1536), 'X_v_proj': (244659, 256), 'X_t_raw': (244659, 768), 'X_t_proj': (244659, 256)}
→ neighbors/UMAP/Leiden for v_proj


2025-08-12 20:09:27.491034: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-08-12 20:09:27.517496: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-08-12 20:09:27.517574: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-08-12 20:09:27.533778: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

 To achieve the future defaults please pass: flavor

   saving UMAP → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/public_data/Xenium_Prime_Human_Lung_Cancer_FFPE_outs/GoCLIP/leiden_eval/umap_v_proj.png
   saving spatial → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/public_data/Xenium_Prime_Human_Lung_Cancer_FFPE_outs/GoCLIP/leiden_eval/spatial_v_proj.png
→ neighbors/UMAP/Leiden for t_proj
   saving UMAP → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/public_data/Xenium_Prime_Human_Lung_Cancer_FFPE_outs/GoCLIP/leiden_eval/umap_t_proj.png
   saving spatial → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/public_data/Xenium_Prime_Human_Lung_Cancer_FFPE_outs/GoCLIP/leiden_eval/spatial_t_proj.png
→ neighbors/UMAP/Leiden for v_raw
   saving UMAP → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xenium/embeddings/public_data/Xenium_Prime_Human_Lung_Cancer_FFPE_outs/GoCLIP/leiden_eval/umap_v_raw.png
   saving spatial → /rsrch9/home/plm/idso_fa1_pathology/TIER2/paul-xen