diff --git a/.gitignore b/.gitignore index 309271d..b88f162 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,7 @@ /output trace-* .ipynb_checkpoints -/temp \ No newline at end of file +/temp +__pycache__/ +*.pyc +*.pyo \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index f9aec7f..f056dc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## NEW FUNCTIONALITY +* Added CellMapper method (two variants: simple PCA/CCA fallback and modality-specific scvi-tools models for joint mod1 representation) (PR #10) + * Added Novel method (PR #2). * Added Simple MLP method (PR #3). diff --git a/src/methods/cellmapper_linear/config.vsh.yaml b/src/methods/cellmapper_linear/config.vsh.yaml new file mode 100644 index 0000000..bb18e26 --- /dev/null +++ b/src/methods/cellmapper_linear/config.vsh.yaml @@ -0,0 +1,73 @@ +__merge__: ../../api/comp_method.yaml +name: cellmapper_linear +label: CellMapper+PCA/CCA +summary: "Modality prediction in a PCA/CCA space using CellMapper" +description: | + CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics. + This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset (test) in a PCA/CCA latent space. +references: + doi: + - 10.5281/zenodo.15683594 +links: + documentation: https://cellmapper.readthedocs.io/en/latest/ + repository: https://github.com/quadbio/cellmapper +info: + preferred_normalization: log_cp10k + variants: + cellmapper-pca: + fallback_representation: joint_pca + mask_var: None + kernel_method: hnoca + cellmapper-pca-hvg: + fallback_representation: joint_pca + mask_var: "hvg" + kernel_method: hnoca + cellmapper-pca-hvg-gauss: + fallback_representation: joint_pca + mask_var: "hvg" + kernel_method: gauss + cellmapper-cca: + fallback_representation: fast_cca + mask_var: None + kernel_method: hnoca + cellmapper-cca-hvg: + fallback_representation: fast_cca + mask_var: "hvg" + kernel_method: hnoca + cellmapper-cca-hvg-gauss: + fallback_representation: fast_cca + mask_var: "hvg" + kernel_method: gauss +arguments: + - name: "--fallback_representation" + type: "string" + choices: ["joint_pca", "fast_cca"] + default: "fast_cca" + description: Fallback representation to use for k-NN mapping (computed if use_rep is None). + - name: "--mask_var" + type: "string" + description: Variable to mask for fallback representation. + - name: "--kernel_method" + type: "string" + choices: ["hnoca", "gauss"] + default: "hnoca" + description: Kernel function to compute k-NN edge weights. + - name: "--n_neighbors" + type: "integer" + default: 30 + description: Number of neighbors to consider for k-NN graph construction. +resources: + - type: python_script + path: script.py +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + packages: + - cellmapper>=0.2.2 +runners: + - type: executable + - type: nextflow + directives: + label: [midtime,midmem,midcpu] diff --git a/src/methods/cellmapper_linear/script.py b/src/methods/cellmapper_linear/script.py new file mode 100644 index 0000000..8863a22 --- /dev/null +++ b/src/methods/cellmapper_linear/script.py @@ -0,0 +1,60 @@ +import anndata as ad +import cellmapper as cm +from scipy.sparse import csc_matrix + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad', + 'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad', + 'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad', + 'output': 'output.h5ad', + 'fallback_representation': 'joint_pca', # or None for fallback_representation + 'n_neighbors': 30, + 'kernel_method': 'gauss', + 'mask_var': "hvg" # variable to mask for fallback representation +} +meta = { + 'name': 'cellmapper_linear', +} +## VIASH END + +print('Reading input files', flush=True) +input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) +input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) + +print('Prepare the data', flush=True) +# Make sure we have normalized data in .X for mod1 +input_train_mod1.X = input_train_mod1.layers["normalized"].copy() +input_test_mod1.X = input_test_mod1.layers["normalized"].copy() + +# copy the normalized layer to obsm for mod2 +input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] + +print("Set up and prepare Cellmapper", flush=True) +cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) +cmap.compute_neighbors( + use_rep=None, + fallback_representation=par['fallback_representation'], + n_neighbors=par['n_neighbors'], + fallback_kwargs={"mask_var": par['mask_var']}, + ) +cmap.compute_mapping_matrix(kernel_method=par['kernel_method']) + +print("Predict on test data", flush=True) +cmap.map_obsm(key="mod2", prediction_postfix="pred") +mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + layers={"normalized": mod2_pred}, + obs=input_test_mod1.obs, + var=input_train_mod2.var, + uns={ + 'dataset_id': input_train_mod1.uns['dataset_id'], + 'method_id': meta["name"], + }, +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/cellmapper_scvi/config.vsh.yaml b/src/methods/cellmapper_scvi/config.vsh.yaml new file mode 100644 index 0000000..359579a --- /dev/null +++ b/src/methods/cellmapper_scvi/config.vsh.yaml @@ -0,0 +1,84 @@ +__merge__: ../../api/comp_method.yaml +name: cellmapper_scvi +label: CellMapper+scVI +summary: "Modality prediction in an scVI latent space using CellMapper" +description: | + CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics. + This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset + (test) in a modality-specific latent space computed with suitable scvi-tools models. For gene expression data, + we use the scVI model on raw counts (nb likelihood), for ADT data, we use the scVI models on normalized counts + (gaussian likelihood), and for ATAC data, we use the PeakVI model on raw counts. The actual CellMapper pipeline is + modality-agnostic. +references: + doi: + - 10.5281/zenodo.15683594 +links: + documentation: https://cellmapper.readthedocs.io/en/latest/ + repository: https://github.com/quadbio/cellmapper +info: + preferred_normalization: log_cp10k + variants: + cellmapper_hnoca_hvg: + kernel_method: hnoca + use_hvg: true + adt_normalization: clr + cellmapper_hnoca_all_genes: + kernel_method: hnoca + use_hvg: false + adt_normalization: clr + cellmapper_gauss_hvg: + kernel_method: gauss + use_hvg: true + adt_normalization: clr + cellmapper_gauss_hvg_log_cp10k: + kernel_method: gauss + use_hvg: true + adt_normalization: log_cp10k + cellmapper_gauss_all_genes: + kernel_method: gauss + use_hvg: false + adt_normalization: clr + +arguments: + - name: "--kernel_method" + type: "string" + choices: ["hnoca", "gauss"] + default: "hnoca" + description: Kernel function to compute k-NN edge weights (CellMapper parameter). + - name: "--n_neighbors" + type: "integer" + default: 30 + description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter). + - name: "--use_hvg" + type: boolean + default: true + description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter). + - name: "--adt_normalization" + type: "string" + choices: ["clr", "log_cp10k"] + default: "clr" + description: Normalization method for ADT data, clr = centered log ratio. + - name: "--plot_umap" + type: boolean + default: false + description: Whether to plot the UMAP embedding of the latent space (for diagnoscic purposes) +resources: + - type: python_script + path: script.py + - path: utils.py + dest: utils.py +engines: + - type: docker + image: openproblems/base_pytorch_nvidia:1.0.0 + setup: + - type: python + packages: + - cellmapper>=0.2.2 + - scvi-tools>=1.3.0 + - muon>=0.1.6 + +runners: + - type: executable + - type: nextflow + directives: + label: [midtime,midmem,midcpu,gpu] diff --git a/src/methods/cellmapper_scvi/script.py b/src/methods/cellmapper_scvi/script.py new file mode 100644 index 0000000..1f9e74b --- /dev/null +++ b/src/methods/cellmapper_scvi/script.py @@ -0,0 +1,80 @@ +import sys +import anndata as ad +import cellmapper as cm +from scipy.sparse import csc_matrix + +## VIASH START +# Note: this section is auto-generated by viash at runtime. To edit it, make changes +# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. +par = { + 'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad', + 'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad', + 'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad', + 'output': 'output.h5ad', + 'n_neighbors': 30, + 'kernel_method': 'hnoca', + 'use_hvg': False, + 'adt_normalization': 'clr', # Normalization method for ADT data + 'plot_umap': True, + +} +meta = { + 'name': 'cellmapper_scvi', + 'resources_dir': 'target/executable/methods/cellmapper_scvi', +} +## VIASH END + +sys.path.append(meta['resources_dir']) +from utils import get_representation + +print('Reading input files', flush=True) +input_train_mod1 = ad.read_h5ad(par['input_train_mod1']) +input_train_mod2 = ad.read_h5ad(par['input_train_mod2']) +input_test_mod1 = ad.read_h5ad(par['input_test_mod1']) + +mod1 = input_train_mod1.uns['modality'] +mod2 = input_train_mod2.uns['modality'] +print(f"Modality 1: {mod1}, n_features: {input_train_mod1.n_vars}", flush=True) +print(f"Modality 2: {mod2}, n_features: {input_train_mod2.n_vars}", flush=True) + +print("Concatenating train and test data", flush=True) +adata = ad.concat( + [input_train_mod1, input_test_mod1], merge = "same", label="split", keys=["train", "test"] + ) + +# Compute a latent representation using an appropriate model based on the modality +print("Get latent representation", flush=True) +adata = get_representation( + adata=adata, modality=mod1, use_hvg=par['use_hvg'], adt_normalization=par['adt_normalization'], plot_umap=par['plot_umap'] + ) + +# Place the representation back into individual objects +input_train_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "train"].obsm["X_scvi"].copy() +input_test_mod1.obsm["X_scvi"] = adata[adata.obs["split"] == "test"].obsm["X_scvi"].copy() + +# copy the normalized layer to obsm for mod2 +input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"] + +print('Setup and prepare Cellmapper', flush=True) +cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1) +cmap.compute_neighbors( + use_rep="X_scvi", + n_neighbors=par['n_neighbors'], + ) +cmap.compute_mapping_matrix(kernel_method=par['kernel_method']) + +print("Predict on test data", flush=True) +cmap.map_obsm(key="mod2", prediction_postfix="pred") +mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"]) + +print("Write output AnnData to file", flush=True) +output = ad.AnnData( + layers={"normalized": mod2_pred}, + obs=input_test_mod1.obs, + var=input_train_mod2.var, + uns={ + 'dataset_id': input_train_mod1.uns['dataset_id'], + 'method_id': meta["name"], + }, +) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/cellmapper_scvi/utils.py b/src/methods/cellmapper_scvi/utils.py new file mode 100644 index 0000000..f059877 --- /dev/null +++ b/src/methods/cellmapper_scvi/utils.py @@ -0,0 +1,100 @@ +from typing import Literal +import anndata as ad +import scvi +from scipy.sparse import issparse, csr_matrix, csc_matrix +import muon +import scanpy as sc + + +def get_representation( + adata: ad.AnnData, + modality: Literal["GEX", "ADT", "ATAC"], + use_hvg: bool = True, + adt_normalization: Literal["clr", "log_cp10k"] = "clr", + plot_umap: bool = False, + ) -> ad.AnnData: + """ + Get a joint latent space representation of the data based on the modality. + + Parameters + ---------- + adata + AnnData object containing concateneted train and test data from the same modality. + modality + The modality of the data, one of "GEX", "ADT", or "ATAC". Depeding on the modality, we fit the following models: + + - "GEX": scVI model for gene expression data with ZINB likelihood on raw counts. + - "ADT": scVI model for ADT data (surface proteins) with Gaussian likelihood on normalized data. + - "ATAC": PeakVI model for ATAC data with Bernoulli likelihood on binarized count data. + + We assume that regardless of the modality, the raw data will be stored in the `counts` layer + (e.g. UMI counts for GEX and peak counts for ATAC), and the normalized data in the `normalized` layer. + use_hvg + Whether to subset the data to highly variable genes (HVGs) before training the model + adt_normalization + Normalization method for ADT data. Options are: + - "clr" (centered log-ratio transformation) + - "log_cp10k" (normalization to 10k counts per cell and logarithm transformation) + plot_umap + Purely for diagnostic purposes, to see whether the data integration looks ok, this optionally computes + a UMAP in shared latent space and stores a plot. + + Returns + ------- + ad.AnnData + AnnData object with the latent representation in `obsm["X_scvi"]`, regardless of the modality. + """ + # Subset to highly variable features + if "hvg" in adata.var.columns and use_hvg: + n_hvg = adata.var["hvg"].sum() + print(f"Subsetting to {n_hvg} highly variable features ({n_hvg/adata.n_vars:.2%})", flush=True) + adata = adata[:, adata.var["hvg"]].copy() + else: + print("Training on all available features", flush=True) + + # Setup the AnnData object for scVI + if modality == "GEX": + layer = "counts" + scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) + model = scvi.model.SCVI(adata) + + elif modality == "ADT": + print(f"Normalizing the ADT data using method '{adt_normalization}'") + if adt_normalization == "clr": + adata.X = csc_matrix(adata.layers["counts"]) # Use raw counts for ADT + muon.prot.pp.clr(adata) + adata.layers["adt_normalized"] = csr_matrix(adata.X) + elif adt_normalization == "log_cp10k": + adata.layers["adt_normalized"] = adata.layers["normalized"] + else: + raise ValueError(f"Unknown ADT normalization method: {adt_normalization}") + + layer = "adt_normalized" + scvi.model.SCVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) + model = scvi.model.SCVI(adata, gene_likelihood="normal", n_layers=1, n_latent=10) + elif modality == "ATAC": + layer = "counts" + scvi.model.PEAKVI.setup_anndata(adata, layer=layer, categorical_covariate_keys=["split", "batch"]) + model = scvi.model.PEAKVI(adata) + else: + raise ValueError(f"Unknown modality: {modality}") + + example_data = adata.layers[layer].data if issparse(adata.layers[layer]) else adata.layers[layer] + print(f"Set up AnnData for modality: '{modality}' using layer: '{layer}'", flush=True) + print(f"Data looks like this: {example_data}", flush=True) + print(model, flush=True) + + # Train the model + model.train(early_stopping=True) + + # Get the latent representation + adata.obsm["X_scvi"] = model.get_latent_representation() + + if plot_umap: + sc.pp.neighbors(adata, use_rep="X_scvi") + sc.tl.umap(adata) + + plot_name = f"_{modality}_{adt_normalization}_use_hvg_{use_hvg}.png" + sc.pl.embedding(adata, basis="umap", color=["batch", "split"], show=False, save=plot_name) + + return adata \ No newline at end of file diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 75a7429..ea4396c 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -68,6 +68,8 @@ dependencies: - name: control_methods/solution - name: methods/knnr_py - name: methods/knnr_r + - name: methods/cellmapper_linear + - name: methods/cellmapper_scvi - name: methods/lm - name: methods/guanlab_dengkw_pm - name: methods/novel diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index a032696..6a7989d 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -13,6 +13,8 @@ methods = [ solution, knnr_py, knnr_r, + cellmapper_linear, + cellmapper_scvi, lm, guanlab_dengkw_pm, novel,