Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,7 @@
/output
trace-*
.ipynb_checkpoints
/temp
/temp
__pycache__/
*.pyc
*.pyo
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## NEW FUNCTIONALITY

* Added CellMapper method (two variants: simple PCA/CCA fallback and modality-specific scvi-tools models for joint mod1 representation) (PR #10).

* Added Novel method (PR #2).

* Added Simple MLP method (PR #3).
Expand Down
73 changes: 73 additions & 0 deletions src/methods/cellmapper_linear/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
__merge__: ../../api/comp_method.yaml
name: cellmapper_linear
label: CellMapper+PCA/CCA
summary: "Modality prediction in a PCA/CCA space using CellMapper"
description: |
CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset (test) in a PCA/CCA latent space.
references:
doi:
- 10.5281/zenodo.15683594
links:
documentation: https://cellmapper.readthedocs.io/en/latest/
repository: https://github.com/quadbio/cellmapper
info:
preferred_normalization: log_cp10k
# Each variant below names a parameter combination of this method
# (fallback representation x optional HVG mask x kernel).
variants:
cellmapper-pca:
fallback_representation: joint_pca
# NOTE(review): YAML has no `None` keyword, so this value is parsed as the
# string "None", not as a null. If "no mask" is intended, confirm the script
# handles the string, or write `null` / omit the key.
mask_var: None
kernel_method: hnoca
cellmapper-pca-hvg:
fallback_representation: joint_pca
mask_var: "hvg"
kernel_method: hnoca
cellmapper-pca-hvg-gauss:
fallback_representation: joint_pca
mask_var: "hvg"
kernel_method: gauss
cellmapper-cca:
fallback_representation: fast_cca
# NOTE(review): parsed as the string "None", not null (see YAML null
# resolution rules) — confirm this is the intended "no mask" value.
mask_var: None
kernel_method: hnoca
cellmapper-cca-hvg:
fallback_representation: fast_cca
mask_var: "hvg"
kernel_method: hnoca
cellmapper-cca-hvg-gauss:
fallback_representation: fast_cca
mask_var: "hvg"
kernel_method: gauss
arguments:
- name: "--fallback_representation"
type: "string"
choices: ["joint_pca", "fast_cca"]
default: "fast_cca"
description: Fallback representation to use for k-NN mapping (computed if use_rep is None).
- name: "--mask_var"
type: "string"
description: Variable to mask for fallback representation.
- name: "--kernel_method"
type: "string"
choices: ["hnoca", "gauss"]
default: "hnoca"
description: Kernel function to compute k-NN edge weights.
- name: "--n_neighbors"
type: "integer"
default: 30
description: Number of neighbors to consider for k-NN graph construction.
resources:
- type: python_script
path: script.py
engines:
- type: docker
image: openproblems/base_python:1
setup:
- type: python
packages:
- cellmapper>=0.2.2
runners:
- type: executable
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
60 changes: 60 additions & 0 deletions src/methods/cellmapper_linear/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import anndata as ad
import cellmapper as cm
from scipy.sparse import csc_matrix

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
# The values below are development defaults for running the script directly.
par = {
'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod1.h5ad',
'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/train_mod2.h5ad',
'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_cite/normal/test_mod1.h5ad',
'output': 'output.h5ad',
'fallback_representation': 'joint_pca', # 'joint_pca' or 'fast_cca' (choices declared in config.vsh.yaml)
'n_neighbors': 30, # number of neighbors for k-NN graph construction
'kernel_method': 'gauss', # 'hnoca' or 'gauss' (choices declared in config.vsh.yaml)
'mask_var': "hvg" # variable to mask for fallback representation
}
meta = {
'name': 'cellmapper_linear', # written to the output as uns['method_id']
}
## VIASH END

print('Reading input files', flush=True)
# Load the three inputs in the same order as they are declared in `par`.
input_train_mod1, input_train_mod2, input_test_mod1 = (
    ad.read_h5ad(par[input_key])
    for input_key in ('input_train_mod1', 'input_train_mod2', 'input_test_mod1')
)

print('Prepare the data', flush=True)
# Both mod1 objects get their normalized layer copied into .X.
for mod1_adata in (input_train_mod1, input_test_mod1):
    mod1_adata.X = mod1_adata.layers["normalized"].copy()

# Attach the normalized mod2 training values to the reference under obsm["mod2"]
# so they can later be mapped onto the query cells.
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]

print("Set up and prepare Cellmapper", flush=True)
# The config variants spell "no mask" as `mask_var: None`, which YAML parses as
# the *string* "None" rather than a null. Normalise that (and an unset/missing
# argument) to a real Python None so it is not forwarded as a `.var` column name.
mask_var = par.get('mask_var')
if mask_var == "None":
    mask_var = None

# The query is the test set, the reference is the training set (both mod1).
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
cmap.compute_neighbors(
    use_rep=None,  # no precomputed representation: use the fallback below
    fallback_representation=par['fallback_representation'],
    n_neighbors=par['n_neighbors'],
    fallback_kwargs={"mask_var": mask_var},
)
cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])

print("Predict on test data", flush=True)
# Map the reference's obsm["mod2"] onto the query; the result is stored on the
# query object under obsm["mod2_pred"] (key + "_" + prediction_postfix).
cmap.map_obsm(key="mod2", prediction_postfix="pred")
mod2_pred = csc_matrix(cmap.query.obsm["mod2_pred"])

print("Write output AnnData to file", flush=True)
# Metadata that identifies which dataset and which method produced the prediction.
output_uns = {
    'dataset_id': input_train_mod1.uns['dataset_id'],
    'method_id': meta["name"],
}
# Predictions go into the "normalized" layer; cells come from the test set,
# features from the mod2 training data.
output = ad.AnnData(
    layers={"normalized": mod2_pred},
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns=output_uns,
)
output.write_h5ad(par['output'], compression='gzip')
84 changes: 84 additions & 0 deletions src/methods/cellmapper_scvi/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
__merge__: ../../api/comp_method.yaml
name: cellmapper_scvi
label: CellMapper+scVI
summary: "Modality prediction in an scVI latent space using CellMapper"
description: |
CellMapper is a general framework for k-NN based mapping tasks in single-cell and spatial genomics.
This variant uses CellMapper to project modalities from a reference dataset (train) onto a query dataset
(test) in a modality-specific latent space computed with suitable scvi-tools models. For gene expression data,
we use the scVI model on raw counts (nb likelihood), for ADT data, we use the scVI models on normalized counts
(gaussian likelihood), and for ATAC data, we use the PeakVI model on raw counts. The actual CellMapper pipeline is
modality-agnostic.
references:
doi:
- 10.5281/zenodo.15683594
links:
documentation: https://cellmapper.readthedocs.io/en/latest/
repository: https://github.com/quadbio/cellmapper
info:
preferred_normalization: log_cp10k
variants:
cellmapper_hnoca_hvg:
kernel_method: hnoca
use_hvg: true
adt_normalization: clr
cellmapper_hnoca_all_genes:
kernel_method: hnoca
use_hvg: false
adt_normalization: clr
cellmapper_gauss_hvg:
kernel_method: gauss
use_hvg: true
adt_normalization: clr
cellmapper_gauss_hvg_log_cp10k:
kernel_method: gauss
use_hvg: true
adt_normalization: log_cp10k
cellmapper_gauss_all_genes:
kernel_method: gauss
use_hvg: false
adt_normalization: clr

arguments:
- name: "--kernel_method"
type: "string"
choices: ["hnoca", "gauss"]
default: "hnoca"
description: Kernel function to compute k-NN edge weights (CellMapper parameter).
- name: "--n_neighbors"
type: "integer"
default: 30
description: Number of neighbors to consider for k-NN graph construction (CellMapper parameter).
- name: "--use_hvg"
type: boolean
default: true
description: Whether to use highly variable genes (HVG) for the mapping (Generic analysis parameter).
- name: "--adt_normalization"
type: "string"
choices: ["clr", "log_cp10k"]
default: "clr"
description: Normalization method for ADT data, clr = centered log ratio.
- name: "--plot_umap"
type: boolean
default: false
description: Whether to plot the UMAP embedding of the latent space (for diagnostic purposes).
resources:
- type: python_script
path: script.py
- path: utils.py
dest: utils.py
engines:
- type: docker
image: openproblems/base_pytorch_nvidia:1.0.0
setup:
- type: python
packages:
- cellmapper>=0.2.2
- scvi-tools>=1.3.0
- muon>=0.1.6

runners:
- type: executable
- type: nextflow
directives:
label: [midtime,midmem,midcpu,gpu]
80 changes: 80 additions & 0 deletions src/methods/cellmapper_scvi/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import sys
import anndata as ad
import cellmapper as cm
from scipy.sparse import csc_matrix

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
# The values below are development defaults for running the script directly.
par = {
'input_train_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod1.h5ad',
'input_train_mod2': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/train_mod2.h5ad',
'input_test_mod1': 'resources_test/task_predict_modality/openproblems_neurips2021/bmmc_multiome/swap/test_mod1.h5ad',
'output': 'output.h5ad',
'n_neighbors': 30, # number of neighbors for k-NN graph construction
'kernel_method': 'hnoca', # 'hnoca' or 'gauss' (choices declared in config.vsh.yaml)
'use_hvg': False, # whether to use highly variable genes for the mapping
'adt_normalization': 'clr', # Normalization method for ADT data
'plot_umap': True, # whether to plot a UMAP of the latent space (diagnostics)

}
meta = {
'name': 'cellmapper_scvi', # written to the output as uns['method_id']
'resources_dir': 'target/executable/methods/cellmapper_scvi', # appended to sys.path to import utils.py
}
## VIASH END

# utils.py is shipped as a resource next to the script; make it importable.
sys.path.append(meta['resources_dir'])
from utils import get_representation

print('Reading input files', flush=True)
# Load the three inputs in the same order as they are declared in `par`.
input_train_mod1, input_train_mod2, input_test_mod1 = (
    ad.read_h5ad(par[input_key])
    for input_key in ('input_train_mod1', 'input_train_mod2', 'input_test_mod1')
)

# The modality names are stored by the dataset in .uns['modality'].
mod1 = input_train_mod1.uns['modality']
mod2 = input_train_mod2.uns['modality']
print(f"Modality 1: {mod1}, n_features: {input_train_mod1.n_vars}", flush=True)
print(f"Modality 2: {mod2}, n_features: {input_train_mod2.n_vars}", flush=True)

print("Concatenating train and test data", flush=True)
# Stack train and test mod1 cells into one object; obs["split"] records the
# origin of each cell so the two parts can be separated again later.
adata = ad.concat(
    [input_train_mod1, input_test_mod1],
    merge="same",
    label="split",
    keys=["train", "test"],
)

# Embed train and test cells jointly; the helper selects the model from the modality.
print("Get latent representation", flush=True)
adata = get_representation(
    adata=adata,
    modality=mod1,
    use_hvg=par['use_hvg'],
    adt_normalization=par['adt_normalization'],
    plot_umap=par['plot_umap'],
)

# Split the joint embedding back onto the individual train / test objects.
for split_name, target_adata in (("train", input_train_mod1), ("test", input_test_mod1)):
    in_split = adata.obs["split"] == split_name
    target_adata.obsm["X_scvi"] = adata[in_split].obsm["X_scvi"].copy()

# Attach the normalized mod2 training values to the reference under obsm["mod2"]
# so they can later be mapped onto the query cells.
input_train_mod1.obsm["mod2"] = input_train_mod2.layers["normalized"]

print('Setup and prepare Cellmapper', flush=True)
# The query is the test set, the reference is the training set; neighbors are
# searched in the precomputed "X_scvi" representation.
cmap = cm.CellMapper(query=input_test_mod1, reference=input_train_mod1)
cmap.compute_neighbors(use_rep="X_scvi", n_neighbors=par['n_neighbors'])
cmap.compute_mapping_matrix(kernel_method=par['kernel_method'])

print("Predict on test data", flush=True)
# Map the reference's obsm["mod2"] onto the query; the result is read back from
# the query object under obsm["mod2_pred"] and stored as a CSC sparse matrix.
cmap.map_obsm(key="mod2", prediction_postfix="pred")
mapped_mod2 = cmap.query.obsm["mod2_pred"]
mod2_pred = csc_matrix(mapped_mod2)

print("Write output AnnData to file", flush=True)
# Metadata that identifies which dataset and which method produced the prediction.
output_uns = {
    'dataset_id': input_train_mod1.uns['dataset_id'],
    'method_id': meta["name"],
}
# Predictions go into the "normalized" layer; cells come from the test set,
# features from the mod2 training data.
output = ad.AnnData(
    layers={"normalized": mod2_pred},
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns=output_uns,
)
output.write_h5ad(par['output'], compression='gzip')
Loading