diff --git a/CHANGELOG.md b/CHANGELOG.md
index 84c45bce40d..6a1ae297595 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -52,6 +52,15 @@
   multisample workflows are executed in tandem. If you wish to execute the singlesample
   workflows in a separate manner and still include count based statistics, please run the
   `qc` pipeline on the result of the singlesample workflow (PR #604).
+ * `filter/filter_with_hvg` has been renamed to `feature_annotation/highly_variable_features_scanpy`, along with the following changes (PR #667).
+   - `--do_filter` was removed
+   - `--n_top_genes` has been renamed to `--n_top_features`
+ * `full_pipeline`, `multisample` and `rna_multisample`: Renamed arguments (PR #667).
+   - `--filter_with_hvg_var_output` became `--highly_variable_features_var_output`
+   - `--filter_with_hvg_obs_batch_key` became `--highly_variable_features_obs_batch_key`
+ * `rna_multisample`: Renamed arguments (PR #667).
+   - `--filter_with_hvg_n_top_genes` became `--highly_variable_features_n_top_features`
+   - `--filter_with_hvg_flavor` became `--highly_variable_features_flavor`
 * Renamed `obsm_metrics` to `uns_metrics` for the `cellranger_mapping` workflow because the
   cellranger metrics are stored in `.uns` and not `.obsm` (PR #610).
diff --git a/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml b/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml
new file mode 100644
index 00000000000..a83bb4fe6f7
--- /dev/null
+++ b/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml
@@ -0,0 +1,132 @@
+functionality:
+  name: highly_variable_features_scanpy
+  namespace: feature_annotation
+  description: |
+    Annotate highly variable features [Satija15] [Zheng17] [Stuart19].
+
+    Expects logarithmized data, except when flavor='seurat_v3', in which count data is expected.
+
+    Depending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].
+
+    For the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.
+
+    For [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.
+  authors:
+    - __merge__: /src/authors/dries_de_maeyer.yaml
+      roles: [ contributor ]
+    - __merge__: /src/authors/robrecht_cannoodt.yaml
+      roles: [ maintainer, contributor ]
+  arguments:
+    # input
+    - name: "--input"
+      type: file
+      description: Input h5mu file
+      direction: input
+      required: true
+      example: input.h5mu
+
+    - name: "--modality"
+      type: string
+      default: "rna"
+      required: false
+    - name: "--layer"
+      type: string
+      description: Use adata.layers[layer] for expression values instead of adata.X.
+      required: false
+    # output
+    - name: "--output"
+      type: file
+      description: Output h5mu file.
+      direction: output
+      example: output.h5mu
+
+    - name: "--output_compression"
+      type: string
+      description: The compression format to be used on the output h5mu object.
+ choices: ["gzip", "lzf"] + required: false + example: "gzip" + + - name: "--var_name_filter" + type: string + default: "filter_with_hvg" + description: In which .var slot to store a boolean array corresponding to which observations should be filtered out. + + - name: "--varm_name" + type: string + default: "hvg" + description: In which .varm slot to store additional metadata. + + # arguments + - name: "--flavor" + type: string + default: "seurat" + choices: ["seurat", "cell_ranger", "seurat_v3"] + description: | + Choose the flavor for identifying highly variable features. For the dispersion based methods + in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features. + + - name: "--n_top_features" + type: integer + description: Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'. + + - name: "--min_mean" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 0.0125 + + - name: "--max_mean" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 3 + + - name: "--min_disp" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. + default: 0.5 + + - name: "--max_disp" + type: double + description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. Default is +inf. + # default: "+inf" + + - name: "--span" + type: double + description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. + default: 0.3 + + - name: "--n_bins" + type: integer + description: Number of bins for binning the mean feature expression. Normalization is done with respect to each bin. If just a single feature falls into a bin, the normalized dispersion is artificially set to 1. + default: 20 + + - name: "--obs_batch_key" + type: string + description: | + If specified, highly-variable features are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific features and acts as a lightweight batch correction method. + For all flavors, features are first sorted by how many batches they are a HVG. For dispersion-based flavors + ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across + batches) rank based on within-batch normalized variance. + resources: + - type: python_script + path: script.py + - path: /src/utils/setup_logger.py + test_resources: + - type: python_script + path: test.py + - path: /resources_test/pbmc_1k_protein_v3 +platforms: + - type: docker + image: python:3.9 + setup: + - type: python + __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .] + packages: + - scikit-misc + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] 
+  - type: nextflow
+    directives:
+      label: [singlecpu, lowmem]
diff --git a/src/feature_annotation/highly_variable_features_scanpy/script.py b/src/feature_annotation/highly_variable_features_scanpy/script.py
new file mode 100644
index 00000000000..8aaa7ead812
--- /dev/null
+++ b/src/feature_annotation/highly_variable_features_scanpy/script.py
@@ -0,0 +1,145 @@
+import scanpy as sc
+import mudata as mu
+import sys
+import re
+
+## VIASH START
+par = {
+    'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu',
+    'modality': 'rna',
+    'output': 'output.h5mu',
+    'var_name_filter': 'filter_with_hvg',
+    'do_subset': False,
+    'flavor': 'seurat',
+    'n_top_features': None,
+    'min_mean': 0.0125,
+    'max_mean': 3.0,
+    'min_disp': 0.5,
+    'span': 0.3,
+    'n_bins': 20,
+    'varm_name': 'hvg',
+    'obs_batch_key': None,
+    'layer': 'log_transformed'
+}
+
+mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu')
+rna_in = mu_in.mod["rna"]
+assert "filter_with_hvg" not in rna_in.var.columns
+log_transformed = sc.pp.log1p(rna_in, copy=True)
+rna_in.layers['log_transformed'] = log_transformed.X
+rna_in.uns['log1p'] = log_transformed.uns['log1p']
+temp_h5mu = "lognormed.h5mu"
+rna_in.obs['batch'] = 'A'
+column_index = rna_in.obs.columns.get_indexer(['batch'])
+rna_in.obs.iloc[slice(rna_in.n_obs//2, None), column_index] = 'B'
+mu_in.write_h5mu(temp_h5mu)
+par['input'] = temp_h5mu
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+mdata = mu.read_h5mu(par["input"])
+mdata.var_names_make_unique()
+
+mod = par['modality']
+logger.info("Processing modality '%s'", mod)
+data = mdata.mod[mod]
+
+# Workaround for issue
+# https://github.com/scverse/scanpy/issues/2239
+# https://github.com/scverse/scanpy/issues/2181
+if par['flavor'] != "seurat_v3":
+    # This component requires log normalized data when flavor is not seurat_v3.
+    # We assume that the data is correctly normalized, but scanpy will look at
+    # .uns to check the transformations performed on the data.
+    # To prevent scanpy from automatically transforming the counts when they are
+    # already transformed, we set the appropriate values to .uns.
+    if 'log1p' not in data.uns:
+        logger.warning("When flavor is not set to 'seurat_v3', "
+                       "the input data for this component must be log-transformed. "
+                       "However, the 'log1p' dictionary in .uns has not been set. "
+                       "This is fine if you did not log transform your data with scanpy. "
+ "Otherwise, please check if you are providing log transformed " + "data using --layer.") + data.uns['log1p'] = {'base': None} + elif 'log1p' in data.uns and 'base' not in data.uns['log1p']: + data.uns['log1p']['base'] = None + +logger.info("\tUnfiltered data: %s", data) + +logger.info("\tComputing hvg") +# construct arguments +hvg_args = { + 'adata': data, + 'n_top_genes': par["n_top_features"], + 'min_mean': par["min_mean"], + 'max_mean': par["max_mean"], + 'min_disp': par["min_disp"], + 'span': par["span"], + 'n_bins': par["n_bins"], + 'flavor': par["flavor"], + 'subset': False, + 'inplace': False, + 'layer': par['layer'], +} + +optional_parameters = { + "max_disp": "max_disp", + "obs_batch_key": "batch_key", + "n_top_genes": "n_top_features" +} +# only add parameter if it's passed +for par_name, dest_name in optional_parameters.items(): + if par.get(par_name): + hvg_args[dest_name] = par[par_name] + +# scanpy does not do this check, although it is stated in the documentation +if par['flavor'] == "seurat_v3" and not par['n_top_features']: + raise ValueError("When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.") + +if par["layer"] and not par['layer'] in data.layers: + raise ValueError(f"Layer '{par['layer']}' not found in layers for modality '{mod}'. " + f"Found layers are: {','.join(data.layers)}") +# call function +try: + out = sc.pp.highly_variable_genes(**hvg_args) + if par['obs_batch_key'] is not None: + assert (out.index == data.var.index).all(), "Expected output index values to be equivalent to the input index" +except ValueError as err: + if str(err) == "cannot specify integer `bins` when input data contains infinity": + err.args = ("Cannot specify integer `bins` when input data contains infinity. " + "Perhaps input data has not been log normalized?",) + if re.search("Bin edges must be unique:", str(err)): + raise RuntimeError("Scanpy failed to calculate hvg. 
The error " + "returned by scanpy (see above) could be the " + "result from trying to use this component on unfiltered data.") from err + raise err + +out.index = data.var.index +logger.info("\tStoring output into .var") +if par.get("var_name_filter", None) is not None: + data.var[par["var_name_filter"]] = out["highly_variable"] + +if par.get("varm_name", None) is not None and 'mean_bin' in out: + # drop mean_bin as mudata/anndata doesn't support tuples + data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1) + +logger.info("Writing h5mu to file") +mdata.write_h5mu(par["output"], compression=par["output_compression"]) diff --git a/src/feature_annotation/highly_variable_features_scanpy/test.py b/src/feature_annotation/highly_variable_features_scanpy/test.py new file mode 100644 index 00000000000..b5deb8bf0ac --- /dev/null +++ b/src/feature_annotation/highly_variable_features_scanpy/test.py @@ -0,0 +1,158 @@ +import os +import subprocess +import scanpy as sc +import mudata as mu +import sys +import pytest +import re +import pandas as pd + + +## VIASH START +meta = { + 'resources_dir': 'resources_test/', + 'config': './src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml', + 'executable': './target/docker/feature_annotation/highly_variable_features_scanpy/highly_variable_features_scanpy' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + +@pytest.fixture +def input_path(): + return f"{meta['resources_dir']}/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu" + +@pytest.fixture +def input_data(input_path): + mu_in = mu.read_h5mu(input_path) + return mu_in + +@pytest.fixture +def lognormed_test_data(input_data): + rna_in = input_data.mod["rna"] + assert "filter_with_hvg" not in rna_in.var.columns + log_transformed = sc.pp.log1p(rna_in, copy=True) + rna_in.layers['log_transformed'] = log_transformed.X + rna_in.uns['log1p'] = log_transformed.uns['log1p'] + return input_data + +@pytest.fixture +def lognormed_test_data_path(tmp_path, lognormed_test_data): + temp_h5mu = tmp_path / "lognormed.h5mu" + lognormed_test_data.write_h5mu(temp_h5mu) + return temp_h5mu + +@pytest.fixture +def lognormed_batch_test_data_path(tmp_path, lognormed_test_data): + temp_h5mu = tmp_path / "lognormed_batch.h5mu" + rna_mod = lognormed_test_data.mod['rna'] + rna_mod.obs['batch'] = 'A' + column_index = rna_mod.obs.columns.get_indexer(['batch']) + rna_mod.obs.iloc[slice(rna_mod.n_obs//2, None), column_index] = 'B' + lognormed_test_data.write_h5mu(temp_h5mu) + return temp_h5mu + +@pytest.fixture() +def filter_data_path(tmp_path, input_data): + temp_h5mu = tmp_path / "filtered.h5mu" + rna_in = input_data.mod["rna"] + sc.pp.filter_genes(rna_in, min_counts=20) + input_data.write_h5mu(temp_h5mu) + return temp_h5mu + + +def test_filter_with_hvg(run_component, lognormed_test_data_path): + out = run_component([ + "--flavor", "seurat", + "--input", lognormed_test_data_path, + "--output", "output.h5mu", + "--layer", 
"log_transformed", + "--output_compression", "gzip"]) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + +def test_filter_with_hvg_batch_with_batch(run_component, lognormed_batch_test_data_path): + """ + Make sure that selecting a layer works together with obs_batch_key. + https://github.com/scverse/scanpy/issues/2396 + """ + run_component([ + "--flavor", "seurat", + "--input", lognormed_batch_test_data_path, + "--output", "output.h5mu", + "--obs_batch_key", "batch", + "--layer", "log_transformed"]) + assert os.path.exists("output.h5mu") + output_data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in output_data.mod["rna"].var.columns + + # Check the contents of the output to check if the correct layer was selected + input_data = mu.read_h5mu(lognormed_batch_test_data_path).mod['rna'].copy() + input_data.X = input_data.layers['log_transformed'].copy() + del input_data.layers['log_transformed'] + input_data.uns['log1p']['base'] = None + expected_output = sc.pp.highly_variable_genes(input_data, batch_key="batch", inplace=False, subset=False) + pd.testing.assert_series_equal(expected_output['highly_variable'], + output_data.mod['rna'].var['filter_with_hvg'], + check_names=False) + +def test_filter_with_hvg_seurat_v3_requires_n_top_features(run_component, input_path): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component([ + "--input", input_path, + "--flavor", "seurat_v3", # Uses raw data. + "--output", "output.h5mu"]) + assert re.search(f"When flavor is set to 'seurat_v3', you are required to set 'n_top_features'\.", + err.value.stdout.decode('utf-8')) + +def test_filter_with_hvg_seurat_v3(run_component, input_path): + run_component([ + "--input", input_path, + "--flavor", "seurat_v3", # Uses raw data. + "--output", "output.h5mu", + "--n_top_features", "50"]) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + +def test_filter_with_hvg_cell_ranger(run_component, filter_data_path): + run_component([ + "--input", filter_data_path, + "--flavor", "cell_ranger", # Must use filtered data. + "--output", "output.h5mu"]) + assert os.path.exists("output.h5mu") + data = mu.read_h5mu("output.h5mu") + assert "filter_with_hvg" in data.mod["rna"].var.columns + +def test_filter_with_hvg_cell_ranger_unfiltered_data_change_error_message(run_component, input_path): + with pytest.raises(subprocess.CalledProcessError) as err: + run_component([ + "--input", input_path, + "--flavor", "cell_ranger", # Must use filtered data, but in this test we use unfiltered data + "--output", "output.h5mu"]) + assert re.search(r"Scanpy failed to calculate hvg\. The error " + r"returned by scanpy \(see above\) could be the " + r"result from trying to use this component on unfiltered data\.", + err.value.stdout.decode('utf-8')) + + +if __name__ == '__main__': + sys.exit(pytest.main([__file__])) diff --git a/src/filter/filter_with_hvg/config.vsh.yaml b/src/filter/filter_with_hvg/config.vsh.yaml index 63006139424..eea6a7a6aac 100644 --- a/src/filter/filter_with_hvg/config.vsh.yaml +++ b/src/filter/filter_with_hvg/config.vsh.yaml @@ -1,5 +1,6 @@ functionality: name: filter_with_hvg + status: deprecated namespace: filter description: | Annotate highly variable genes [Satija15] [Zheng17] [Stuart19]. 
diff --git a/src/workflows/multiomics/process_batches/config.vsh.yaml b/src/workflows/multiomics/process_batches/config.vsh.yaml index 28ce5884d5b..2254c2cdd5c 100644 --- a/src/workflows/multiomics/process_batches/config.vsh.yaml +++ b/src/workflows/multiomics/process_batches/config.vsh.yaml @@ -33,14 +33,16 @@ functionality: direction: output description: Destination path to the output. example: output.h5mu - - name: "Highly variable gene detection" + - name: "Highly variable features detection" arguments: - - name: "--filter_with_hvg_var_output" + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] required: false type: string default: "filter_with_hvg" description: In which .var slot to store a boolean array corresponding to the highly variable genes. - - name: "--filter_with_hvg_obs_batch_key" + - name: "--highly_variable_features_obs_batch_key" + alternatives: [--filter_with_hvg_obs_batch_key] type: string default: "sample_id" required: false diff --git a/src/workflows/multiomics/process_batches/main.nf b/src/workflows/multiomics/process_batches/main.nf index ca98623adba..d2a06f5ac35 100644 --- a/src/workflows/multiomics/process_batches/main.nf +++ b/src/workflows/multiomics/process_batches/main.nf @@ -61,8 +61,8 @@ workflow run_wf { // def multisample_arguments = [ "rna": [ - "filter_with_hvg_var_output": "filter_with_hvg_var_output", - "filter_with_hvg_obs_batch_key": "filter_with_hvg_obs_batch_key", + "highly_variable_features_var_output": "highly_variable_features_var_output", + "highly_variable_features_obs_batch_key": "highly_variable_features_obs_batch_key", "var_qc_metrics": "var_qc_metrics", "top_n_vars": "top_n_vars" ], @@ -159,7 +159,7 @@ workflow run_wf { "input": state.input, "layer": "log_normalized", "modality": "rna", - "var_pca_feature_selection": state.filter_with_hvg_var_output, // run PCA on highly variable genes only + "var_pca_feature_selection": state.highly_variable_features_var_output, // run PCA on highly variable genes only "pca_overwrite": state.pca_overwrite, ], "dimensionality_reduction_prot": diff --git a/src/workflows/multiomics/process_samples/config.vsh.yaml b/src/workflows/multiomics/process_samples/config.vsh.yaml index f579ac7b6c6..5375741cb65 100644 --- a/src/workflows/multiomics/process_samples/config.vsh.yaml +++ b/src/workflows/multiomics/process_samples/config.vsh.yaml @@ -105,6 +105,7 @@ functionality: example: 3 type: integer description: Minimum of non-zero values per protein. + - name: "GDO filtering options" arguments: - name: "--gdo_min_counts" @@ -127,14 +128,17 @@ functionality: example: 3 type: integer description: Minimum of non-zero values per guide. - - name: "Highly variable gene detection" + + - name: "Highly variable features detection" arguments: - - name: "--filter_with_hvg_var_output" + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] required: false type: string default: "filter_with_hvg" description: In which .var slot to store a boolean array corresponding to the highly variable genes. - - name: "--filter_with_hvg_obs_batch_key" + - name: "--highly_variable_features_obs_batch_key" + alternatives: ["--filter_with_hvg_obs_batch_key"] type: string default: "sample_id" required: false @@ -177,7 +181,7 @@ functionality: Keys to select a boolean (containing only True or False) column from .var. 
For each cell, calculate the proportion of total values for genes which are labeled 'True', compared to the total sum of the values for all genes. Defaults to the combined values specified for - --var_name_mitochondrial_genes and --filter_with_hvg_var_output. + --var_name_mitochondrial_genes and --highly_variable_features_var_output. type: string multiple: True multiple_sep: ',' @@ -230,5 +234,6 @@ functionality: entrypoint: test_wf4 - path: /resources_test/concat_test_data - path: /resources_test/pbmc_1k_protein_v3 + - path: /resources_test/10x_5k_lung_crispr platforms: - type: nextflow diff --git a/src/workflows/multiomics/process_samples/main.nf b/src/workflows/multiomics/process_samples/main.nf index 4e8ac73bbcd..576b4f640f6 100644 --- a/src/workflows/multiomics/process_samples/main.nf +++ b/src/workflows/multiomics/process_samples/main.nf @@ -13,7 +13,7 @@ workflow run_wf { // If requested to be detected, make sure the mitochondrial genes // are added to the input of the qc metrics calculation | map {id, state -> - def var_qc_default = [state.filter_with_hvg_var_output] + def var_qc_default = [state.highly_variable_features_var_output] if (state.var_name_mitochondrial_genes) { var_qc_default.add(state.var_name_mitochondrial_genes) } @@ -192,8 +192,8 @@ workflow run_wf { [ "id": id, "input": state.input, - "filter_with_hvg_var_output": state.filter_with_hvg_var_output, - "filter_with_hvg_obs_batch_key": state.filter_with_hvg_obs_batch_key, + "highly_variable_features_var_output": state.highly_variable_features_var_output, + "highly_variable_features_obs_batch_key": state.highly_variable_features_obs_batch_key, "var_qc_metrics": state.var_qc_metrics, "top_n_vars": state.top_n_vars, "pca_overwrite": state.pca_overwrite diff --git a/src/workflows/multiomics/process_samples/test.nf b/src/workflows/multiomics/process_samples/test.nf index aba28620eb8..ee880af55b7 100644 --- a/src/workflows/multiomics/process_samples/test.nf +++ b/src/workflows/multiomics/process_samples/test.nf @@ -100,7 +100,7 @@ workflow test_wf3 { publish_dir: "foo/", rna_min_counts: 2, var_qc_metrics: "highly_variable", - filter_with_hvg_var_output: "highly_variable", + highly_variable_features_var_output: "highly_variable", ], [ id: "human", @@ -108,7 +108,7 @@ workflow test_wf3 { publish_dir: "foo/", rna_min_counts: 2, var_qc_metrics: "highly_variable", - filter_with_hvg_var_output: "highly_variable", + highly_variable_features_var_output: "highly_variable", ] ]) | map{ state -> [state.id, state] } diff --git a/src/workflows/rna/rna_multisample/config.vsh.yaml b/src/workflows/rna/rna_multisample/config.vsh.yaml index cefe003f792..443a565d2c1 100644 --- a/src/workflows/rna/rna_multisample/config.vsh.yaml +++ b/src/workflows/rna/rna_multisample/config.vsh.yaml @@ -36,34 +36,38 @@ functionality: direction: output description: Destination path to the output. example: output.h5mu - - name: "Filtering highly variable genes" + - name: "Filtering highly variable features" arguments: - - name: "--filter_with_hvg_var_output" + - name: "--highly_variable_features_var_output" + alternatives: ["--filter_with_hvg_var_output"] required: false type: string default: "filter_with_hvg" - description: In which .var slot to store a boolean array corresponding to the highly variable genes. - - name: "--filter_with_hvg_obs_batch_key" + description: In which .var slot to store a boolean array corresponding to the highly variable features. 
+ - name: "--highly_variable_features_obs_batch_key" + alternatives: ["--filter_with_hvg_obs_batch_key"] type: string default: "sample_id" required: false description: | - If specified, highly-variable genes are selected within each batch separately and merged. This simple - process avoids the selection of batch-specific genes and acts as a lightweight batch correction method. - For all flavors, genes are first sorted by how many batches they are a HVG. For dispersion-based flavors + If specified, highly-variable features are selected within each batch separately and merged. This simple + process avoids the selection of batch-specific features and acts as a lightweight batch correction method. + For all flavors, featues are first sorted by how many batches they are highly variable. For dispersion-based flavors ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across batches) rank based on within-batch normalized variance. - - name: "--filter_with_hvg_flavor" + - name: "--highly_variable_features_flavor" + alternatives: ["--filter_with_hvg_flavor"] type: string default: "seurat" choices: ["seurat", "cell_ranger", "seurat_v3"] description: | - Choose the flavor for identifying highly variable genes. For the dispersion based methods - in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_genes. - - name: "--filter_with_hvg_n_top_genes" + Choose the flavor for identifying highly variable features. For the dispersion based methods + in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features. + - name: "--highly_variable_features_n_top_features" + alternatives: ["--filter_with_hvg_n_top_genes"] required: false type: integer - description: Number of highly-variable genes to keep. Mandatory if filter_with_hvg_flavor is set to 'seurat_v3'. + description: Number of highly-variable features to keep. Mandatory if filter_with_hvg_flavor is set to 'seurat_v3'. - name: "QC metrics calculation options" arguments: - name: "--var_qc_metrics" @@ -133,7 +137,7 @@ functionality: dependencies: - name: transform/normalize_total - name: transform/log1p - - name: filter/filter_with_hvg + - name: feature_annotation/highly_variable_features_scanpy - name: workflows/qc/qc alias: rna_qc - name: transform/delete_layer diff --git a/src/workflows/rna/rna_multisample/main.nf b/src/workflows/rna/rna_multisample/main.nf index 65e32c3c7df..bdeb7ae6c73 100644 --- a/src/workflows/rna/rna_multisample/main.nf +++ b/src/workflows/rna/rna_multisample/main.nf @@ -39,16 +39,16 @@ workflow run_wf { }, toState: ["input": "output"] ) - | filter_with_hvg.run( + | highly_variable_features_scanpy.run( fromState: {id, state -> [ "input": state.input, "layer": "log_normalized", "modality": state.modality, - "var_name_filter": state.filter_with_hvg_var_output, - "n_top_genes": state.filter_with_hvg_n_top_genes, - "flavor": state.filter_with_hvg_flavor, - "obs_batch_key": state.filter_with_hvg_obs_batch_key + "var_name_filter": state.highly_variable_features_var_output, + "n_top_features": state.highly_variable_features_n_top_features, + "flavor": state.highly_variable_features_flavor, + "obs_batch_key": state.highly_variable_features_obs_batch_key ] }, toState: ["input": "output"],