openpipelines-bio · DriesSchaumont · Jan 26, 2024 · Jan 24, 2024 · Jan 24, 2024 · Jan 26, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -52,6 +52,15 @@
        multisample workflows are executed in-tandem. If you wish to execute the singlesample workflows
        in a seperate manner and still include count based statistics, please run the `qc` pipeline
        on the result of the singlesample workflow (PR #604).
+    * `filter/filter_with_hvg` has been renamed to `feature_annotation/highly_variable_features_scanpy`, along with the following changes (PR #667).
+      - `--do_filter` was removed
+      - `--n_top_genes` has been renamed to `--n_top_features`
+    * `full_pipeline`, `multisample` and `rna_multisample`: Renamed arguments (PR #667).
+      - `--filter_with_hvg_var_output` became `--highly_variable_features_obs_batch_key`
+      - `--filter_with_hvg_obs_batch_key` became `--highly_variable_features_var_output`
+    * `rna_multisample`: Renamed arguments (PR #667).
+      - `--filter_with_hvg_n_top_genes` became `--highly_variable_features_n_top_features`
+      - `--filter_with_hvg_flavor` became `--highly_variable_features_flavor`
 
 * Renamed `obsm_metrics` to `uns_metrics` for the `cellranger_mapping` workflow because the cellranger metrics are stored in `.uns` and not `.obsm` (PR #610).
 

diff --git a/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml b/src/feature_annotation/highly_variable_features_scanpy/config.vsh.yaml
@@ -0,0 +1,132 @@
+functionality:
+  name: highly_variable_features_scanpy
+  namespace: feature_annotation
+  description: |
+    Annotate highly variable features [Satija15] [Zheng17] [Stuart19].
+
+    Expects logarithmized data, except when flavor='seurat_v3' in which count data is expected.
+
+    Depending on flavor, this reproduces the R-implementations of Seurat [Satija15], Cell Ranger [Zheng17], and Seurat v3 [Stuart19].
+
+    For the dispersion-based methods ([Satija15] and [Zheng17]), the normalized dispersion is obtained by scaling with the mean and standard deviation of the dispersions for features falling into a given bin for mean expression of features. This means that for each bin of mean expression, highly variable features are selected.
+
+    For [Stuart19], a normalized variance for each feature is computed. First, the data are standardized (i.e., z-score normalization per feature) with a regularized standard deviation. Next, the normalized variance is computed as the variance of each feature after the transformation. Features are ranked by the normalized variance.
+  authors:
+    - __merge__: /src/authors/dries_de_maeyer.yaml
+      roles: [ contributor ]
+    - __merge__: /src/authors/robrecht_cannoodt.yaml
+      roles: [ maintainer, contributor ]
+  arguments:
+    # input
+    - name: "--input"
+      type: file
+      description: Input h5mu file
+      direction: input
+      required: true
+      example: input.h5mu
+
+    - name: "--modality"
+      type: string
+      default: "rna"
+      required: false
+    - name: "--layer"
+      type: string
+      description: use adata.layers[layer] for expression values instead of adata.X.
+      required: false
+    # output
+    - name: "--output"
+      type: file
+      description: Output h5mu file.
+      direction: output
+      example: output.h5mu
+
+    - name: "--output_compression"
+      type: string
+      description: The compression format to be used on the output h5mu object.
+      choices: ["gzip", "lzf"]
+      required: false
+      example: "gzip"
+
+    - name: "--var_name_filter"
+      type: string
+      default: "filter_with_hvg"
+      description: In which .var slot to store a boolean array corresponding to which observations should be filtered out.
+
+    - name: "--varm_name"
+      type: string
+      default: "hvg"
+      description: In which .varm slot to store additional metadata.
+
+    # arguments
+    - name: "--flavor"
+      type: string
+      default: "seurat"
+      choices: ["seurat", "cell_ranger", "seurat_v3"]
+      description: |
+        Choose the flavor for identifying highly variable features. For the dispersion based methods
+        in their default workflows, Seurat passes the cutoffs whereas Cell Ranger passes n_top_features.
+
+    - name: "--n_top_features"
+      type: integer
+      description: Number of highly-variable features to keep. Mandatory if flavor='seurat_v3'.
+
+    - name: "--min_mean"
+      type: double
+      description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.
+      default: 0.0125
+
+    - name: "--max_mean"
+      type: double
+      description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.
+      default: 3
+
+    - name: "--min_disp"
+      type: double
+      description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'.
+      default: 0.5
+
+    - name: "--max_disp"
+      type: double
+      description: If n_top_features is defined, this and all other cutoffs for the means and the normalized dispersions are ignored. Ignored if flavor='seurat_v3'. Default is +inf.
+      # default: "+inf"
+
+    - name: "--span"
+      type: double
+      description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. 
+      default: 0.3
+
+    - name: "--n_bins"
+      type: integer
+      description: Number of bins for binning the mean feature expression. Normalization is done with respect to each bin. If just a single feature falls into a bin, the normalized dispersion is artificially set to 1.
+      default: 20
+
+    - name: "--obs_batch_key"
+      type: string
+      description: |
+        If specified, highly-variable features are selected within each batch separately and merged. This simple 
+        process avoids the selection of batch-specific features and acts as a lightweight batch correction method. 
+        For all flavors, features are first sorted by how many batches they are a HVG. For dispersion-based flavors 
+        ties are broken by normalized dispersion. If flavor = 'seurat_v3', ties are broken by the median (across
+        batches) rank based on within-batch normalized variance.
+  resources:
+    - type: python_script
+      path: script.py
+    - path: /src/utils/setup_logger.py
+  test_resources:
+    - type: python_script
+      path: test.py
+    - path: /resources_test/pbmc_1k_protein_v3
+platforms:
+  - type: docker
+    image: python:3.9
+    setup: 
+      - type: python
+        __merge__: [/src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml, .]
+        packages:
+          - scikit-misc
+    test_setup:
+      - type: python
+        __merge__: [ /src/base/requirements/viashpy.yaml, .]
+  - type: nextflow
+    directives:
+      label: [singlecpu, lowmem]
diff --git a/src/feature_annotation/highly_variable_features_scanpy/script.py b/src/feature_annotation/highly_variable_features_scanpy/script.py
@@ -0,0 +1,145 @@
+import scanpy as sc
+import mudata as mu
+import sys
+import re
+
+## VIASH START
+par = {
+  'input': 'resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu',
+  'modality': 'rna',
+  'output': 'output.h5mu',
+  'var_name_filter': 'filter_with_hvg',
+  'do_subset': False,
+  'flavor': 'seurat',
+  'n_top_features': None,
+  'min_mean': 0.0125,
+  'max_mean': 3.0,
+  'min_disp': 0.5,
+  'span': 0.3,
+  'n_bins': 20,
+  'varm_name': 'hvg',
+  'obs_batch_key': None,
+  'layer': 'log_transformed'
+}
+
+mu_in = mu.read_h5mu('resources_test/pbmc_1k_protein_v3/pbmc_1k_protein_v3_filtered_feature_bc_matrix.h5mu')
+rna_in = mu_in.mod["rna"]
+assert "filter_with_hvg" not in rna_in.var.columns
+log_transformed = sc.pp.log1p(rna_in, copy=True)
+rna_in.layers['log_transformed'] = log_transformed.X
+rna_in.uns['log1p'] = log_transformed.uns['log1p']
+temp_h5mu = "lognormed.h5mu"
+rna_in.obs['batch'] = 'A'
+column_index = rna_in.obs.columns.get_indexer(['batch'])
+rna_in.obs.iloc[slice(rna_in.n_obs//2, None), column_index] = 'B'
+mu_in.write_h5mu(temp_h5mu)
+par['input'] = temp_h5mu
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+# START TEMPORARY WORKAROUND setup_logger
+# reason: resources aren't available when using Nextflow fusion
+# from setup_logger import setup_logger
+def setup_logger():
+    import logging
+    from sys import stdout
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler(stdout)
+    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
+    console_handler.setFormatter(logFormatter)
+    logger.addHandler(console_handler)
+
+    return logger
+# END TEMPORARY WORKAROUND setup_logger
+logger = setup_logger()
+
+mdata = mu.read_h5mu(par["input"])
+mdata.var_names_make_unique()
+
+mod = par['modality']
+logger.info(f"Processing modality '%s'", mod)
+data = mdata.mod[mod]
+
+# Workaround for issue
+# https://github.com/scverse/scanpy/issues/2239
+# https://github.com/scverse/scanpy/issues/2181
+if par['flavor'] != "seurat_v3":
+    # This component requires log normalized data when flavor is not seurat_v3
+    # We assume that the data is correctly normalized but scanpy will look at
+    # .uns to check the transformations performed on the data.
+    # To prevent scanpy from automatically tranforming the counts when they are
+    # already transformed, we set the appropriate values to .uns.
+    if 'log1p' not in data.uns:
+        logger.warning("When flavor is not set to 'seurat_v3', "
+                       "the input data for this component must be log-transformed. "
+                       "However, the 'log1p' dictionairy in .uns has not been set. "
+                       "This is fine if you did not log transform your data with scanpy."
+                       "Otherwise, please check if you are providing log transformed "
+                       "data using --layer.")
+        data.uns['log1p'] = {'base': None}
+    elif 'log1p' in data.uns and 'base' not in data.uns['log1p']:
+        data.uns['log1p']['base'] = None
+
+logger.info("\tUnfiltered data: %s", data)
+
+logger.info("\tComputing hvg")
+# construct arguments
+hvg_args = {
+    'adata': data,
+    'n_top_genes': par["n_top_features"],
+    'min_mean': par["min_mean"],
+    'max_mean': par["max_mean"],
+    'min_disp': par["min_disp"],
+    'span': par["span"],
+    'n_bins': par["n_bins"],
+    'flavor': par["flavor"],
+    'subset': False,
+    'inplace': False,
+    'layer': par['layer'],
+}
+
+optional_parameters = {
+    "max_disp": "max_disp",
+    "obs_batch_key": "batch_key",
+    "n_top_genes": "n_top_features"
+}
+# only add parameter if it's passed
+for par_name, dest_name in optional_parameters.items():
+    if par.get(par_name):
+        hvg_args[dest_name] = par[par_name]
+
+# scanpy does not do this check, although it is stated in the documentation
+if par['flavor'] == "seurat_v3" and not par['n_top_features']:
+    raise ValueError("When flavor is set to 'seurat_v3', you are required to set 'n_top_features'.")
+
+if par["layer"] and not par['layer'] in data.layers:
+    raise ValueError(f"Layer '{par['layer']}' not found in layers for modality '{mod}'. "
+                     f"Found layers are: {','.join(data.layers)}")
+# call function
+try:
+    out = sc.pp.highly_variable_genes(**hvg_args)
+    if par['obs_batch_key'] is not None:
+        assert (out.index == data.var.index).all(), "Expected output index values to be equivalent to the input index"
+except ValueError as err:
+    if str(err) == "cannot specify integer `bins` when input data contains infinity":
+        err.args = ("Cannot specify integer `bins` when input data contains infinity. "
+                    "Perhaps input data has not been log normalized?",)
+    if re.search("Bin edges must be unique:", str(err)):
+        raise RuntimeError("Scanpy failed to calculate hvg. The error "
+                           "returned by scanpy (see above) could be the "
+                           "result from trying to use this component on unfiltered data.") from err
+    raise err
+
+out.index = data.var.index
+logger.info("\tStoring output into .var")
+if par.get("var_name_filter", None) is not None:
+    data.var[par["var_name_filter"]] = out["highly_variable"]
+
+if par.get("varm_name", None) is not None and 'mean_bin' in out:
+    # drop mean_bin as mudata/anndata doesn't support tuples
+    data.varm[par["varm_name"]] = out.drop("mean_bin", axis=1)
+
+logger.info("Writing h5mu to file")
+mdata.write_h5mu(par["output"], compression=par["output_compression"])