target/nextflow/annotate/popv/main.nf

// popv main_build
// 
// This wrapper script is auto-generated by viash 0.7.5 and is thus a derivative
// work thereof. This software comes with ABSOLUTELY NO WARRANTY from Data
// Intuitive.
// 
// The component may contain files which fall under a different license. The
// authors of this component should specify the license in the header of such
// files, or include a separate license file detailing the licenses of all included
// files.
// 
// Component authors:
//  * Matthias Beyens (author)
//  * Robrecht Cannoodt (author)

nextflow.enable.dsl=2

// Required imports
import groovy.json.JsonSlurper

// initialise slurper
def jsonSlurper = new JsonSlurper()

// DEFINE CUSTOM CODE

// functionality metadata
thisConfig = processConfig(jsonSlurper.parseText('''{
  "functionality" : {
    "name" : "popv",
    "namespace" : "annotate",
    "version" : "main_build",
    "authors" : [
      {
        "name" : "Matthias Beyens",
        "roles" : [
          "author"
        ],
        "info" : {
          "role" : "Contributor",
          "links" : {
            "github" : "MatthiasBeyens",
            "orcid" : "0000-0003-3304-0706",
            "email" : "matthias.beyens@gmail.com",
            "linkedin" : "mbeyens"
          },
          "organizations" : [
            {
              "name" : "Janssen Pharmaceuticals",
              "href" : "https://www.janssen.com",
              "role" : "Principal Scientist"
            }
          ]
        }
      },
      {
        "name" : "Robrecht Cannoodt",
        "roles" : [
          "author"
        ],
        "info" : {
          "role" : "Core Team Member",
          "links" : {
            "email" : "robrecht@data-intuitive.com",
            "github" : "rcannood",
            "orcid" : "0000-0003-3641-729X",
            "linkedin" : "robrechtcannoodt"
          },
          "organizations" : [
            {
              "name" : "Data Intuitive",
              "href" : "https://www.data-intuitive.com",
              "role" : "Data Science Engineer"
            },
            {
              "name" : "Open Problems",
              "href" : "https://openproblems.bio",
              "role" : "Core Member"
            }
          ]
        }
      }
    ],
    "argument_groups" : [
      {
        "name" : "Inputs",
        "description" : "Arguments related to the input (aka query) dataset.",
        "arguments" : [
          {
            "type" : "file",
            "name" : "--input",
            "alternatives" : [
              "-i"
            ],
            "description" : "Input h5mu file.",
            "example" : [
              "input.h5mu"
            ],
            "must_exist" : true,
            "create_parent" : true,
            "required" : true,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--modality",
            "description" : "Which modality to process.",
            "default" : [
              "rna"
            ],
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--input_layer",
            "description" : "Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[input_layer]`.",
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--input_obs_batch",
            "description" : "Key in obs field of input adata for batch information. If no value is provided, batch label is assumed to be unknown.",
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--input_var_subset",
            "description" : "Subset the input object with this column.",
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--input_obs_label",
            "description" : "Key in obs field of input adata for label information. This is only used for training scANVI. Unlabelled cells should be set to `\\"unknown_celltype_label\\"`.",
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--unknown_celltype_label",
            "description" : "If `input_obs_label` is specified, cells with this value will be treated as unknown and will be predicted by the model.",
            "default" : [
              "unknown"
            ],
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          }
        ]
      },
      {
        "name" : "Reference",
        "description" : "Arguments related to the reference dataset.",
        "arguments" : [
          {
            "type" : "file",
            "name" : "--reference",
            "description" : "User-provided reference tissue. The data that will be used as reference to call cell types.",
            "example" : [
              "TS_Bladder_filtered.h5ad"
            ],
            "must_exist" : true,
            "create_parent" : true,
            "required" : true,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--reference_layer",
            "description" : "Which layer to use. If no value is provided, the counts are assumed to be in the `.X` slot. Otherwise, count data is expected to be in `.layers[reference_layer]`.",
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--reference_obs_label",
            "description" : "Key in obs field of reference AnnData with cell-type information.",
            "default" : [
              "cell_ontology_class"
            ],
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--reference_obs_batch",
            "description" : "Key in obs field of input adata for batch information.",
            "default" : [
              "donor_assay"
            ],
            "required" : false,
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          }
        ]
      },
      {
        "name" : "Outputs",
        "description" : "Output arguments.",
        "arguments" : [
          {
            "type" : "file",
            "name" : "--output",
            "description" : "Output h5mu file.",
            "example" : [
              "output.h5mu"
            ],
            "must_exist" : true,
            "create_parent" : true,
            "required" : true,
            "direction" : "output",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          },
          {
            "type" : "string",
            "name" : "--output_compression",
            "example" : [
              "gzip"
            ],
            "required" : false,
            "choices" : [
              "gzip",
              "lzf"
            ],
            "direction" : "input",
            "multiple" : false,
            "multiple_sep" : ":",
            "dest" : "par"
          }
        ]
      },
      {
        "name" : "Arguments",
        "description" : "Other arguments.",
        "arguments" : [
          {
            "type" : "string",
            "name" : "--methods",
            "description" : "Methods to call cell types. By default, runs to knn_on_scvi and scanvi.",
            "example" : [
              "knn_on_scvi",
              "scanvi"
            ],
            "required" : true,
            "choices" : [
              "celltypist",
              "knn_on_bbknn",
              "knn_on_scanorama",
              "knn_on_scvi",
              "onclass",
              "rf",
              "scanvi",
              "svm"
            ],
            "direction" : "input",
            "multiple" : true,
            "multiple_sep" : ":",
            "dest" : "par"
          }
        ]
      }
    ],
    "resources" : [
      {
        "type" : "python_script",
        "path" : "script.py",
        "is_executable" : true,
        "parent" : "file:/home/runner/work/openpipeline/openpipeline/src/annotate/popv/"
      },
      {
        "type" : "file",
        "path" : "src/utils/setup_logger.py",
        "parent" : "file:///home/runner/work/openpipeline/openpipeline/"
      }
    ],
    "description" : "Performs popular major vote cell typing on single cell sequence data using multiple algorithms. Note that this is a one-shot version of PopV.",
    "test_resources" : [
      {
        "type" : "python_script",
        "path" : "test.py",
        "is_executable" : true,
        "parent" : "file:/home/runner/work/openpipeline/openpipeline/src/annotate/popv/"
      },
      {
        "type" : "file",
        "path" : "resources_test/annotation_test_data/",
        "parent" : "file:///home/runner/work/openpipeline/openpipeline/"
      },
      {
        "type" : "file",
        "path" : "resources_test/pbmc_1k_protein_v3/",
        "parent" : "file:///home/runner/work/openpipeline/openpipeline/"
      }
    ],
    "status" : "enabled",
    "requirements" : {
      "commands" : [
        "ps"
      ]
    },
    "set_wd_to_resources_dir" : false
  },
  "platforms" : [
    {
      "type" : "docker",
      "id" : "docker",
      "image" : "python:3.9-slim",
      "target_organization" : "openpipelines-bio",
      "target_registry" : "ghcr.io",
      "namespace_separator" : "_",
      "resolve_volume" : "Automatic",
      "chown" : true,
      "setup_strategy" : "ifneedbepullelsecachedbuild",
      "target_image_source" : "https://github.com/openpipelines-bio/openpipeline",
      "setup" : [
        {
          "type" : "apt",
          "packages" : [
            "procps",
            "git",
            "build-essential",
            "wget"
          ],
          "interactive" : false
        },
        {
          "type" : "python",
          "user" : false,
          "packages" : [
            "scanpy~=1.9.2",
            "scvi-tools~=0.20.3",
            "popv~=0.3.2"
          ],
          "upgrade" : true
        },
        {
          "type" : "python",
          "user" : false,
          "packages" : [
            "mudata~=0.2.3",
            "anndata~=0.9.1"
          ],
          "upgrade" : true
        },
        {
          "type" : "docker",
          "run" : [
            "cd /opt && git clone --depth 1 https://github.com/YosefLab/PopV.git && \\\\\n  cd PopV && git fetch --depth 1 origin tag v0.2 && git checkout v0.2\n"
          ]
        },
        {
          "type" : "python",
          "user" : false,
          "packages" : [
            "jax==0.4.9",
            "jaxlib==0.4.9"
          ],
          "upgrade" : true
        }
      ],
      "test_setup" : [
        {
          "type" : "python",
          "user" : false,
          "packages" : [
            "viashpy"
          ],
          "upgrade" : true
        }
      ]
    },
    {
      "type" : "nextflow",
      "id" : "nextflow",
      "directives" : {
        "label" : [
          "highmem",
          "highcpu"
        ],
        "tag" : "$id"
      },
      "auto" : {
        "simplifyInput" : true,
        "simplifyOutput" : true,
        "transcript" : false,
        "publish" : false
      },
      "config" : {
        "labels" : {
          "mem1gb" : "memory = 1.GB",
          "mem2gb" : "memory = 2.GB",
          "mem4gb" : "memory = 4.GB",
          "mem8gb" : "memory = 8.GB",
          "mem16gb" : "memory = 16.GB",
          "mem32gb" : "memory = 32.GB",
          "mem64gb" : "memory = 64.GB",
          "mem128gb" : "memory = 128.GB",
          "mem256gb" : "memory = 256.GB",
          "mem512gb" : "memory = 512.GB",
          "mem1tb" : "memory = 1.TB",
          "mem2tb" : "memory = 2.TB",
          "mem4tb" : "memory = 4.TB",
          "mem8tb" : "memory = 8.TB",
          "mem16tb" : "memory = 16.TB",
          "mem32tb" : "memory = 32.TB",
          "mem64tb" : "memory = 64.TB",
          "mem128tb" : "memory = 128.TB",
          "mem256tb" : "memory = 256.TB",
          "mem512tb" : "memory = 512.TB",
          "cpu1" : "cpus = 1",
          "cpu2" : "cpus = 2",
          "cpu5" : "cpus = 5",
          "cpu10" : "cpus = 10",
          "cpu20" : "cpus = 20",
          "cpu50" : "cpus = 50",
          "cpu100" : "cpus = 100",
          "cpu200" : "cpus = 200",
          "cpu500" : "cpus = 500",
          "cpu1000" : "cpus = 1000"
        }
      },
      "debug" : false,
      "container" : "docker"
    }
  ],
  "info" : {
    "config" : "/home/runner/work/openpipeline/openpipeline/src/annotate/popv/config.vsh.yaml",
    "platform" : "nextflow",
    "output" : "/home/runner/work/openpipeline/openpipeline/target/nextflow/annotate/popv",
    "viash_version" : "0.7.5",
    "git_commit" : "679aa4eb97581c8dbc9fb9d68214dd2b579f1288",
    "git_remote" : "https://github.com/openpipelines-bio/openpipeline"
  }
}'''))

thisScript = '''set -e
tempscript=".viash_script.sh"
cat > "$tempscript" << VIASHMAIN
import sys
import re
import tempfile
import typing
import numpy as np
import mudata as mu
import anndata as ad
import popv

# todo: is this still needed?
from torch.cuda import is_available as cuda_is_available
try:
    from torch.backends.mps import is_available as mps_is_available
except ModuleNotFoundError:
    # Older pytorch versions
    # MacOS GPUs
    def mps_is_available():
        return False

# where to find the obo files
cl_obo_folder = "/opt/PopV/ontology/"

## VIASH START
# The following code has been auto-generated by Viash.
par = {
  'input': $( if [ ! -z ${VIASH_PAR_INPUT+x} ]; then echo "r'${VIASH_PAR_INPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'modality': $( if [ ! -z ${VIASH_PAR_MODALITY+x} ]; then echo "r'${VIASH_PAR_MODALITY//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'input_layer': $( if [ ! -z ${VIASH_PAR_INPUT_LAYER+x} ]; then echo "r'${VIASH_PAR_INPUT_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'input_obs_batch': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'input_var_subset': $( if [ ! -z ${VIASH_PAR_INPUT_VAR_SUBSET+x} ]; then echo "r'${VIASH_PAR_INPUT_VAR_SUBSET//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'input_obs_label': $( if [ ! -z ${VIASH_PAR_INPUT_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_INPUT_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'unknown_celltype_label': $( if [ ! -z ${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL+x} ]; then echo "r'${VIASH_PAR_UNKNOWN_CELLTYPE_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'reference': $( if [ ! -z ${VIASH_PAR_REFERENCE+x} ]; then echo "r'${VIASH_PAR_REFERENCE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'reference_layer': $( if [ ! -z ${VIASH_PAR_REFERENCE_LAYER+x} ]; then echo "r'${VIASH_PAR_REFERENCE_LAYER//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'reference_obs_label': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_LABEL+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_LABEL//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'reference_obs_batch': $( if [ ! -z ${VIASH_PAR_REFERENCE_OBS_BATCH+x} ]; then echo "r'${VIASH_PAR_REFERENCE_OBS_BATCH//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'output': $( if [ ! -z ${VIASH_PAR_OUTPUT+x} ]; then echo "r'${VIASH_PAR_OUTPUT//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'output_compression': $( if [ ! -z ${VIASH_PAR_OUTPUT_COMPRESSION+x} ]; then echo "r'${VIASH_PAR_OUTPUT_COMPRESSION//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'methods': $( if [ ! -z ${VIASH_PAR_METHODS+x} ]; then echo "r'${VIASH_PAR_METHODS//\\'/\\'\\"\\'\\"r\\'}'.split(':')"; else echo None; fi )
}
meta = {
  'functionality_name': $( if [ ! -z ${VIASH_META_FUNCTIONALITY_NAME+x} ]; then echo "r'${VIASH_META_FUNCTIONALITY_NAME//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'resources_dir': $( if [ ! -z ${VIASH_META_RESOURCES_DIR+x} ]; then echo "r'${VIASH_META_RESOURCES_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'executable': $( if [ ! -z ${VIASH_META_EXECUTABLE+x} ]; then echo "r'${VIASH_META_EXECUTABLE//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'config': $( if [ ! -z ${VIASH_META_CONFIG+x} ]; then echo "r'${VIASH_META_CONFIG//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'temp_dir': $( if [ ! -z ${VIASH_META_TEMP_DIR+x} ]; then echo "r'${VIASH_META_TEMP_DIR//\\'/\\'\\"\\'\\"r\\'}'"; else echo None; fi ),
  'cpus': $( if [ ! -z ${VIASH_META_CPUS+x} ]; then echo "int(r'${VIASH_META_CPUS//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_b': $( if [ ! -z ${VIASH_META_MEMORY_B+x} ]; then echo "int(r'${VIASH_META_MEMORY_B//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_kb': $( if [ ! -z ${VIASH_META_MEMORY_KB+x} ]; then echo "int(r'${VIASH_META_MEMORY_KB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_mb': $( if [ ! -z ${VIASH_META_MEMORY_MB+x} ]; then echo "int(r'${VIASH_META_MEMORY_MB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_gb': $( if [ ! -z ${VIASH_META_MEMORY_GB+x} ]; then echo "int(r'${VIASH_META_MEMORY_GB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_tb': $( if [ ! -z ${VIASH_META_MEMORY_TB+x} ]; then echo "int(r'${VIASH_META_MEMORY_TB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi ),
  'memory_pb': $( if [ ! -z ${VIASH_META_MEMORY_PB+x} ]; then echo "int(r'${VIASH_META_MEMORY_PB//\\'/\\'\\"\\'\\"r\\'}')"; else echo None; fi )
}

## VIASH END

sys.path.append(meta["resources_dir"])
# START TEMPORARY WORKAROUND setup_logger
# reason: resources aren't available when using Nextflow fusion
# from setup_logger import setup_logger
def setup_logger():
    import logging
    from sys import stdout

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    console_handler = logging.StreamHandler(stdout)
    logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s")
    console_handler.setFormatter(logFormatter)
    logger.addHandler(console_handler)

    return logger
# END TEMPORARY WORKAROUND setup_logger
logger = setup_logger()

use_gpu = cuda_is_available() or mps_is_available()
logger.info("GPU enabled? %s", use_gpu)

# Helper functions
def get_X(adata: ad.AnnData, layer: typing.Optional[str], var_index: typing.Optional[str]):
    """Fetch the counts data from X or a layer. Subset columns by var_index if so desired."""
    if var_index:
        adata = adata[:, var_index]
    if layer:
        return adata.layers[layer]
    else:
        return adata.X
def get_obs(adata: ad.AnnData, obs_par_names):
    """Subset the obs dataframe to just the columns defined by the obs_label and obs_batch."""
    obs_columns = [par[x] for x in obs_par_names if par[x]]
    return adata.obs[obs_columns]
def get_var(adata: ad.AnnData, var_index: list[str]):
    """Fetch the var dataframe. Subset rows by var_index if so desired."""
    return adata.var.loc[var_index]

def main(par, meta):
    assert len(par["methods"]) >= 1, "Please, specify at least one method for cell typing."
    logger.info("Cell typing methods: {}".format(par["methods"]))

    ### PREPROCESSING REFERENCE ###
    logger.info("### PREPROCESSING REFERENCE ###")
    
    # take a look at reference data
    logger.info("Reading reference data '%s'", par["reference"])
    reference = ad.read_h5ad(par["reference"])
    
    logger.info("Setting reference var index to Ensembl IDs")
    reference.var["gene_symbol"] = list(reference.var.index)
    reference.var.index = [re.sub("\\\\\\\\.[0-9]+\\$", "", s) for s in reference.var["ensemblid"]]

    logger.info("Detect number of samples per label")
    min_celltype_size = np.min(reference.obs.groupby(par["reference_obs_batch"]).size())
    n_samples_per_label = np.max((min_celltype_size, 100))

    ### PREPROCESSING INPUT ###
    logger.info("### PREPROCESSING INPUT ###")
    logger.info("Reading '%s'", par["input"])
    input = mu.read_h5mu(par["input"])
    input_modality = input.mod[par["modality"]]

    # subset with var column
    if par["input_var_subset"]:
        logger.info("Subset input with .var['%s']", par["input_var_subset"])
        assert par["input_var_subset"] in input_modality.var, f"--input_var_subset='{par['input_var_subset']}' needs to be a column in .var"
        input_modality = input_modality[:,input_modality.var[par["input_var_subset"]]]

    ### ALIGN REFERENCE AND INPUT ###
    logger.info("### ALIGN REFERENCE AND INPUT ###")

    logger.info("Detecting common vars based on ensembl ids")
    common_ens_ids = list(set(reference.var.index).intersection(set(input_modality.var.index)))
    
    logger.info("  reference n_vars: %i", reference.n_vars)
    logger.info("  input n_vars: %i", input_modality.n_vars)
    logger.info("  intersect n_vars: %i", len(common_ens_ids))
    assert len(common_ens_ids) >= 100, "The intersection of genes is too small."

    # subset input objects to make sure popv is using the data we expect
    input_modality = ad.AnnData(
        X = get_X(input_modality, par["input_layer"], common_ens_ids),
        obs = get_obs(input_modality, ["input_obs_label", "input_obs_batch"]),
        var = get_var(input_modality, common_ens_ids)
    )
    reference = ad.AnnData(
        X = get_X(reference, par["reference_layer"], common_ens_ids),
        obs = get_obs(reference, ["reference_obs_label", "reference_obs_batch"]),
        var = get_var(reference, common_ens_ids)
    )

    # remove layers that 
    
    ### ALIGN REFERENCE AND INPUT ###
    logger.info("### ALIGN REFERENCE AND INPUT ###")

    with tempfile.TemporaryDirectory(prefix="popv-", dir=meta["temp_dir"]) as temp_dir:
        logger.info("Run PopV processing")
        pq = popv.preprocessing.Process_Query(
            # input
            query_adata=input_modality,
            query_labels_key=par["input_obs_label"],
            query_batch_key=par["input_obs_batch"],
            query_layers_key=None, # this is taken care of by subset
            # reference
            ref_adata=reference,
            ref_labels_key=par["reference_obs_label"],
            ref_batch_key=par["reference_obs_batch"],
            # options
            unknown_celltype_label=par["unknown_celltype_label"],
            n_samples_per_label=n_samples_per_label,
            # pretrained model
            # Might need to be parameterized at some point
            prediction_mode="retrain",
            pretrained_scvi_path=None,
            # outputs
            # Might need to be parameterized at some point
            save_path_trained_models=temp_dir,
            # hardcoded values
            cl_obo_folder=cl_obo_folder,
            use_gpu=use_gpu
        )
        method_kwargs = {}
        if 'scanorama' in par['methods']:
            method_kwargs['scanorama'] = {'approx': False}
        logger.info("Annotate data")
        popv.annotation.annotate_data(
            adata=pq.adata,
            methods=par["methods"],
            methods_kwargs=method_kwargs
        )

    popv_input = pq.adata[input_modality.obs_names]

    # select columns starting with "popv_"
    popv_obs_cols = popv_input.obs.columns[popv_input.obs.columns.str.startswith("popv_")]

    # create new data frame with selected columns
    df_popv = popv_input.obs[popv_obs_cols]

    # remove prefix from column names
    df_popv.columns = df_popv.columns.str.replace("popv_", "")

    # store output in mudata .obsm
    input.mod[par["modality"]].obsm["popv_output"] = df_popv

    # copy important output in mudata .obs
    for col in ["popv_prediction"]:
        if col in popv_input.obs.columns:
            input.mod[par["modality"]].obs[col] = popv_input.obs[col]

    # code to explore how the output differs from the original
    # for attr in ["obs", "var", "uns", "obsm", "layers", "obsp"]:
    #     old_keys = set(getattr(pq_adata_orig, attr).keys())
    #     new_keys = set(getattr(pq.adata, attr).keys())
    #     diff_keys = list(new_keys.difference(old_keys))
    #     diff_keys.sort()
    #     print(f"{attr}:", flush=True)
    #     for key in diff_keys:
    #         print(f"  {key}", flush=True)
    
    # write output
    logger.info("Writing %s", par["output"])
    input.write_h5mu(par["output"], compression=par["output_compression"])

if __name__ == "__main__":
    main(par, meta)
VIASHMAIN
python -B "$tempscript"
'''

thisDefaultProcessArgs = [
  // key to be used to trace the process and determine output names
  key: thisConfig.functionality.name,
  // fixed arguments to be passed to script
  args: [:],
  // default directives
  directives: jsonSlurper.parseText('''{
  "container" : {
    "registry" : "ghcr.io",
    "image" : "openpipelines-bio/annotate_popv",
    "tag" : "main_build"
  },
  "label" : [
    "highmem",
    "highcpu"
  ],
  "tag" : "$id"
}'''),
  // auto settings
  auto: jsonSlurper.parseText('''{
  "simplifyInput" : true,
  "simplifyOutput" : true,
  "transcript" : false,
  "publish" : false
}'''),

  // Apply a map over the incoming tuple
  // Example: `{ tup -> [ tup[0], [input: tup[1].output] ] + tup.drop(2) }`
  map: null,

  // Apply a map over the ID element of a tuple (i.e. the first element)
  // Example: `{ id -> id + "_foo" }`
  mapId: null,

  // Apply a map over the data element of a tuple (i.e. the second element)
  // Example: `{ data -> [ input: data.output ] }`
  mapData: null,

  // Apply a map over the passthrough elements of a tuple (i.e. the tuple excl. the first two elements)
  // Example: `{ pt -> pt.drop(1) }`
  mapPassthrough: null,

  // Filter the channel
  // Example: `{ tup -> tup[0] == "foo" }`
  filter: null,

  // Rename keys in the data field of the tuple (i.e. the second element)
  // Will likely be deprecated in favour of `fromState`.
  // Example: `[ "new_key": "old_key" ]`
  renameKeys: null,

  // Fetch data from the state and pass it to the module without altering the current state.
  // 
  // `fromState` should be `null`, `List[String]`, `Map[String, String]` or a function. 
  // 
  // - If it is `null`, the state will be passed to the module as is.
  // - If it is a `List[String]`, the data will be the values of the state at the given keys.
  // - If it is a `Map[String, String]`, the data will be the values of the state at the given keys, with the keys renamed according to the map.
  // - If it is a function, the tuple (`[id, state]`) in the channel will be passed to the function, and the result will be used as the data.
  // 
  // Example: `{ id, state -> [input: state.fastq_file] }`
  // Default: `null`
  fromState: null,

  // Determine how the state should be updated after the module has been run.
  // 
  // `toState` should be `null`, `List[String]`, `Map[String, String]` or a function.
  // 
  // - If it is `null`, the state will be replaced with the output of the module.
  // - If it is a `List[String]`, the state will be updated with the values of the data at the given keys.
  // - If it is a `Map[String, String]`, the state will be updated with the values of the data at the given keys, with the keys renamed according to the map.
  // - If it is a function, a tuple (`[id, output, state]`) will be passed to the function, and the result will be used as the new state.
  //
  // Example: `{ id, output, state -> state + [counts: state.output] }`
  // Default: `{ id, output, state -> output }`
  toState: null,

  // Whether or not to print debug messages
  // Default: `false`
  debug: false
]

// END CUSTOM CODE

/////////////////////////////////////
// Viash Workflow helper functions //
/////////////////////////////////////

import java.util.regex.Pattern
import java.io.BufferedReader
import java.io.FileReader
import java.nio.file.Paths
import java.nio.file.Files
import groovy.json.JsonSlurper
import groovy.text.SimpleTemplateEngine
import org.yaml.snakeyaml.Yaml

// param helpers //
def paramExists(name) {
  return params.containsKey(name) && params[name] != ""
}

def assertParamExists(name, description) {
  if (!paramExists(name)) {
    exit 1, "ERROR: Please provide a --${name} parameter ${description}"
  }
}

// helper functions for reading params from file //
def getChild(parent, child) {
  if (child.contains("://") || Paths.get(child).isAbsolute()) {
    child
  } else {
    def parentAbsolute = Paths.get(parent).toAbsolutePath().toString()
    parentAbsolute.replaceAll('/[^/]*$', "/") + child
  }
}

def readCsv(file_path) {
  def output = []
  def inputFile = file_path !instanceof Path ? file(file_path) : file_path

  // todo: allow escaped quotes in string
  // todo: allow single quotes?
  def splitRegex = Pattern.compile(''',(?=(?:[^"]*"[^"]*")*[^"]*$)''')
  def removeQuote = Pattern.compile('''"(.*)"''')

  def br = Files.newBufferedReader(inputFile)

  def row = -1
  def header = null
  while (br.ready() && header == null) {
    def line = br.readLine()
    row++
    if (!line.startsWith("#")) {
      header = splitRegex.split(line, -1).collect{field ->
        m = removeQuote.matcher(field)
        m.find() ? m.replaceFirst('$1') : field
      }
    }
  }
  assert header != null: "CSV file should contain a header"

  while (br.ready()) {
    def line = br.readLine()
    row++
    if (line == null) {
      br.close()
      break
    }

    if (!line.startsWith("#")) {
      def predata = splitRegex.split(line, -1)
      def data = predata.collect{field ->
        if (field == "") {
          return null
        }
        m = removeQuote.matcher(field)
        if (m.find()) {
          return m.replaceFirst('$1')
        } else {
          return field
        }
      }
      assert header.size() == data.size(): "Row $row should contain the same number as fields as the header"
      
      def dataMap = [header, data].transpose().collectEntries().findAll{it.value != null}
      output.add(dataMap)
    }
  }

  output
}

def readJsonBlob(str) {
  def jsonSlurper = new JsonSlurper()
  jsonSlurper.parseText(str)
}

def readJson(file_path) {
  def inputFile = file_path !instanceof Path ? file(file_path) : file_path
  def jsonSlurper = new JsonSlurper()
  jsonSlurper.parse(inputFile)
}

def readYamlBlob(str) {
  def yamlSlurper = new Yaml()
  yamlSlurper.load(str)
}

def readYaml(file_path) {
  def inputFile = file_path !instanceof Path ? file(file_path) : file_path
  def yamlSlurper = new Yaml()
  yamlSlurper.load(inputFile)
}

// helper functions for reading a viash config in groovy //

// based on how Functionality.scala is implemented
def processArgument(arg) {
  arg.multiple = arg.multiple != null ? arg.multiple : false
  arg.required = arg.required != null ? arg.required : false
  arg.direction = arg.direction != null ? arg.direction : "input"
  arg.multiple_sep = arg.multiple_sep != null ? arg.multiple_sep : ":"
  arg.plainName = arg.name.replaceAll("^-*", "")

  if (arg.type == "file") {
    arg.must_exist = arg.must_exist != null ? arg.must_exist : true
    arg.create_parent = arg.create_parent != null ? arg.create_parent : true
  }

  if (arg.type == "file" && arg.direction == "output") {
    def mult = arg.multiple ? "_*" : ""
    def extSearch = ""
    if (arg.default != null) {
      extSearch = arg.default
    } else if (arg.example != null) {
      extSearch = arg.example
    }
    if (extSearch instanceof List) {
      extSearch = extSearch[0]
    }
    def extSearchResult = extSearch.find("\\.[^\\.]+\$")
    def ext = extSearchResult != null ? extSearchResult : ""
    arg.default = "\$id.\$key.${arg.plainName}${mult}${ext}"
  }

  if (!arg.multiple) {
    if (arg.default != null && arg.default instanceof List) {
      arg.default = arg.default[0]
    }
    if (arg.example != null && arg.example instanceof List) {
      arg.example = arg.example[0]
    }
  }

  if (arg.type == "boolean_true") {
    arg.default = false
  }
  if (arg.type == "boolean_false") {
    arg.default = true
  }

  arg
}

// based on how Functionality.scala is implemented
def processArgumentGroup(argumentGroups, name, arguments) {
  def argNamesInGroups = argumentGroups.collectMany{it.arguments.findAll{it instanceof String}}.toSet()

  // Check if 'arguments' is in 'argumentGroups'. 
  def argumentsNotInGroup = arguments.findAll{arg -> !(argNamesInGroups.contains(arg.plainName))}

  // Check whether an argument group of 'name' exists.
  def existing = argumentGroups.find{gr -> name == gr.name}

  // if there are no arguments missing from the argument group, just return the existing group (if any)
  if (argumentsNotInGroup.isEmpty()) {
    return existing == null ? [] : [existing]
  
  // if there are missing arguments and there is an existing group, add the missing arguments to it
  } else if (existing != null) {
    def newEx = existing.clone()
    newEx.arguments.addAll(argumentsNotInGroup.findAll{it !instanceof String})
    return [newEx]

  // else create a new group
  } else {
    def newEx = [name: name, arguments: argumentsNotInGroup.findAll{it !instanceof String}]
    return [newEx]
  }
}

// based on how Functionality.scala is implemented
def processConfig(config) {
  // TODO: assert .functionality etc.
  if (config.functionality.inputs) {
    System.err.println("Warning: .functionality.inputs is deprecated. Please use .functionality.arguments instead.")
  }
  if (config.functionality.outputs) {
    System.err.println("Warning: .functionality.outputs is deprecated. Please use .functionality.arguments instead.")
  }

  // set defaults for inputs
  config.functionality.inputs = 
    (config.functionality.inputs != null ? config.functionality.inputs : []).collect{arg ->
      arg.type = arg.type != null ? arg.type : "file"
      arg.direction = "input"
      processArgument(arg)
    }
  // set defaults for outputs
  config.functionality.outputs = 
    (config.functionality.outputs != null ? config.functionality.outputs : []).collect{arg ->
      arg.type = arg.type != null ? arg.type : "file"
      arg.direction = "output"
      processArgument(arg)
    }
  // set defaults for arguments
  config.functionality.arguments = 
    (config.functionality.arguments != null ? config.functionality.arguments : []).collect{arg ->
      processArgument(arg)
    }
  // set defaults for argument_group arguments
  config.functionality.argument_groups =
    (config.functionality.argument_groups != null ? config.functionality.argument_groups : []).collect{grp ->
      grp.arguments = (grp.arguments != null ? grp.arguments : []).collect{arg ->
        arg instanceof String ? arg.replaceAll("^-*", "") : processArgument(arg)
      }
      grp
    }

  // create combined arguments list
  config.functionality.allArguments = 
    config.functionality.inputs +
    config.functionality.outputs +
    config.functionality.arguments +
    config.functionality.argument_groups.collectMany{ group ->
      group.arguments.findAll{ it !instanceof String }
    }
  
  // add missing argument groups (based on Functionality::allArgumentGroups())
  def argGroups = config.functionality.argument_groups
  def inputGroup = processArgumentGroup(argGroups, "Inputs", config.functionality.inputs)
  def outputGroup = processArgumentGroup(argGroups, "Outputs", config.functionality.outputs)
  def defaultGroup = processArgumentGroup(argGroups, "Arguments", config.functionality.arguments)
  def groupsFiltered = argGroups.findAll(gr -> !(["Inputs", "Outputs", "Arguments"].contains(gr.name)))
  config.functionality.allArgumentGroups = inputGroup + outputGroup + defaultGroup + groupsFiltered

  config
}

def readConfig(file) {
  def config = readYaml(file != null ? file : "$projectDir/config.vsh.yaml")
  processConfig(config)
}

// recursively merge two maps
def mergeMap(Map lhs, Map rhs) {
  return rhs.inject(lhs.clone()) { map, entry ->
    if (map[entry.key] instanceof Map && entry.value instanceof Map) {
      map[entry.key] = mergeMap(map[entry.key], entry.value)
    } else if (map[entry.key] instanceof Collection && entry.value instanceof Collection) {
      map[entry.key] += entry.value
    } else {
      map[entry.key] = entry.value
    }
    return map
  }
}

def addGlobalParams(config) {
  def localConfig = [
    "functionality" : [
      "argument_groups": [
        [
          "name": "Nextflow input-output arguments",
          "description": "Input/output parameters for Nextflow itself. Please note that both publishDir and publish_dir are supported but at least one has to be configured.",
          "arguments" : [
            [
              'name': '--publish_dir',
              'required': true,
              'type': 'string',
              'description': 'Path to an output directory.',
              'example': 'output/',
              'multiple': false
            ],
            [
              'name': '--param_list',
              'required': false,
              'type': 'string',
              'description': '''Allows inputting multiple parameter sets to initialise a Nextflow channel. A `param_list` can either be a list of maps, a csv file, a json file, a yaml file, or simply a yaml blob.
              |
              |* A list of maps (as-is) where the keys of each map corresponds to the arguments of the pipeline. Example: in a `nextflow.config` file: `param_list: [ ['id': 'foo', 'input': 'foo.txt'], ['id': 'bar', 'input': 'bar.txt'] ]`.
              |* A csv file should have column names which correspond to the different arguments of this pipeline. Example: `--param_list data.csv` with columns `id,input`.
              |* A json or a yaml file should be a list of maps, each of which has keys corresponding to the arguments of the pipeline. Example: `--param_list data.json` with contents `[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]`.
              |* A yaml blob can also be passed directly as a string. Example: `--param_list "[ {'id': 'foo', 'input': 'foo.txt'}, {'id': 'bar', 'input': 'bar.txt'} ]"`.
              |
              |When passing a csv, json or yaml file, relative path names are relativized to the location of the parameter file. No relativation is performed when `param_list` is a list of maps (as-is) or a yaml blob.'''.stripMargin(),
              'example': 'my_params.yaml',
              'multiple': false,
              'hidden': true
            ],
          ]
        ]
      ]
    ]
  ]

  return processConfig(mergeMap(config, localConfig))
}

// helper functions for generating help // 

// based on io.viash.helpers.Format.wordWrap
def formatWordWrap(str, maxLength) {
  def words = str.split("\\s").toList()

  def word = null
  def line = ""
  def lines = []
  while(!words.isEmpty()) {
    word = words.pop()
    if (line.length() + word.length() + 1 <= maxLength) {
      line = line + " " + word
    } else {
      lines.add(line)
      line = word
    }
    if (words.isEmpty()) {
      lines.add(line)
    }
  }
  return lines
}

// based on Format.paragraphWrap
def paragraphWrap(str, maxLength) {
  def outLines = []
  str.split("\n").each{par ->
    def words = par.split("\\s").toList()

    def word = null
    def line = words.pop()
    while(!words.isEmpty()) {
      word = words.pop()
      if (line.length() + word.length() + 1 <= maxLength) {
        line = line + " " + word
      } else {
        outLines.add(line)
        line = word
      }
    }
    if (words.isEmpty()) {
      outLines.add(line)
    }
  }
  return outLines
}

def generateArgumentHelp(param) {
  // alternatives are not supported
  // def names = param.alternatives ::: List(param.name)

  def unnamedProps = [
    ["required parameter", param.required],
    ["multiple values allowed", param.multiple],
    ["output", param.direction.toLowerCase() == "output"],
    ["file must exist", param.type == "file" && param.must_exist]
  ].findAll{it[1]}.collect{it[0]}
  
  def dflt = null
  if (param.default != null) {
    if (param.default instanceof List) {
      dflt = param.default.join(param.multiple_sep != null ? param.multiple_sep : ", ")
    } else {
      dflt = param.default.toString()
    }
  }
  def example = null
  if (param.example != null) {
    if (param.example instanceof List) {
      example = param.example.join(param.multiple_sep != null ? param.multiple_sep : ", ")
    } else {
      example = param.example.toString()
    }
  }
  def min = param.min?.toString()
  def max = param.max?.toString()

  def escapeChoice = { choice ->
    def s1 = choice.replaceAll("\\n", "\\\\n")
    def s2 = s1.replaceAll("\"", """\\\"""")
    s2.contains(",") || s2 != choice ? "\"" + s2 + "\"" : s2
  }
  def choices = param.choices == null ? 
    null : 
    "[ " + param.choices.collect{escapeChoice(it.toString())}.join(", ") + " ]"

  def namedPropsStr = [
    ["type", ([param.type] + unnamedProps).join(", ")],
    ["default", dflt],
    ["example", example],
    ["choices", choices],
    ["min", min],
    ["max", max]
  ]
    .findAll{it[1]}
    .collect{"\n        " + it[0] + ": " + it[1].replaceAll("\n", "\\n")}
    .join("")
  
  def descStr = param.description == null ?
    "" :
    paragraphWrap("\n" + param.description.trim(), 80 - 8).join("\n        ")
  
  "\n    --" + param.plainName +
    namedPropsStr +
    descStr
}

// Based on Helper.generateHelp() in Helper.scala
def generateHelp(config) {
  def fun = config.functionality

  // PART 1: NAME AND VERSION
  def nameStr = fun.name + 
    (fun.version == null ? "" : " " + fun.version)

  // PART 2: DESCRIPTION
  def descrStr = fun.description == null ? 
    "" :
    "\n\n" + paragraphWrap(fun.description.trim(), 80).join("\n")

  // PART 3: Usage
  def usageStr = fun.usage == null ? 
    "" :
    "\n\nUsage:\n" + fun.usage.trim()

  // PART 4: Options
  def argGroupStrs = fun.allArgumentGroups.collect{argGroup ->
    def name = argGroup.name
    def descriptionStr = argGroup.description == null ?
      "" :
      "\n    " + paragraphWrap(argGroup.description.trim(), 80-4).join("\n    ") + "\n"
    def arguments = argGroup.arguments.collect{arg -> 
      arg instanceof String ? fun.allArguments.find{it.plainName == arg} : arg
    }.findAll{it != null}
    def argumentStrs = arguments.collect{param -> generateArgumentHelp(param)}
    
    "\n\n$name:" +
      descriptionStr +
      argumentStrs.join("\n")
  }

  // FINAL: combine
  def out = nameStr + 
    descrStr +
    usageStr + 
    argGroupStrs.join("")

  return out
}

def helpMessage(config) {
  if (paramExists("help")) {
    def mergedConfig = addGlobalParams(config)
    def helpStr = generateHelp(mergedConfig)
    println(helpStr)
    exit 0
  }
}

def _guessParamListFormat(params) {
  if (!params.containsKey("param_list") || params.param_list == null) {
    "none"
  } else {
    def param_list = params.param_list

    if (param_list !instanceof String) {
      "asis"
    } else if (param_list.endsWith(".csv")) {
      "csv"
    } else if (param_list.endsWith(".json") || param_list.endsWith(".jsn")) {
      "json"
    } else if (param_list.endsWith(".yaml") || param_list.endsWith(".yml")) {
      "yaml"
    } else {
      "yaml_blob"
    }
  }
}

viashChannelDeprecationWarningPrinted = false

def paramsToList(params, config) {
  if (!viashChannelDeprecationWarningPrinted) {
    viashChannelDeprecationWarningPrinted = true
    System.err.println("Warning: paramsToList has deprecated in Viash 0.7.0. " +
                      "Please use a combination of channelFromParams and preprocessInputs.")
  }
  // fetch default params from functionality
  def defaultArgs = config.functionality.allArguments
    .findAll { it.containsKey("default") }
    .collectEntries { [ it.plainName, it.default ] }

  // fetch overrides in params
  def paramArgs = config.functionality.allArguments
    .findAll { params.containsKey(it.plainName) }
    .collectEntries { [ it.plainName, params[it.plainName] ] }
  
  // check multi input params
  // objects should be closures and not functions, thanks to FunctionDef
  def multiParamFormat = _guessParamListFormat(params)

  def multiOptionFunctions = [ 
    "csv": {[it, readCsv(it)]},
    "json": {[it, readJson(it)]},
    "yaml": {[it, readYaml(it)]},
    "yaml_blob": {[null, readYamlBlob(it)]},
    "asis": {[null, it]},
    "none": {[null, [[:]]]}
  ]
  assert multiOptionFunctions.containsKey(multiParamFormat): 
    "Format of provided --param_list not recognised.\n" +
    "You can use '--param_list_format' to manually specify the format.\n" +
    "Found: '$multiParamFormat'. Expected: one of 'csv', 'json', 'yaml', 'yaml_blob', 'asis' or 'none'"

  // fetch multi param inputs
  def multiOptionFun = multiOptionFunctions.get(multiParamFormat)
  // todo: add try catch
  def multiOptionOut = multiOptionFun(params.containsKey("param_list") ? params.param_list : "")
  def paramList = multiOptionOut[1]
  def multiFile = multiOptionOut[0]

  // data checks
  assert paramList instanceof List: "--param_list should contain a list of maps"
  for (value in paramList) {
    assert value instanceof Map: "--param_list should contain a list of maps"
  }
  
  // combine parameters
  def processedParams = paramList.collect{ multiParam ->
    // combine params
    def combinedArgs = defaultArgs + paramArgs + multiParam

    if (workflow.stubRun) {
      // if stub run, explicitly add an id if missing
      combinedArgs = [id: "stub"] + combinedArgs
    } else {
      // else check whether required arguments exist
      config.functionality.allArguments
        .findAll { it.required }
        .forEach { par ->
          assert combinedArgs.containsKey(par.plainName): "Argument ${par.plainName} is required but does not have a value"
        }
    }
    
    // process arguments
    def inputs = config.functionality.allArguments
      .findAll{ par -> combinedArgs.containsKey(par.plainName) }
      .collectEntries { par ->
        // split on 'multiple_sep'
        if (par.multiple) {
          parData = combinedArgs[par.plainName]
          if (parData instanceof List) {
            parData = parData.collect{it instanceof String ? it.split(par.multiple_sep) : it }
          } else if (parData instanceof String) {
            parData = parData.split(par.multiple_sep)
          } else if (parData == null) {
            parData = []
          } else {
            parData = [ parData ]
          }
        } else {
          parData = [ combinedArgs[par.plainName] ]
        }

        // flatten
        parData = parData.flatten()

        // cast types
        if (par.type == "file" && ((par.direction != null ? par.direction : "input") == "input")) {
          parData = parData.collect{path ->
            if (path !instanceof String) {
              path
            } else if (multiFile) {
              file(getChild(multiFile, path))
            } else {
              file(path)
            }
          }.flatten()
        } else if (par.type == "integer") {
          parData = parData.collect{it as Integer}
        } else if (par.type == "double") {
          parData = parData.collect{it as Double}
        } else if (par.type == "boolean" || par.type == "boolean_true" || par.type == "boolean_false") {
          parData = parData.collect{it as Boolean}
        }
        // simplify list to value if need be
        if (!par.multiple) {
          assert parData.size() == 1 : 
            "Error: argument ${par.plainName} has too many values.\n" +
            "  Expected amount: 1. Found: ${parData.size()}"
          parData = parData[0]
        }

        // return pair
        [ par.plainName, parData ]
      }
      // remove parameters which were explicitly set to null
      .findAll{ par -> par != null }
    }
    
  
  // check processed params
  processedParams.forEach { args ->
    assert args.containsKey("id"): "Each argument set should have an 'id'. Argument set: $args"
  }
  def ppIds = processedParams.collect{it.id}
  assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds"

  processedParams
}

def paramsToChannel(params, config) {
  if (!viashChannelDeprecationWarningPrinted) {
    viashChannelDeprecationWarningPrinted = true
    System.err.println("Warning: paramsToChannel has deprecated in Viash 0.7.0. " +
                      "Please use a combination of channelFromParams and preprocessInputs.")
  }
  Channel.fromList(paramsToList(params, config))
}

def viashChannel(params, config) {
  if (!viashChannelDeprecationWarningPrinted) {
    viashChannelDeprecationWarningPrinted = true
    System.err.println("Warning: viashChannel has deprecated in Viash 0.7.0. " +
                      "Please use a combination of channelFromParams and preprocessInputs.")
  }
  paramsToChannel(params, config)
    | map{tup -> [tup.id, tup]}
}

/**
 * Split parameters for arguments that accept multiple values using their separator
 *
 * @param paramList A Map containing parameters to split.
 * @param config A Map of the Viash configuration. This Map can be generated from the config file
 *               using the readConfig() function.
 *
 * @return A Map of parameters where the parameter values have been split into a list using
 *         their seperator.
 */
Map<String, Object> _splitParams(Map<String, Object> parValues, Map config){
  def parsedParamValues = parValues.collectEntries { parName, parValue ->
    def parameterSettings = config.functionality.allArguments.find({it.plainName == parName})

    if (!parameterSettings) {
      // if argument is not found, do not alter 
      return [parName, parValue]
    }
    if (parameterSettings.multiple) { // Check if parameter can accept multiple values
      if (parValue instanceof Collection) {
          parValue = parValue.collect{it instanceof String ? it.split(parameterSettings.multiple_sep) : it }
      } else if (parValue instanceof String) {
          parValue = parValue.split(parameterSettings.multiple_sep)
      } else if (parValue == null) {
          parValue = []
      } else {
          parValue = [ parValue ]
      }
      parValue = parValue.flatten()
    }
    // For all parameters check if multiple values are only passed for
    // arguments that allow it. Quietly simplify lists of length 1.
    if (!parameterSettings.multiple && parValue instanceof Collection) {
      assert parValue.size() == 1 : 
      "Error: argument ${parName} has too many values.\n" +
      "  Expected amount: 1. Found: ${parValue.size()}"
      parValue = parValue[0]
    }
    [parName, parValue]
  }
  return parsedParamValues
}

/**
 * Check if the ids are unique across parameter sets
 *
 * @param parameterSets a list of parameter sets.
 */
private void _checkUniqueIds(List<Tuple2<String, Map<String, Object>>> parameterSets) {
  def ppIds = parameterSets.collect{it[0]}
  assert ppIds.size() == ppIds.unique().size() : "All argument sets should have unique ids. Detected ids: $ppIds"
}

/**
 * Resolve the file paths in the parameters relative to given path
 *
 * @param paramList A Map containing parameters to process.
 *                  This function assumes that files are still of type String.
 * @param config A Map of the Viash configuration. This Map can be generated from the config file
 *               using the readConfig() function.
 * @param relativeTo path of a file to resolve the parameters values to.
 *
 * @return A map of parameters where the location of the input file parameters have been resolved
 *         resolved relatively to the provided path.
 */
private Map<String, Object> _resolvePathsRelativeTo(Map paramList, Map config, String relativeTo) {
  paramList.collectEntries { parName, parValue ->
    argSettings = config.functionality.allArguments.find{it.plainName == parName}
    if (argSettings && argSettings.type == "file" && argSettings.direction == "input") {
      if (parValue instanceof Collection) {
        parValue = parValue.collect({path -> 
          path !instanceof String ? path : file(getChild(relativeTo, path))
        })
      } else {
        parValue = parValue !instanceof String ? path : file(getChild(relativeTo, parValue))
      }
    }
    [parName, parValue]
  }
}

/**
 * Parse nextflow parameters based on settings defined in a viash config 
 * and return a nextflow channel.
 *
 * @param params Input parameters from nextflow.
 * @param config A Map of the Viash configuration. This Map can be generated from the config file
 *               using the readConfig() function.
 *
 * @return A list of parameter sets that were parsed from the 'param_list' argument value.
 */
private List<Tuple2<String, Map>> _parseParamListArguments(Map params, Map config){
  // first try to guess the format (if not set in params)
  def paramListFormat = _guessParamListFormat(params)

  // get the correct parser function for the detected params_list format
  def paramListParsers = [ 
    "csv": {[it, readCsv(it)]},
    "json": {[it, readJson(it)]},
    "yaml": {[it, readYaml(it)]},
    "yaml_blob": {[null, readYamlBlob(it)]},
    "asis": {[null, it]},
    "none": {[null, [[:]]]}
  ]
  assert paramListParsers.containsKey(paramListFormat):
    "Format of provided --param_list not recognised.\n" +
    "You can use '--param_list_format' to manually specify the format.\n" +
    "Found: '$paramListFormat'. Expected: one of 'csv', 'json', "+
    "'yaml', 'yaml_blob', 'asis' or 'none'"
  def paramListParser = paramListParsers.get(paramListFormat)

  // fetch multi param inputs
  def paramListOut = paramListParser(params.containsKey("param_list") ? params.param_list : "")
  // multiFile is null if the value passed to param_list was not a file (e.g a blob)
  // If the value was indeed a file, multiFile contains the location that file (used later).
  def paramListFile = paramListOut[0]
  def paramSets = paramListOut[1] // these are the actual parameters from reading the blob/file

  // data checks
  assert paramSets instanceof List: "--param_list should contain a list of maps"
  for (value in paramSets) {
    assert value instanceof Map: "--param_list should contain a list of maps"
  }

  // Reformat from List<Map> to List<Tuple2<String, Map>> by adding the ID as first element of a Tuple2
  paramSets = paramSets.collect({ paramValues ->
    [paramValues.get("id", null), paramValues.findAll{it.key != 'id'}]
  })
  // Split parameters with 'multiple: true'
  paramSets = paramSets.collect({ id, paramValues ->
    def splitParamValues = _splitParams(paramValues, config)
    [id, splitParamValues]
  })
  
  // The paths of input files inside a param_list file may have been specified relatively to the
  // location of the param_list file. These paths must be made absolute.
  if (paramListFile){
    paramSets = paramSets.collect({ id, paramValues ->
      def relativeParamValues = _resolvePathsRelativeTo(paramValues, config, paramListFile)
      [id, relativeParamValues]
    })
  }

  return paramSets
}

/**
 * Cast parameters to the correct type as defined in the Viash config
 *
 * @param parValues A Map of input arguments.
 *
 * @return The input arguments that have been cast to the type from the viash config.
 */

private Map<String, Object> _castParamTypes(Map<String, Object> parValues, Map config) {
  // Cast the input to the correct type according to viash config
  def castParValues = parValues.collectEntries({ parName, parValue ->
    paramSettings = config.functionality.allArguments.find({it.plainName == parName})
    // dont parse parameters like publish_dir ( in which case paramSettings = null)
    parType = paramSettings ? paramSettings.get("type", null) : null
    if (parValue !instanceof Collection) {
      parValue = [parValue]
    }
    if (parType == "file" && ((paramSettings.direction != null ? paramSettings.direction : "input") == "input")) {
      parValue = parValue.collect{ path ->
        if (path !instanceof String) {
          path
        } else {
          file(path)
        }
      }
    } else if (parType == "integer") {
      parValue = parValue.collect{it as Integer}
    } else if (parType == "double") {
      parValue = parValue.collect{it as Double}
    } else if (parType == "boolean" || 
                parType == "boolean_true" || 
                parType == "boolean_false") {
      parValue = parValue.collect{it as Boolean}
    }

    // simplify list to value if need be
    if (paramSettings && !paramSettings.multiple) {
      assert parValue.size() == 1 : 
        "Error: argument ${parName} has too many values.\n" +
        "  Expected amount: 1. Found: ${parValue.size()}"
      parValue = parValue[0]
    }
    [parName, parValue]
  })
  return castParValues
}

/**
 * Apply the argument settings specified in a Viash config to a single parameter set.
 *    - Split the parameter values according to their seperator if 
 *       the parameter accepts multiple values
 *    - Cast the parameters to their corect types.
 *    - Assertions:
 *        ~ Check if any unknown parameters are found
 * 
 * @param paramValues A Map of parameter to be processed. All parameters must 
 *                    also be specified in the Viash config.
 * @param config: A Map of the Viash configuration. This Map can be generated from 
 *                the config file using the readConfig() function.
 * @return The input parameters that have been processed.
 */
Map<String, Object> applyConfigToOneParameterSet(Map<String, Object> paramValues, Map config){
  def splitParamValues = _splitParams(paramValues, config)
  def castParamValues = _castParamTypes(splitParamValues, config)

  // Check if any unexpected arguments were passed
  def knownParams = config.functionality.allArguments.collect({it.plainName}) + ["publishDir", "publish_dir"]
  castParamValues.each({parName, parValue ->
      assert parName in knownParams: "Unknown parameter. Parameter $parName should be in $knownParams"
  })
  return castParamValues
}

/**
 * Apply the argument settings specified in a Viash config to a list of parameter sets.
 *    - Split the parameter values according to their seperator if 
 *       the parameter accepts multiple values
 *    - Cast the parameters to their corect types.
 *    - Assertions:
 *        ~ Check if any unknown parameters are found
 *        ~ Check if the ID of the parameter set is unique across all sets.
 * 
 * @return The input parameters that have been processed.
 */

List<Tuple> applyConfig(List<Tuple> parameterSets, Map config){
  def processedparameterSets = parameterSets.collect({ parameterSet ->
    def id = parameterSet[0]
    def paramValues = parameterSet[1]
    def passthrough = parameterSet.drop(2)
    def processedSet = applyConfigToOneParameterSet(paramValues, config)
    [id, processedSet] + passthrough
  })

  _checkUniqueIds(processedparameterSets)
  return processedparameterSets
}

/**
 * Parse nextflow parameters based on settings defined in a viash config.
 * Return a list of parameter sets, each parameter set corresponding to 
 * an event in a nextflow channel. The output from this function can be used
 * with Channel.fromList to create a nextflow channel with Vdsl3 formatted 
 * events.
 *
 * This function performs:
 *   - A filtering of the params which can be found in the config file.
 *   - Process the params_list argument which allows a user to to initialise 
 *     a Vsdl3 channel with multiple parameter sets. Possible formats are 
 *     csv, json, yaml, or simply a yaml_blob. A csv should have column names 
 *     which correspond to the different arguments of this pipeline. A json or a yaml
 *     file should be a list of maps, each of which has keys corresponding to the
 *     arguments of the pipeline. A yaml blob can also be passed directly as a parameter.
 *     When passing a csv, json or yaml, relative path names are relativized to the
 *     location of the parameter file.
 *   - Combine the parameter sets into a vdsl3 Channel.
 *
 * @param params Input parameters. Can optionaly contain a 'param_list' key that
 *               provides a list of arguments that can be split up into multiple events
 *               in the output channel possible formats of param_lists are: a csv file, 
 *               json file, a yaml file or a yaml blob. Each parameters set (event) must
 *               have a unique ID.
 * @param config A Map of the Viash configuration. This Map can be generated from the config file
 *               using the readConfig() function.
 * 
 * @return A list of parameters with the first element of the event being
 *         the event ID and the second element containing a map of the parsed parameters.
 */
 
private List<Tuple2<String, Map<String, Object>>> _paramsToParamSets(Map params, Map config){
  /* parse regular parameters (not in param_list)  */
  /*************************************************/
  def globalParams = config.functionality.allArguments
    .findAll { params.containsKey(it.plainName) }
    .collectEntries { [ it.plainName, params[it.plainName] ] }
  def globalID = params.get("id", null)
  def globalParamsValues = applyConfigToOneParameterSet(globalParams.findAll{it.key != 'id'}, config)

  /* process params_list arguments */
  /*********************************/
  def paramSets = _parseParamListArguments(params, config)
  def parameterSetsWithConfigApplied = applyConfig(paramSets, config)

  /* combine arguments into channel */
  /**********************************/
  def processedParams = parameterSetsWithConfigApplied.indexed().collect{ index, paramSet ->
    def id = paramSet[0]
    def parValues = paramSet[1]
    id = [id, globalID].find({it != null}) // first non-null element
  
    if (workflow.stubRun) {
      // if stub run, explicitly add an id if missing
      id = id ? id : "stub" + index
    }
    assert id != null: "Each parameter set should have at least an ID."
    // Add regular parameters together with parameters passed with 'param_list'
    def combinedArgsValues = globalParamsValues + parValues

    // Remove parameters which are null, if the default is also null
    combinedArgsValues = combinedArgsValues.collectEntries{paramName, paramValue ->
      parameterSettings = config.functionality.allArguments.find({it.plainName == paramName})
      if ( paramValue != null || parameterSettings.get("default", null) != null ) {
        [paramName, paramValue]
      }
    }
    [id, combinedArgsValues]
  }

  // Check if ids (first element of each list) is unique
  _checkUniqueIds(processedParams)
  return processedParams
}

/**
 * Parse nextflow parameters based on settings defined in a viash config 
 * and return a nextflow channel.
 * 
 * @param params Input parameters. Can optionaly contain a 'param_list' key that
 *               provides a list of arguments that can be split up into multiple events
 *               in the output channel possible formats of param_lists are: a csv file, 
 *               json file, a yaml file or a yaml blob. Each parameters set (event) must
 *               have a unique ID.
 * @param config A Map of the Viash configuration. This Map can be generated from the config file
 *               using the readConfig() function.
 * 
 * @return A nextflow Channel with events. Events are formatted as a tuple that contains 
 *         first contains the ID of the event and as second element holds a parameter map.
 *       
 *
 */
def channelFromParams(Map params, Map config) {
  processedParams = _paramsToParamSets(params, config)
  return Channel.fromList(processedParams)
}

/**
 * Process a list of Vdsl3 formatted parameters and apply a Viash config to them:
 *    - Gather default parameters from the Viash config and make 
 *      sure that they are correctly formatted (see applyConfig method).
 *    - Format the input parameters (also using the applyConfig method).
 *    - Apply the default parameter to the input parameters.
 *    - Do some assertions:
 *        ~ Check if the event IDs in the channel are unique.
 *  
 * @param params A list of parameter sets as Tuples. The first element of the tuples
 *                must be a unique id of the parameter set, and the second element 
 *                must contain the parameters themselves. Optional extra elements 
 *                of the tuples will be passed to the output as is.
 * @param config A Map of the Viash configuration. This Map can be generated from 
 *                the config file using the readConfig() function.           
 *
 * @return A list of processed parameters sets as tuples.
 */

private List<Tuple> _preprocessInputsList(List<Tuple> params, Map config) {
  // Get different parameter types (used throughout this function)
  def defaultArgs = config.functionality.allArguments
    .findAll { it.containsKey("default") }
    .collectEntries { [ it.plainName, it.default ] }

  // Apply config to default parameters
  def parsedDefaultValues = applyConfigToOneParameterSet(defaultArgs, config)

  // Apply config to input parameters
  def parsedInputParamSets = applyConfig(params, config)

  // Merge two parameter sets together
  def parsedArgs = parsedInputParamSets.collect({ parsedInputParamSet ->
    def id = parsedInputParamSet[0]
    def parValues = parsedInputParamSet[1]
    def passthrough = parsedInputParamSet.drop(2)
    def parValuesWithDefault = parsedDefaultValues + parValues
    [id, parValuesWithDefault] + passthrough
  })
  _checkUniqueIds(parsedArgs)

  return parsedArgs
}

/**
 * Generate a nextflow Workflow that allows processing a channel of 
 * Vdsl3 formatted events and apply a Viash config to them:
 *    - Gather default parameters from the Viash config and make 
 *      sure that they are correctly formatted (see applyConfig method).
 *    - Format the input parameters (also using the applyConfig method).
 *    - Apply the default parameter to the input parameters.
 *    - Do some assertions:
 *        ~ Check if the event IDs in the channel are unique.
 * 
 * The events in the channel are formatted as tuples, with the 
 * first element of the tuples being a unique id of the parameter set, 
 * and the second element containg the the parameters themselves.
 * Optional extra elements of the tuples will be passed to the output as is.
 *
 * @param args A map that must contain a 'config' key that points
 *              to a parsed config (see readConfig()). Optionally, a
 *              'key' key can be provided which can be used to create a unique
 *              name for the workflow process.
 *
 * @return A workflow that allows processing a channel of Vdsl3 formatted events
 * and apply a Viash config to them.
 */
def preprocessInputs(Map args) {
  wfKey = args.key != null ? args.key : "preprocessInputs"
  config = args.config
  workflow preprocessInputsInstance {
    take: 
    input_ch

    main:
    assert config instanceof Map : 
      "Error in preprocessInputs: config must be a map. " +
      "Expected class: Map. Found: config.getClass() is ${config.getClass()}"

    output_ch = input_ch
      | toSortedList
      | map { paramList -> _preprocessInputsList(paramList, config) }
      | flatMap
    emit:
    output_ch
  }

  return preprocessInputsInstance.cloneWithName(wfKey)
}

////////////////////////////
// VDSL3 helper functions //
////////////////////////////

import nextflow.Nextflow
import nextflow.script.IncludeDef
import nextflow.script.ScriptBinding
import nextflow.script.ScriptMeta
import nextflow.script.ScriptParser

// retrieve resourcesDir here to make sure the correct path is found
resourcesDir = ScriptMeta.current().getScriptPath().getParent()

def assertMapKeys(map, expectedKeys, requiredKeys, mapName) {
  assert map instanceof Map : "Expected argument '$mapName' to be a Map. Found: class ${map.getClass()}"
  map.forEach { key, val -> 
    assert key in expectedKeys : "Unexpected key '$key' in ${mapName ? mapName + " " : ""}map"
  }
  requiredKeys.forEach { requiredKey -> 
    assert map.containsKey(requiredKey) : "Missing required key '$key' in ${mapName ? mapName + " " : ""}map"
  }
}

// TODO: unit test processDirectives
def processDirectives(Map drctv) {
  // remove null values
  drctv = drctv.findAll{k, v -> v != null}

  /* DIRECTIVE accelerator
    accepted examples:
    - [ limit: 4, type: "nvidia-tesla-k80" ]
  */
  if (drctv.containsKey("accelerator")) {
    assertMapKeys(drctv["accelerator"], ["type", "limit", "request", "runtime"], [], "accelerator")
  }

  /* DIRECTIVE afterScript
    accepted examples:
    - "source /cluster/bin/cleanup"
  */
  if (drctv.containsKey("afterScript")) {
    assert drctv["afterScript"] instanceof CharSequence
  }

  /* DIRECTIVE beforeScript
    accepted examples:
    - "source /cluster/bin/setup"
  */
  if (drctv.containsKey("beforeScript")) {
    assert drctv["beforeScript"] instanceof CharSequence
  }

  /* DIRECTIVE cache
    accepted examples:
    - true
    - false
    - "deep"
    - "lenient"
  */
  if (drctv.containsKey("cache")) {
    assert drctv["cache"] instanceof CharSequence || drctv["cache"] instanceof Boolean
    if (drctv["cache"] instanceof CharSequence) {
      assert drctv["cache"] in ["deep", "lenient"] : "Unexpected value for cache"
    }
  }

  /* DIRECTIVE conda
    accepted examples:
    - "bwa=0.7.15"
    - "bwa=0.7.15 fastqc=0.11.5"
    - ["bwa=0.7.15", "fastqc=0.11.5"]
  */
  if (drctv.containsKey("conda")) {
    if (drctv["conda"] instanceof List) {
      drctv["conda"] = drctv["conda"].join(" ")
    }
    assert drctv["conda"] instanceof CharSequence
  }

  /* DIRECTIVE container
    accepted examples:
    - "foo/bar:tag"
    - [ registry: "reg", image: "im", tag: "ta" ]
      is transformed to "reg/im:ta"
    - [ image: "im" ] 
      is transformed to "im:latest"
  */
  if (drctv.containsKey("container")) {
    assert drctv["container"] instanceof Map || drctv["container"] instanceof CharSequence
    if (drctv["container"] instanceof Map) {
      def m = drctv["container"]
      assertMapKeys(m, [ "registry", "image", "tag" ], ["image"], "container")
      def part1 = 
        System.getenv('OVERRIDE_CONTAINER_REGISTRY') ? System.getenv('OVERRIDE_CONTAINER_REGISTRY') + "/" : 
        params.containsKey("override_container_registry") ? params["override_container_registry"] + "/" : // todo: remove?
        m.registry ? m.registry + "/" : 
        ""
      def part2 = m.image
      def part3 = m.tag ? ":" + m.tag : ":latest"
      drctv["container"] = part1 + part2 + part3
    }
  }

  /* DIRECTIVE containerOptions
    accepted examples:
    - "--foo bar"
    - ["--foo bar", "-f b"]
  */
  if (drctv.containsKey("containerOptions")) {
    if (drctv["containerOptions"] instanceof List) {
      drctv["containerOptions"] = drctv["containerOptions"].join(" ")
    }
    assert drctv["containerOptions"] instanceof CharSequence
  }

  /* DIRECTIVE cpus
    accepted examples:
    - 1
    - 10
  */
  if (drctv.containsKey("cpus")) {
    assert drctv["cpus"] instanceof Integer
  }

  /* DIRECTIVE disk
    accepted examples:
    - "1 GB"
    - "2TB"
    - "3.2KB"
    - "10.B"
  */
  if (drctv.containsKey("disk")) {
    assert drctv["disk"] instanceof CharSequence
    // assert drctv["disk"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B")
    // ^ does not allow closures
  }

  /* DIRECTIVE echo
    accepted examples:
    - true
    - false
  */
  if (drctv.containsKey("echo")) {
    assert drctv["echo"] instanceof Boolean
  }

  /* DIRECTIVE errorStrategy
    accepted examples:
    - "terminate"
    - "finish"
  */
  if (drctv.containsKey("errorStrategy")) {
    assert drctv["errorStrategy"] instanceof CharSequence
    assert drctv["errorStrategy"] in ["terminate", "finish", "ignore", "retry"] : "Unexpected value for errorStrategy"
  }

  /* DIRECTIVE executor
    accepted examples:
    - "local"
    - "sge"
  */
  if (drctv.containsKey("executor")) {
    assert drctv["executor"] instanceof CharSequence
    assert drctv["executor"] in ["local", "sge", "uge", "lsf", "slurm", "pbs", "pbspro", "moab", "condor", "nqsii", "ignite", "k8s", "awsbatch", "google-pipelines"] : "Unexpected value for executor"
  }

  /* DIRECTIVE machineType
    accepted examples:
    - "n1-highmem-8"
  */
  if (drctv.containsKey("machineType")) {
    assert drctv["machineType"] instanceof CharSequence
  }

  /* DIRECTIVE maxErrors
    accepted examples:
    - 1
    - 3
  */
  if (drctv.containsKey("maxErrors")) {
    assert drctv["maxErrors"] instanceof Integer
  }

  /* DIRECTIVE maxForks
    accepted examples:
    - 1
    - 3
  */
  if (drctv.containsKey("maxForks")) {
    assert drctv["maxForks"] instanceof Integer
  }

  /* DIRECTIVE maxRetries
    accepted examples:
    - 1
    - 3
  */
  if (drctv.containsKey("maxRetries")) {
    assert drctv["maxRetries"] instanceof Integer
  }

  /* DIRECTIVE memory
    accepted examples:
    - "1 GB"
    - "2TB"
    - "3.2KB"
    - "10.B"
  */
  if (drctv.containsKey("memory")) {
    assert drctv["memory"] instanceof CharSequence
    // assert drctv["memory"].matches("[0-9]+(\\.[0-9]*)? *[KMGTPEZY]?B")
    // ^ does not allow closures
  }

  /* DIRECTIVE module
    accepted examples:
    - "ncbi-blast/2.2.27"
    - "ncbi-blast/2.2.27:t_coffee/10.0"
    - ["ncbi-blast/2.2.27", "t_coffee/10.0"]
  */
  if (drctv.containsKey("module")) {
    if (drctv["module"] instanceof List) {
      drctv["module"] = drctv["module"].join(":")
    }
    assert drctv["module"] instanceof CharSequence
  }

  /* DIRECTIVE penv
    accepted examples:
    - "smp"
  */
  if (drctv.containsKey("penv")) {
    assert drctv["penv"] instanceof CharSequence
  }

  /* DIRECTIVE pod
    accepted examples:
    - [ label: "key", value: "val" ]
    - [ annotation: "key", value: "val" ]
    - [ env: "key", value: "val" ]
    - [ [label: "l", value: "v"], [env: "e", value: "v"]]
  */
  if (drctv.containsKey("pod")) {
    if (drctv["pod"] instanceof Map) {
      drctv["pod"] = [ drctv["pod"] ]
    }
    assert drctv["pod"] instanceof List
    drctv["pod"].forEach { pod ->
      assert pod instanceof Map
      // TODO: should more checks be added?
      // See https://www.nextflow.io/docs/latest/process.html?highlight=directives#pod
      // e.g. does it contain 'label' and 'value', or 'annotation' and 'value', or ...?
    }
  }

  /* DIRECTIVE publishDir
    accepted examples:
    - []
    - [ [ path: "foo", enabled: true ], [ path: "bar", enabled: false ] ]
    - "/path/to/dir" 
      is transformed to [[ path: "/path/to/dir" ]]
    - [ path: "/path/to/dir", mode: "cache" ]
      is transformed to [[ path: "/path/to/dir", mode: "cache" ]]
  */
  // TODO: should we also look at params["publishDir"]?
  if (drctv.containsKey("publishDir")) {
    def pblsh = drctv["publishDir"]
    
    // check different options
    assert pblsh instanceof List || pblsh instanceof Map || pblsh instanceof CharSequence
    
    // turn into list if not already so
    // for some reason, 'if (!pblsh instanceof List) pblsh = [ pblsh ]' doesn't work.
    pblsh = pblsh instanceof List ? pblsh : [ pblsh ]

    // check elements of publishDir
    pblsh = pblsh.collect{ elem ->
      // turn into map if not already so
      elem = elem instanceof CharSequence ? [ path: elem ] : elem

      // check types and keys
      assert elem instanceof Map : "Expected publish argument '$elem' to be a String or a Map. Found: class ${elem.getClass()}"
      assertMapKeys(elem, [ "path", "mode", "overwrite", "pattern", "saveAs", "enabled" ], ["path"], "publishDir")

      // check elements in map
      assert elem.containsKey("path")
      assert elem["path"] instanceof CharSequence
      if (elem.containsKey("mode")) {
        assert elem["mode"] instanceof CharSequence
        assert elem["mode"] in [ "symlink", "rellink", "link", "copy", "copyNoFollow", "move" ]
      }
      if (elem.containsKey("overwrite")) {
        assert elem["overwrite"] instanceof Boolean
      }
      if (elem.containsKey("pattern")) {
        assert elem["pattern"] instanceof CharSequence
      }
      if (elem.containsKey("saveAs")) {
        assert elem["saveAs"] instanceof CharSequence //: "saveAs as a Closure is currently not supported. Surround your closure with single quotes to get the desired effect. Example: '\{ foo \}'"
      }
      if (elem.containsKey("enabled")) {
        assert elem["enabled"] instanceof Boolean
      }

      // return final result
      elem
    }
    // store final directive
    drctv["publishDir"] = pblsh
  }

  /* DIRECTIVE queue
    accepted examples:
    - "long"
    - "short,long"
    - ["short", "long"]
  */
  if (drctv.containsKey("queue")) {
    if (drctv["queue"] instanceof List) {
      drctv["queue"] = drctv["queue"].join(",")
    }
    assert drctv["queue"] instanceof CharSequence
  }

  /* DIRECTIVE label
    accepted examples:
    - "big_mem"
    - "big_cpu"
    - ["big_mem", "big_cpu"]
  */
  if (drctv.containsKey("label")) {
    if (drctv["label"] instanceof CharSequence) {
      drctv["label"] = [ drctv["label"] ]
    }
    assert drctv["label"] instanceof List
    drctv["label"].forEach { label ->
      assert label instanceof CharSequence
      // assert label.matches("[a-zA-Z0-9]([a-zA-Z0-9_]*[a-zA-Z0-9])?")
      // ^ does not allow closures
    }
  }

  /* DIRECTIVE scratch
    accepted examples:
    - true
    - "/path/to/scratch"
    - '$MY_PATH_TO_SCRATCH'
    - "ram-disk"
  */
  if (drctv.containsKey("scratch")) {
    assert drctv["scratch"] == true || drctv["scratch"] instanceof CharSequence
  }

  /* DIRECTIVE storeDir
    accepted examples:
    - "/path/to/storeDir"
  */
  if (drctv.containsKey("storeDir")) {
    assert drctv["storeDir"] instanceof CharSequence
  }

  /* DIRECTIVE stageInMode
    accepted examples:
    - "copy"
    - "link"
  */
  if (drctv.containsKey("stageInMode")) {
    assert drctv["stageInMode"] instanceof CharSequence
    assert drctv["stageInMode"] in ["copy", "link", "symlink", "rellink"]
  }

  /* DIRECTIVE stageOutMode
    accepted examples:
    - "copy"
    - "link"
  */
  if (drctv.containsKey("stageOutMode")) {
    assert drctv["stageOutMode"] instanceof CharSequence
    assert drctv["stageOutMode"] in ["copy", "move", "rsync"]
  }

  /* DIRECTIVE tag
    accepted examples:
    - "foo"
    - '$id'
  */
  if (drctv.containsKey("tag")) {
    assert drctv["tag"] instanceof CharSequence
  }

  /* DIRECTIVE time
    accepted examples:
    - "1h"
    - "2days"
    - "1day 6hours 3minutes 30seconds"
  */
  if (drctv.containsKey("time")) {
    assert drctv["time"] instanceof CharSequence
    // todo: validation regex?
  }

  return drctv
}

// TODO: unit test processAuto
def processAuto(Map auto) {
  // remove null values
  auto = auto.findAll{k, v -> v != null}

  expectedKeys = ["simplifyInput", "simplifyOutput", "transcript", "publish"]

  // check whether expected keys are all booleans (for now)
  for (key in expectedKeys) {
    assert auto.containsKey(key)
    assert auto[key] instanceof Boolean
  }

  return auto.subMap(expectedKeys)
}

def processProcessArgs(Map args) {
  // override defaults with args
  def processArgs = thisDefaultProcessArgs + args

  // check whether 'key' exists
  assert processArgs.containsKey("key") : "Error in module '${thisConfig.functionality.name}': key is a required argument"

  // if 'key' is a closure, apply it to the original key
  if (processArgs["key"] instanceof Closure) {
    processArgs["key"] = processArgs["key"](thisConfig.functionality.name)
  }
  def key = processArgs["key"]
  assert key instanceof CharSequence : "Expected process argument 'key' to be a String. Found: class ${key.getClass()}"
  assert key ==~ /^[a-zA-Z_]\w*$/ : "Error in module '$key': Expected process argument 'key' to consist of only letters, digits or underscores. Found: ${key}"

  // check whether directives exists and apply defaults
  assert processArgs.containsKey("directives") : "Error in module '$key': directives is a required argument"
  assert processArgs["directives"] instanceof Map : "Error in module '$key': Expected process argument 'directives' to be a Map. Found: class ${processArgs['directives'].getClass()}"
  processArgs["directives"] = processDirectives(thisDefaultProcessArgs.directives + processArgs["directives"])

  // check whether directives exists and apply defaults
  assert processArgs.containsKey("auto") : "Error in module '$key': auto is a required argument"
  assert processArgs["auto"] instanceof Map : "Error in module '$key': Expected process argument 'auto' to be a Map. Found: class ${processArgs['auto'].getClass()}"
  processArgs["auto"] = processAuto(thisDefaultProcessArgs.auto + processArgs["auto"])

  // auto define publish, if so desired
  if (processArgs.auto.publish == true && (processArgs.directives.publishDir != null ? processArgs.directives.publishDir : [:]).isEmpty()) {
    // can't assert at this level thanks to the no_publish profile
    // assert params.containsKey("publishDir") || params.containsKey("publish_dir") : 
    //   "Error in module '${processArgs['key']}': if auto.publish is true, params.publish_dir needs to be defined.\n" +
    //   "  Example: params.publish_dir = \"./output/\""
    def publishDir = 
      params.containsKey("publish_dir") ? params.publish_dir : 
      params.containsKey("publishDir") ? params.publishDir : 
      null
    
    if (publishDir != null) {
      processArgs.directives.publishDir = [[ 
        path: publishDir, 
        saveAs: "{ it.startsWith('.') ? null : it }", // don't publish hidden files, by default
        mode: "copy"
      ]]
    }
  }

  // auto define transcript, if so desired
  if (processArgs.auto.transcript == true) {
    // can't assert at this level thanks to the no_publish profile
    // assert params.containsKey("transcriptsDir") || params.containsKey("transcripts_dir") || params.containsKey("publishDir") || params.containsKey("publish_dir") : 
    //   "Error in module '${processArgs['key']}': if auto.transcript is true, either params.transcripts_dir or params.publish_dir needs to be defined.\n" +
    //   "  Example: params.transcripts_dir = \"./transcripts/\""
    def transcriptsDir = 
      params.containsKey("transcripts_dir") ? params.transcripts_dir : 
      params.containsKey("transcriptsDir") ? params.transcriptsDir : 
      params.containsKey("publish_dir") ? params.publish_dir + "/_transcripts" :
      params.containsKey("publishDir") ? params.publishDir + "/_transcripts" : 
      null
    if (transcriptsDir != null) {
      def timestamp = Nextflow.getSession().getWorkflowMetadata().start.format('yyyy-MM-dd_HH-mm-ss')
      def transcriptsPublishDir = [ 
        path: "$transcriptsDir/$timestamp/\${task.process.replaceAll(':', '-')}/\${id}/",
        saveAs: "{ it.startsWith('.') ? it.replaceAll('^.', '') : null }", 
        mode: "copy"
      ]
      def publishDirs = processArgs.directives.publishDir != null ? processArgs.directives.publishDir : null ? processArgs.directives.publishDir : []
      processArgs.directives.publishDir = publishDirs + transcriptsPublishDir
    }
  }

  // if this is a stubrun, remove certain directives?
  if (workflow.stubRun) {
    processArgs.directives.keySet().removeAll(["publishDir", "cpus", "memory", "label"])
  }

  for (nam in ["map", "mapId", "mapData", "mapPassthrough", "filter"]) {
    if (processArgs.containsKey(nam) && processArgs[nam]) {
      assert processArgs[nam] instanceof Closure : "Error in module '$key': Expected process argument '$nam' to be null or a Closure. Found: class ${processArgs[nam].getClass()}"
    }
  }

  // check fromState
  assert processArgs.containsKey("fromState") : "Error in module '$key': fromState is a required argument"
  def fromState = processArgs["fromState"]
  assert fromState == null || fromState instanceof Closure || fromState instanceof Map || fromState instanceof List :
    "Error in module '$key': Expected process argument 'fromState' to be null, a Closure, a Map, or a List. Found: class ${fromState.getClass()}"
  if (fromState) {
    // if fromState is a List, convert to map
    if (fromState instanceof List) {
      // check whether fromstate is a list[string]
      assert fromState.every{it instanceof CharSequence} : "Error in module '$key': fromState is a List, but not all elements are Strings"
      fromState = fromState.collectEntries{[it, it]}
    }

    // if fromState is a map, convert to closure
    if (fromState instanceof Map) {
      // check whether fromstate is a map[string, string]
      assert fromState.values().every{it instanceof CharSequence} : "Error in module '$key': fromState is a Map, but not all values are Strings"
      assert fromState.keySet().every{it instanceof CharSequence} : "Error in module '$key': fromState is a Map, but not all keys are Strings"
      def fromStateMap = fromState.clone()
      // turn the map into a closure to be used later on
      fromState = { it ->
        def state = it[1] 
        assert state instanceof Map : "Error in module '$key': the state is not a Map"
        def data = fromStateMap.collectEntries{newkey, origkey ->
          // check whether all values of fromState are in state
          assert state.containsKey(origkey) : "Error in module '$key': fromState key '$origkey' not found in current state"
          [newkey, state[origkey]]
        }
        data
      }
    }

    processArgs["fromState"] = fromState
  }

  // check toState
  def toState = processArgs["toState"]

  if (toState == null) {
    toState = { tup -> tup[1] }
  }

  // toState should be a closure, map[string, string], or list[string]
  assert toState instanceof Closure || toState instanceof Map || toState instanceof List :
    "Error in module '$key': Expected process argument 'toState' to be a Closure, a Map, or a List. Found: class ${toState.getClass()}"

  // if toState is a List, convert to map
  if (toState instanceof List) {
    // check whether toState is a list[string]
    assert toState.every{it instanceof CharSequence} : "Error in module '$key': toState is a List, but not all elements are Strings"
    toState = toState.collectEntries{[it, it]}
  }

  // if toState is a map, convert to closure
  if (toState instanceof Map) {
    // check whether toState is a map[string, string]
    assert toState.values().every{it instanceof CharSequence} : "Error in module '$key': toState is a Map, but not all values are Strings"
    assert toState.keySet().every{it instanceof CharSequence} : "Error in module '$key': toState is a Map, but not all keys are Strings"
    def toStateMap = toState.clone()
    // turn the map into a closure to be used later on
    toState = { it ->
      def output = it[1]
      def state = it[2]
      assert output instanceof Map : "Error in module '$key': the output is not a Map"
      assert state instanceof Map : "Error in module '$key': the state is not a Map"
      def extraEntries = toStateMap.collectEntries{newkey, origkey ->
        // check whether all values of toState are in output
        assert output.containsKey(origkey) : "Error in module '$key': toState key '$origkey' not found in current output"
        [newkey, output[origkey]]
      }
      state + extraEntries
    }
  }

  processArgs["toState"] = toState

  // return output
  return processArgs
}

def processFactory(Map processArgs) {
  // autodetect process key
  def wfKey = processArgs["key"]
  def procKeyPrefix = "${wfKey}_process"
  def meta = ScriptMeta.current()
  def existing = meta.getProcessNames().findAll{it.startsWith(procKeyPrefix)}
  def numbers = existing.collect{it.replace(procKeyPrefix, "0").toInteger()}
  def newNumber = (numbers + [-1]).max() + 1

  def procKey = newNumber == 0 ? procKeyPrefix : "$procKeyPrefix$newNumber"

  if (newNumber > 0) {
    log.warn "Key for module '${wfKey}' is duplicated.\n",
      "If you run a component multiple times in the same workflow,\n" +
      "it's recommended you set a unique key for every call,\n" +
      "for example: ${wfKey}.run(key: \"foo\")."
  }

  // subset directives and convert to list of tuples
  def drctv = processArgs.directives

  // TODO: unit test the two commands below
  // convert publish array into tags
  def valueToStr = { val ->
    // ignore closures
    if (val instanceof CharSequence) {
      if (!val.matches('^[{].*[}]$')) {
        '"' + val + '"'
      } else {
        val
      }
    } else if (val instanceof List) {
      "[" + val.collect{valueToStr(it)}.join(", ") + "]"
    } else if (val instanceof Map) {
      "[" + val.collect{k, v -> k + ": " + valueToStr(v)}.join(", ") + "]"
    } else {
      val.inspect()
    }
  }

  // multiple entries allowed: label, publishdir
  def drctvStrs = drctv.collect { key, value ->
    if (key in ["label", "publishDir"]) {
      value.collect{ val ->
        if (val instanceof Map) {
          "\n$key " + val.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ")
        } else if (val == null) {
          ""
        } else {
          "\n$key " + valueToStr(val)
        }
      }.join()
    } else if (value instanceof Map) {
      "\n$key " + value.collect{ k, v -> k + ": " + valueToStr(v) }.join(", ")
    } else {
      "\n$key " + valueToStr(value)
    }
  }.join()

  def inputPaths = thisConfig.functionality.allArguments
    .findAll { it.type == "file" && it.direction == "input" }
    .collect { ', path(viash_par_' + it.plainName + ')' }
    .join()

  def outputPaths = thisConfig.functionality.allArguments
    .findAll { it.type == "file" && it.direction == "output" }
    .collect { par ->
      // insert dummy into every output (see nextflow-io/nextflow#2678)
      if (!par.multiple) {
        ', path{[".exitcode", args.' + par.plainName + ']}'
      } else {
        ', path{[".exitcode"] + args.' + par.plainName + '}'
      }
    }
    .join()

  // TODO: move this functionality somewhere else?
  if (processArgs.auto.transcript) {
    outputPaths = outputPaths + ', path{[".exitcode", ".command*"]}'
  } else {
    outputPaths = outputPaths + ', path{[".exitcode"]}'
  }

  // create dirs for output files (based on BashWrapper.createParentFiles)
  def createParentStr = thisConfig.functionality.allArguments
    .findAll { it.type == "file" && it.direction == "output" && it.create_parent }
    .collect { par -> 
      "\${ args.containsKey(\"${par.plainName}\") ? \"mkdir_parent \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"] : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }"
    }
    .join("\n")

  // construct inputFileExports
  def inputFileExports = thisConfig.functionality.allArguments
    .findAll { it.type == "file" && it.direction.toLowerCase() == "input" }
    .collect { par ->
      viash_par_contents = !par.required && !par.multiple ? "viash_par_${par.plainName}[0]" : "viash_par_${par.plainName}.join(\"${par.multiple_sep}\")"
      "\n\${viash_par_${par.plainName}.empty ? \"\" : \"export VIASH_PAR_${par.plainName.toUpperCase()}=\\\"\" + ${viash_par_contents} + \"\\\"\"}"
    }

  // NOTE: if using docker, use /tmp instead of tmpDir!
  def tmpDir = java.nio.file.Paths.get(
    System.getenv('NXF_TEMP') ?: 
    System.getenv('VIASH_TEMP') ?: 
    System.getenv('VIASH_TMPDIR') ?: 
    System.getenv('VIASH_TEMPDIR') ?: 
    System.getenv('VIASH_TMP') ?: 
    System.getenv('TEMP') ?: 
    System.getenv('TMPDIR') ?: 
    System.getenv('TEMPDIR') ?:
    System.getenv('TMP') ?: 
    '/tmp'
  ).toAbsolutePath()

  // construct stub
  def stub = thisConfig.functionality.allArguments
    .findAll { it.type == "file" && it.direction == "output" }
    .collect { par -> 
      "\${ args.containsKey(\"${par.plainName}\") ? \"touch2 \\\"\" + (args[\"${par.plainName}\"] instanceof String ? args[\"${par.plainName}\"].replace(\"_*\", \"_0\") : args[\"${par.plainName}\"].join('\" \"')) + \"\\\"\" : \"\" }"
    }
    .join("\n")

  // escape script
  def escapedScript = thisScript.replace('\\', '\\\\').replace('$', '\\$').replace('"""', '\\"\\"\\"')

  // publishdir assert
  def assertStr = processArgs.auto.publish || processArgs.auto.transcript ? 
    """\nassert task.publishDir.size() > 0: "if auto.publish is true, params.publish_dir needs to be defined.\\n  Example: --publish_dir './output/'" """ :
    ""

  // generate process string
  def procStr = 
  """nextflow.enable.dsl=2
  |
  |process $procKey {$drctvStrs
  |input:
  |  tuple val(id)$inputPaths, val(args), path(resourcesDir)
  |output:
  |  tuple val("\$id")$outputPaths, optional: true
  |stub:
  |\"\"\"
  |touch2() { mkdir -p "\\\$(dirname "\\\$1")" && touch "\\\$1" ; }
  |$stub
  |\"\"\"
  |script:$assertStr
  |def escapeText = { s -> s.toString().replaceAll('([`"])', '\\\\\\\\\$1') }
  |def parInject = args
  |  .findAll{key, value -> value != null}
  |  .collect{key, value -> "export VIASH_PAR_\${key.toUpperCase()}=\\\"\${escapeText(value)}\\\""}
  |  .join("\\n")
  |\"\"\"
  |# meta exports
  |export VIASH_META_RESOURCES_DIR="\${resourcesDir.toRealPath().toAbsolutePath()}"
  |export VIASH_META_TEMP_DIR="${['docker', 'podman', 'charliecloud'].any{ it == workflow.containerEngine } ? '/tmp' : tmpDir}"
  |export VIASH_META_FUNCTIONALITY_NAME="${thisConfig.functionality.name}"
  |export VIASH_META_EXECUTABLE="\\\$VIASH_META_RESOURCES_DIR/\\\$VIASH_META_FUNCTIONALITY_NAME"
  |export VIASH_META_CONFIG="\\\$VIASH_META_RESOURCES_DIR/.config.vsh.yaml"
  |\${task.cpus ? "export VIASH_META_CPUS=\$task.cpus" : "" }
  |\${task.memory?.bytes != null ? "export VIASH_META_MEMORY_B=\$task.memory.bytes" : "" }
  |if [ ! -z \\\${VIASH_META_MEMORY_B+x} ]; then
  |  export VIASH_META_MEMORY_KB=\\\$(( (\\\$VIASH_META_MEMORY_B+1023) / 1024 ))
  |  export VIASH_META_MEMORY_MB=\\\$(( (\\\$VIASH_META_MEMORY_KB+1023) / 1024 ))
  |  export VIASH_META_MEMORY_GB=\\\$(( (\\\$VIASH_META_MEMORY_MB+1023) / 1024 ))
  |  export VIASH_META_MEMORY_TB=\\\$(( (\\\$VIASH_META_MEMORY_GB+1023) / 1024 ))
  |  export VIASH_META_MEMORY_PB=\\\$(( (\\\$VIASH_META_MEMORY_TB+1023) / 1024 ))
  |fi
  |
  |# meta synonyms
  |export VIASH_TEMP="\\\$VIASH_META_TEMP_DIR"
  |export TEMP_DIR="\\\$VIASH_META_TEMP_DIR"
  |
  |# create output dirs if need be
  |function mkdir_parent {
  |  for file in "\\\$@"; do 
  |    mkdir -p "\\\$(dirname "\\\$file")"
  |  done
  |}
  |$createParentStr
  |
  |# argument exports${inputFileExports.join()}
  |\$parInject
  |
  |# process script
  |${escapedScript}
  |\"\"\"
  |}
  |""".stripMargin()

  // TODO: print on debug
  // if (processArgs.debug == true) {
  //   println("######################\n$procStr\n######################")
  // }

  // create runtime process
  def ownerParams = new ScriptBinding.ParamsMap()
  def binding = new ScriptBinding().setParams(ownerParams)
  def module = new IncludeDef.Module(name: procKey)
  def scriptParser = new ScriptParser(session)
    .setModule(true)
    .setBinding(binding)
  scriptParser.scriptPath = ScriptMeta.current().getScriptPath()
  def moduleScript = scriptParser.runScript(procStr)
    .getScript()

  // register module in meta
  meta.addModule(moduleScript, module.name, module.alias)

  // retrieve and return process from meta
  return meta.getProcess(procKey)
}

def debug(processArgs, debugKey) {
  if (processArgs.debug) {
    view { "process '${processArgs.key}' $debugKey tuple: $it"  }
  } else {
    map { it }
  }
}

def workflowFactory(Map args) {
  def processArgs = processProcessArgs(args)
  def key = processArgs["key"]
  def meta = ScriptMeta.current()

  def workflowKey = key

  def processObj = null
  
  workflow workflowInstance {
    take:
    input_

    main:
    if (processObj == null) {
      processObj = processFactory(processArgs)
    }

    mid1_ = input_
      | debug(processArgs, "input")
      | map { tuple ->
        tuple = tuple.clone()
        
        if (processArgs.map) {
          tuple = processArgs.map(tuple)
        }
        if (processArgs.mapId) {
          tuple[0] = processArgs.mapId(tuple[0])
        }
        if (processArgs.mapData) {
          tuple[1] = processArgs.mapData(tuple[1])
        }
        if (processArgs.mapPassthrough) {
          tuple = tuple.take(2) + processArgs.mapPassthrough(tuple.drop(2))
        }

        // check tuple
        assert tuple instanceof List : 
          "Error in module '${key}': element in channel should be a tuple [id, data, ...otherargs...]\n" +
          "  Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" +
          "  Expected class: List. Found: tuple.getClass() is ${tuple.getClass()}"
        assert tuple.size() >= 2 : 
          "Error in module '${key}': expected length of tuple in input channel to be two or greater.\n" +
          "  Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" +
          "  Found: tuple.size() == ${tuple.size()}"
        
        // check id field
        assert tuple[0] instanceof CharSequence : 
          "Error in module '${key}': first element of tuple in channel should be a String\n" +
          "  Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" +
          "  Found: ${tuple[0]}"
        
        // match file to input file
        if (processArgs.auto.simplifyInput && (tuple[1] instanceof Path || tuple[1] instanceof List)) {
          def inputFiles = thisConfig.functionality.allArguments
            .findAll { it.type == "file" && it.direction == "input" }
          
          assert inputFiles.size() == 1 : 
              "Error in module '${key}' id '${tuple[0]}'.\n" +
              "  Anonymous file inputs are only allowed when the process has exactly one file input.\n" +
              "  Expected: inputFiles.size() == 1. Found: inputFiles.size() is ${inputFiles.size()}"

          tuple[1] = [[ inputFiles[0].plainName, tuple[1] ]].collectEntries()
        }

        // check data field
        assert tuple[1] instanceof Map : 
          "Error in module '${key}' id '${tuple[0]}': second element of tuple in channel should be a Map\n" +
          "  Example: [\"id\", [input: file('foo.txt'), arg: 10]].\n" +
          "  Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}"

        // rename keys of data field in tuple
        if (processArgs.renameKeys) {
          assert processArgs.renameKeys instanceof Map : 
              "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" +
              "  Example: renameKeys: ['new_key': 'old_key'].\n" +
              "  Expected class: Map. Found: renameKeys.getClass() is ${processArgs.renameKeys.getClass()}"
          assert tuple[1] instanceof Map : 
              "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" +
              "  Expected class: Map. Found: tuple[1].getClass() is ${tuple[1].getClass()}"

          // TODO: allow renameKeys to be a function?
          processArgs.renameKeys.each { newKey, oldKey ->
            assert newKey instanceof CharSequence : 
              "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" +
              "  Example: renameKeys: ['new_key': 'old_key'].\n" +
              "  Expected class of newKey: String. Found: newKey.getClass() is ${newKey.getClass()}"
            assert oldKey instanceof CharSequence : 
              "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" +
              "  Example: renameKeys: ['new_key': 'old_key'].\n" +
              "  Expected class of oldKey: String. Found: oldKey.getClass() is ${oldKey.getClass()}"
            assert tuple[1].containsKey(oldKey) : 
              "Error renaming data keys in module '${key}' id '${tuple[0]}'.\n" +
              "  Key '$oldKey' is missing in the data map. tuple[1].keySet() is '${tuple[1].keySet()}'"
            tuple[1].put(newKey, tuple[1][oldKey])
          }
          tuple[1].keySet().removeAll(processArgs.renameKeys.collect{ newKey, oldKey -> oldKey })
        }
        tuple
      }

    if (processArgs.filter) {
      mid2_ = mid1_
        | filter{processArgs.filter(it)}
    } else {
      mid2_ = mid1_
    }

    if (processArgs.fromState) {
      mid3_ = mid2_
        | map{
          def new_data = processArgs["fromState"](it.take(2))
          [it[0], new_data]
        }
    } else {
      mid3_ = mid2_
    }

    out0_ = mid3_
      | debug(processArgs, "processed")
      | map { tuple ->
        def id = tuple[0]
        def data = tuple[1]

        // fetch default params from functionality
        def defaultArgs = thisConfig.functionality.allArguments
          .findAll { it.containsKey("default") }
          .collectEntries { [ it.plainName, it.default ] }

        // fetch overrides in params
        def paramArgs = thisConfig.functionality.allArguments
          .findAll { par ->
            def argKey = key + "__" + par.plainName
            params.containsKey(argKey) && params[argKey] != "viash_no_value"
          }
          .collectEntries { [ it.plainName, params[key + "__" + it.plainName] ] }
        
        // fetch overrides in data
        def dataArgs = thisConfig.functionality.allArguments
          .findAll { data.containsKey(it.plainName) }
          .collectEntries { [ it.plainName, data[it.plainName] ] }
        
        // combine params
        def combinedArgs = defaultArgs + paramArgs + processArgs.args + dataArgs

        // remove arguments with explicit null values
        combinedArgs.removeAll{it.value == null}

        if (workflow.stubRun) {
          // add id if missing
          combinedArgs = [id: 'stub'] + combinedArgs
        } else {
          // check whether required arguments exist
          thisConfig.functionality.allArguments
            .forEach { par ->
              if (par.required) {
                assert combinedArgs.containsKey(par.plainName): "Argument ${par.plainName} is required but does not have a value"
              }
            }
        }

        // TODO: check whether parameters have the right type

        // process input files separately
        def inputPaths = thisConfig.functionality.allArguments
          .findAll { it.type == "file" && it.direction == "input" }
          .collect { par ->
            def val = combinedArgs.containsKey(par.plainName) ? combinedArgs[par.plainName] : []
            def inputFiles = []
            if (val == null) {
              inputFiles = []
            } else if (val instanceof List) {
              inputFiles = val
            } else if (val instanceof Path) {
              inputFiles = [ val ]
            } else {
              inputFiles = []
            }
            if (!workflow.stubRun) {
              // throw error when an input file doesn't exist
              inputFiles.each{ file -> 
                assert file.exists() :
                  "Error in module '${key}' id '${id}' argument '${par.plainName}'.\n" +
                  "  Required input file does not exist.\n" +
                  "  Path: '$file'.\n" +
                  "  Expected input file to exist"
              }
            }
            inputFiles 
          } 

        // remove input files
        def argsExclInputFiles = thisConfig.functionality.allArguments
          .findAll { (it.type != "file" || it.direction != "input") && combinedArgs.containsKey(it.plainName) }
          .collectEntries { par ->
            def parName = par.plainName
            def val = combinedArgs[parName]
            if (par.multiple && val instanceof Collection) {
              val = val.join(par.multiple_sep)
            }
            if (par.direction == "output" && par.type == "file") {
              val = val.replaceAll('\\$id', id).replaceAll('\\$key', key)
            }
            [parName, val]
          }

        [ id ] + inputPaths + [ argsExclInputFiles, resourcesDir ]
      }
      | processObj
      | map { output ->
        def outputFiles = thisConfig.functionality.allArguments
          .findAll { it.type == "file" && it.direction == "output" }
          .indexed()
          .collectEntries{ index, par ->
            out = output[index + 1]
            // strip dummy '.exitcode' file from output (see nextflow-io/nextflow#2678)
            if (!out instanceof List || out.size() <= 1) {
              if (par.multiple) {
                out = []
              } else {
                assert !par.required :
                    "Error in module '${key}' id '${output[0]}' argument '${par.plainName}'.\n" +
                    "  Required output file is missing"
                out = null
              }
            } else if (out.size() == 2 && !par.multiple) {
              out = out[1]
            } else {
              out = out.drop(1)
            }
            [ par.plainName, out ]
          }
        
        // drop null outputs
        outputFiles.removeAll{it.value == null}

        if (processArgs.auto.simplifyOutput && outputFiles.size() == 1) {
          outputFiles = outputFiles.values()[0]
        }

        [ output[0], outputFiles ]
      }

    // join the output [id, output] with the previous state [id, state, ...]
    out1_ = out0_.join(mid2_, failOnDuplicate: true)
      // input tuple format: [id, output, prev_state, ...]
      // output tuple format: [id, new_state, ...]
      | map{
        def new_state = processArgs["toState"](it)
        [it[0], new_state] + it.drop(3)
      }
      | debug(processArgs, "output")


    emit:
    out1_
  }

  def wf = workflowInstance.cloneWithName(workflowKey)

  // add factory function
  wf.metaClass.run = { runArgs ->
    workflowFactory(runArgs)
  }
  // add config to module for later introspection
  wf.metaClass.config = thisConfig

  return wf
}

// initialise default workflow
myWfInstance = workflowFactory([:])

// add workflow to environment
ScriptMeta.current().addDefinition(myWfInstance)

// anonymous workflow for running this module as a standalone
workflow {
  def mergedConfig = thisConfig
  def mergedParams = [:] + params

  // add id argument if it's not already in the config
  if (mergedConfig.functionality.arguments.every{it.plainName != "id"}) {
    def idArg = [
      'name': '--id',
      'required': false,
      'type': 'string',
      'description': 'A unique id for every entry.',
      'multiple': false
    ]
    mergedConfig.functionality.arguments.add(0, idArg)
    mergedConfig = processConfig(mergedConfig)
  }
  if (!mergedParams.containsKey("id")) {
    mergedParams.id = "run"
  }

  helpMessage(mergedConfig)

  channelFromParams(mergedParams, mergedConfig)
    | preprocessInputs("config": mergedConfig)
    | view { "input: $it" }
    | myWfInstance.run(
      auto: [ publish: true ]
    )
    | view { "output: $it" }
}