From e280092671cd635ee47299ba042f02fab32e3246 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 26 Apr 2023 11:08:17 +0200
Subject: [PATCH 01/45] Start modifying samplesheet check (untested)

---
 bin/check_samplesheet.py | 64 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 16c8279a..4663a8f5 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -33,10 +33,23 @@ class RowChecker:
         ".fasta.gz",
     )

+    VALID_PROTEIN_FORMATS = (
+        ".faa",
+        ".fasta",
+        ".fa"
+    )
+
+    VALID_FEATURE_FORMATS = (
+        ".gbk",
+        ".gff"
+    )
+
     def __init__(
         self,
         sample_col="sample",
         contig_col="fasta",
+        protein_col="protein"
+        feature_col="feature"
         **kwargs,
     ):
         """
@@ -46,18 +59,23 @@ def __init__(
             sample_col (str): The name of the column that contains a contig's
                 identifier (default "sample").
             contig_col (str): The name of the column that contains the contig's
-                FASTA file path (default "fastqa").
-
+                FASTA file path (default "fasta").
+            protein_col (str): The name of the column that contains the contig's
+                amino acid FASTA file path (default "protein").
+            feature_col (str): The name of the column that contains the contig's
+                feature file path (default "feature").
         """
         super().__init__(**kwargs)
         self._sample_col = sample_col
         self._contig_col = contig_col
+        self._protein_col = protein_col
+        self._feature_col = feature_col
         self._seen = set()
         self.modified = []

     def validate_and_transform(self, row):
         """
-        Perform all validations on the given row and insert the read pairing status.
+        Perform all validations on the given row.

         Args:
             row (dict): A mapping from column headers (keys) to elements of that row
@@ -67,7 +85,11 @@ def validate_and_transform(self, row):
         self._validate_sample(row)
         self._validate_fasta(row)
         self._validate_fasta_format(row)
-        self._seen.add((row[self._sample_col], row[self._contig_col]))
+        self._validate_protein(row)
+        self._validate_protein_format(row)
+        self._validate_feature(row)
+        self._validate_feature_format(row)
+        self._seen.add((row[self._sample_col], row[self._contig_col], row[self._protein_col], row[self._feature_col]))
         self.modified.append(row)

     def _validate_sample(self, row):
@@ -85,13 +107,41 @@ def _validate_fasta(self, row):
         ), f"The FASTA filename may not contain any spaces '{row[self._contig_col]}'."

     def _validate_fasta_format(self, row):
-        """Assert that a given filename has one of the expected FASTQ extensions."""
+        """Assert that a given filename has one of the expected FASTA extensions."""
         filename = Path(row[self._contig_col]).name
         assert any(filename.endswith(extension) for extension in self.VALID_FORMATS), (
             f"The FASTA file has an unrecognized extension: {filename}\n"
             f"It should be one of: {', '.join(self.VALID_FORMATS)}"
         )

+    def _validate_protein(self, row):
+        """Assert that the amino acid FASTA entry has the right format."""
+        assert len(row[self._contig_col]) > 0 and (
+            " " not in Path(row[self._protein_col]).name
+        ), f"The FASTA filename may not contain any spaces '{row[self._protein_col]}'."
+
+    def _validate_protein_format(self, row):
+        """Assert that a given filename has one of the expected amino acid FASTA extensions."""
+        filename = Path(row[self._contig_col]).name
+        assert any(filename.endswith(extension) for extension in self.VALID_PROTEIN_FORMATS), (
+            f"The protein FASTA file has an unrecognized extension: {filename}\n"
+            f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
+        )
+
+    def _validate_feature(self, row):
+        """Assert that the feature file entry has the right format."""
+        assert len(row[self._contig_col]) > 0 and (
+            " " not in Path(row[self._feature_col]).name
+        ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."
+
+
+    def _validate_feature_format(self, row):
+        """Assert that a given filename has one of the expected feature extensions."""
+        filename = Path(row[self._contig_col]).name
+        assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
+            f"The FASTA file has an unrecognized extension: {filename}\n"
+            f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
+        )

 def read_head(handle, num_lines=10):
     """Read the specified number of lines from the current position in the file."""
@@ -141,8 +191,8 @@ def check_samplesheet(file_in, file_out):
     Example:
         This function checks that the samplesheet follows the following structure::

-            sample,fasta
-            contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz
+            sample,fasta,protein,feature
+            contig_1,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/bacteroides_fragilis/genome/genome.fna.gz,genome.faa.gz,genome.gbk

     """
     required_columns = {"sample", "fasta"}
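The new `VALID_PROTEIN_FORMATS` and `VALID_FEATURE_FORMATS` tuples drive the same `endswith`-based check already used for contigs. A minimal, self-contained sketch of that pattern (hypothetical helper, not the pipeline script itself):

```python
from pathlib import Path

# Hypothetical re-implementation of the extension check performed by RowChecker.
VALID_PROTEIN_FORMATS = (".faa", ".fasta", ".fa")

def has_valid_extension(path: str, valid_formats: tuple) -> bool:
    """Return True if the filename ends with one of the accepted extensions."""
    filename = Path(path).name
    return any(filename.endswith(ext) for ext in valid_formats)

assert has_valid_extension("sample1.faa", VALID_PROTEIN_FORMATS)
assert not has_valid_extension("sample1.gbk", VALID_PROTEIN_FORMATS)
```

Checking `Path(path).name` rather than the raw string keeps directory components out of the comparison, which is why the validators above take the same approach.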
From 9fbf52ced0553438e070523f16da6e1d7d5d0b49 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 10 May 2023 14:51:41 +0200
Subject: [PATCH 02/45] Made the samplesheet work if columns are existing
 (python error if the new columns not present)

---
 bin/check_samplesheet.py | 30 +++++++++++++++++-------------
 conf/base.config         |  2 +-
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 4663a8f5..00b6d70a 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -48,8 +48,8 @@ def __init__(
         self,
         sample_col="sample",
         contig_col="fasta",
-        protein_col="protein"
-        feature_col="feature"
+        protein_col="protein",
+        feature_col="feature",
         **kwargs,
     ):
         """
@@ -116,17 +116,19 @@ def _validate_fasta_format(self, row):

     def _validate_protein(self, row):
         """Assert that the amino acid FASTA entry has the right format."""
-        assert len(row[self._contig_col]) > 0 and (
+        if len(row[self._protein_col]) > 0:
+            assert (
                 " " not in Path(row[self._protein_col]).name
-        ), f"The FASTA filename may not contain any spaces '{row[self._protein_col]}'."
+            ), f"The protein FASTA filename may not contain any spaces '{row[self._protein_col]}'."

     def _validate_protein_format(self, row):
         """Assert that a given filename has one of the expected amino acid FASTA extensions."""
-        filename = Path(row[self._contig_col]).name
-        assert any(filename.endswith(extension) for extension in self.VALID_PROTEIN_FORMATS), (
-            f"The protein FASTA file has an unrecognized extension: {filename}\n"
-            f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
-        )
+        filename = Path(row[self._protein_col]).name
+        if len(row[self._protein_col]) > 0:
+            assert any(filename.endswith(extension) for extension in self.VALID_PROTEIN_FORMATS), (
+                f"The protein FASTA file has an unrecognized extension: {filename}\n"
+                f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
+            )

     def _validate_feature(self, row):
         """Assert that the feature file entry has the right format."""
@@ -138,10 +140,11 @@ def _validate_feature(self, row):

     def _validate_feature_format(self, row):
         """Assert that a given filename has one of the expected feature extensions."""
         filename = Path(row[self._contig_col]).name
-        assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
-            f"The feature file has an unrecognized extension: {filename}\n"
-            f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
-        )
+        if len(row[self._feature_col]) > 0:
+            assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
+                f"The feature file has an unrecognized extension: {filename}\n"
+                f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
+            )

 def read_head(handle, num_lines=10):
     """Read the specified number of lines from the current position in the file."""
@@ -212,6 +215,7 @@ def check_samplesheet(file_in, file_out):
             except AssertionError as error:
                 logger.critical(f"{str(error)} On line {i + 2}.")
                 sys.exit(1)
+        ## TODO: Update `validate_and_transform()` to not parse protein/gff if not present in file
         header = list(reader.fieldnames)
         header.insert(1, "single_end")
     # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`.
diff --git a/conf/base.config b/conf/base.config
index 79c76f21..c517364b 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -96,7 +96,7 @@ process {
         cpus   = 1
     }

-    withName: BAKTA {
+    withName: BAKTA_BAKTA {
         memory = { check_max( 64.GB * task.attempt, 'memory' ) }
         cpus   = { check_max( 8 * task.attempt, 'cpus' ) }
         time   = { check_max( 8.h * task.attempt, 'time' ) }
From cf2b8b7524572f55dda1b684a9a3b978717f9180 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 10 May 2023 15:20:21 +0200
Subject: [PATCH 03/45] Continue work

---
 bin/check_samplesheet.py          |  4 ++--
 conf/modules.config               |  4 ++--
 subworkflows/local/input_check.nf | 16 ++++++++++------
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 00b6d70a..92f03b2a 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -132,14 +132,14 @@ def _validate_protein_format(self, row):

     def _validate_feature(self, row):
         """Assert that the feature file entry has the right format."""
-        assert len(row[self._contig_col]) > 0 and (
+        assert len(row[self._feature_col]) > 0 and (
             " " not in Path(row[self._feature_col]).name
         ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."


     def _validate_feature_format(self, row):
         """Assert that a given filename has one of the expected feature extensions."""
-        filename = Path(row[self._contig_col]).name
+        filename = Path(row[self._feature_col]).name
         if len(row[self._feature_col]) > 0:
             assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
                 f"The feature file has an unrecognized extension: {filename}\n"
diff --git a/conf/modules.config b/conf/modules.config
index a63789a2..e8ee1f4b 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -130,7 +130,7 @@ process {
             path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
-            pattern: "*.{faa,fna,gff}",
+            pattern: "*.{faa.gz,fna.gz,gff.gz}",
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
         ext.args = [
@@ -146,7 +146,7 @@ process {
             path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
             mode: params.publish_dir_mode,
             enabled: params.save_annotations,
-            pattern: "*.gbk",
+            pattern: "*.gbk.gz",
             saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
         ]
         ext.args = [
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index d97e91e7..0517ae93 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -9,11 +9,11 @@ workflow INPUT_CHECK {
     samplesheet // file: /path/to/samplesheet.csv

     main:
-    SAMPLESHEET_CHECK ( samplesheet )
-        .csv
-        .splitCsv ( header:true, sep:',' )
-        .map { create_input_channels(it) }
-        .set { contigs }
+    contigs = SAMPLESHEET_CHECK ( samplesheet )
+                .csv
+                .splitCsv ( header:true, sep:',' )
+                .map { create_input_channels(it) }
+                .dump(tag: "output")

     emit:
     contigs // channel: [ val(meta), [ fasta ] ]
@@ -29,7 +29,11 @@ def create_input_channels(LinkedHashMap row) {
     if (!file(row.fasta).exists()) {
         error("[funscan] error: please check input samplesheet. FASTA file does not exist for: \n${row.fasta}")
     } else {
-        array = [ meta, file(row.fasta) ]
+        array = [
+            meta,
+            file(row.fasta),
+            file(row.protein, checkIfExists: true),
+            file(row.feature, checkIfExists: true) ]
     }

     return array

From 80106d82c0f15df25270551fee90dd28a0eee6b2 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 24 May 2023 09:26:20 +0200
Subject: [PATCH 04/45] Apply suggestions from code review

Co-authored-by: Moritz E. Beber
---
 bin/check_samplesheet.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index 92f03b2a..db7ef64b 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -116,7 +116,7 @@ def _validate_fasta_format(self, row):

     def _validate_protein(self, row):
         """Assert that the amino acid FASTA entry has the right format."""
-        if len(row[self._protein_col]) > 0:
+        if self._protein_col in row and len(row[self._protein_col]) > 0:
             assert (
                 " " not in Path(row[self._protein_col]).name
             ), f"The protein FASTA filename may not contain any spaces '{row[self._protein_col]}'."
@@ -132,7 +132,8 @@ def _validate_protein_format(self, row):

     def _validate_feature(self, row):
         """Assert that the feature file entry has the right format."""
-        assert len(row[self._feature_col]) > 0 and (
+        if self._feature_col in row:
+            assert len(row[self._feature_col]) > 0 and (
             " " not in Path(row[self._feature_col]).name
         ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."
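Patches 02 to 04 converge on one defensive rule for the optional columns: validate only when the column exists in the header and the cell is non-empty. A minimal sketch of that guard, with hypothetical names:

```python
from pathlib import Path

def validate_optional_path(row: dict, column: str) -> None:
    """Validate an optional samplesheet cell only if its column exists and is filled."""
    if column in row and len(row[column]) > 0:
        assert " " not in Path(row[column]).name, (
            f"The {column} filename may not contain any spaces '{row[column]}'."
        )

validate_optional_path({"sample": "s1"}, "protein")                       # column absent: no-op
validate_optional_path({"sample": "s1", "protein": ""}, "protein")        # empty cell: no-op
validate_optional_path({"sample": "s1", "protein": "s1.faa"}, "protein")  # validated
```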
From 1b60b24e42f9f8a168472bc7d91a6226178d2cb4 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 24 May 2023 12:55:51 +0200
Subject: [PATCH 05/45] Get most of the log working, needs more testing
 (particularly non-FAA subworkflows)

---
 assets/multiqc_config.yml         |  10 +++
 bin/check_samplesheet.py          |  44 +++++-----
 conf/modules.config               |   2 +-
 docs/usage.md                     |  18 ++--
 subworkflows/local/amp.nf         |   6 +-
 subworkflows/local/annotation.nf  |  90 ++++++++++++++++++++
 subworkflows/local/input_check.nf |   6 +-
 workflows/funcscan.nf             | 131 ++++++++++++++----------------
 8 files changed, 196 insertions(+), 111 deletions(-)
 create mode 100644 subworkflows/local/annotation.nf

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 356df32d..d4465939 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -10,6 +10,16 @@ report_section_order:
   "nf-core-funcscan-summary":
     order: -1002

+run_modules:
+  - prokka
+  - custom_content
+
+prokka_fn_snames: True
+
+table_columns_visible:
+  Prokka:
+    organism: False
+
 export_plots: true

 custom_logo: "nf-core-funcscan_logo_flat_light.png"
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
index db7ef64b..88e38429 100755
--- a/bin/check_samplesheet.py
+++ b/bin/check_samplesheet.py
@@ -33,16 +33,9 @@ class RowChecker:
         ".fasta.gz",
     )

-    VALID_PROTEIN_FORMATS = (
-        ".faa",
-        ".fasta",
-        ".fa"
-    )
+    VALID_PROTEIN_FORMATS = (".faa", ".fasta", ".fa")

-    VALID_FEATURE_FORMATS = (
-        ".gbk",
-        ".gff"
-    )
+    VALID_FEATURE_FORMATS = (".gbk", ".gff")

     def __init__(
         self,
@@ -115,45 +108,46 @@ def _validate_fasta_format(self, row):
         )

     def _validate_protein(self, row):
-        """Assert that the amino acid FASTA entry has the right format."""
+        """Assert that the optional amino acid FASTA entry has the right format."""
         if self._protein_col in row and len(row[self._protein_col]) > 0:
             assert (
-            " " not in Path(row[self._protein_col]).name
+                " " not in Path(row[self._protein_col]).name
             ), f"The protein FASTA filename may not contain any spaces '{row[self._protein_col]}'."

     def _validate_protein_format(self, row):
-        """Assert that a given filename has one of the expected amino acid FASTA extensions."""
+        """Assert that a given filename has one of the expected (if supplied) amino acid FASTA extensions."""
         filename = Path(row[self._protein_col]).name
         if len(row[self._protein_col]) > 0:
             assert any(filename.endswith(extension) for extension in self.VALID_PROTEIN_FORMATS), (
                 f"The protein FASTA file has an unrecognized extension: {filename}\n"
                 f"It should be one of: {', '.join(self.VALID_PROTEIN_FORMATS)}"
             )

     def _validate_feature(self, row):
-        """Assert that the feature file entry has the right format."""
-        if self._feature_col in row:
-            assert len(row[self._feature_col]) > 0 and (
+        """Assert that the optional feature file entry has the right format."""
+        if self._feature_col in row and len(row[self._feature_col]) > 0:
+            assert (
-            " " not in Path(row[self._feature_col]).name
+                " " not in Path(row[self._feature_col]).name
-        ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."
+            ), f"The feature GBK/GFF filename may not contain any spaces '{row[self._feature_col]}'."

     def _validate_feature_format(self, row):
-        """Assert that a given filename has one of the expected feature extensions."""
-        filename = Path(row[self._feature_col]).name
-        if len(row[self._feature_col]) > 0:
-            assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
-                f"The feature file has an unrecognized extension: {filename}\n"
-                f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
-            )
+        """Assert that a given filename has one of the expected (if supplied) feature extensions."""
+        if self._feature_col in row:
+            filename = Path(row[self._feature_col]).name
+            if len(row[self._feature_col]) > 0:
+                assert any(filename.endswith(extension) for extension in self.VALID_FEATURE_FORMATS), (
+                    f"The feature file has an unrecognized extension: {filename}\n"
+                    f"It should be one of: {', '.join(self.VALID_FEATURE_FORMATS)}"
+                )
+

 def read_head(handle, num_lines=10):
     """Read the specified number of lines from the current position in the file."""
diff --git a/conf/modules.config b/conf/modules.config
index e8ee1f4b..a51b6e20 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -80,7 +80,7 @@ process {
             params.annotation_prokka_rawproduct ? '--rawproduct' : '',
             params.annotation_prokka_rnammer ? '--rnammer' : '',
             params.annotation_prokka_compliant ? '--compliant' : '',
-            params.annotation_prokka_addgenes ? '--addgenes' : ''
+            params.annotation_prokka_addgenes ? '--addgenes' : '',
         ].join(' ').trim()
     }
diff --git a/docs/usage.md b/docs/usage.md
index 6df046bf..e74488c7 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -54,18 +54,22 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s
 The input samplesheet has to be a comma-separated file (`.csv`) with 2 columns (`sample`, and `fasta`), and a header row as shown in the examples below.

 ```bash
-sample,fasta
-sample_1,///wastewater_metagenome_contigs_1.fasta.gz
-sample_2,///wastewater_metagenome_contigs_2.fasta.gz
+sample,fasta,protein,feature
+sample_1,///wastewater_metagenome_contigs_1.fasta.gz,,
+sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,
 ```

-| Column   | Description                                                                                                                                                 |
-| -------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------|
-| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fasta`  | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.     |
+| Column    | Description                                                                                                                                                 |
+| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------|
+| `sample`  | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fasta`   | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.     |
+| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`. Leave empty if not available.                    |
+| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotation information of `fasta`. Leave empty if not available.             |

 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.

+If you already have annotated contigs, you can supply these to the pipeline using optional `protein` and `feature` columns. If either of the two columns is supplied, pipeline annotation will not be performed for the corresponding FASTA file.
+
 > ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters.

 ## Notes on screening tools
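To make the optional columns concrete: a short, hypothetical reader for this samplesheet layout that lists which rows still need de-novo annotation (the pipeline itself makes this decision with Nextflow channels, not Python):

```python
import csv

def rows_needing_annotation(samplesheet: str) -> list:
    """List samples whose rows carry neither a 'protein' nor a 'feature' file."""
    with open(samplesheet, newline="") as handle:
        reader = csv.DictReader(handle)
        return [
            row["sample"]
            for row in reader
            if not row.get("protein") and not row.get("feature")
        ]
```

Using `row.get(...)` keeps the reader tolerant of two-column samplesheets that omit the optional headers entirely, mirroring the `in row` guards added to `RowChecker`.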
diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf
index 6add2e7f..d0caa794 100644
--- a/subworkflows/local/amp.nf
+++ b/subworkflows/local/amp.nf
@@ -27,7 +27,7 @@ workflow AMP {
     // to ensure annotation is executed!
     ch_faa_for_amplify       = faa
     ch_faa_for_amp_hmmsearch = faa
-    ch_faa_for_ampir         = faa
+    ch_faa_for_ampir         = faa.dump(tag: "amp_faa")
     ch_faa_for_ampcombi      = faa

     // AMPLIFY
@@ -109,10 +109,10 @@ workflow AMP {
             input: [ it[0] ]
             summary: it[1]
         }
-
+
     ch_tabix_input = Channel.of(['id':'ampcombi_complete_summary'])
         .combine(ch_ampcombi_summaries_out.summary.collectFile(name: 'ampcombi_complete_summary.csv', keepHeader:true))
-
+
     TABIX_BGZIP(ch_tabix_input)

     emit:
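The `collectFile(keepHeader:true)` plus `TABIX_BGZIP` combination above concatenates per-sample AMPcombi summaries into one compressed table. A rough Python equivalent of the same merge, using plain `gzip` where the pipeline uses bgzip:

```python
import gzip
from pathlib import Path

def merge_summaries(csv_paths: list, out_path: str) -> None:
    """Concatenate per-sample CSVs, keeping only the first file's header,
    then gzip-compress the combined table."""
    lines = []
    for i, path in enumerate(csv_paths):
        header, *rows = Path(path).read_text().splitlines()
        if i == 0:
            lines.append(header)
        lines.extend(rows)
    with gzip.open(out_path, "wt") as handle:
        handle.write("\n".join(lines) + "\n")
```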
+ } + + } else if ( params.annotation_tool == "pyrodigal" ) { + + PYRODIGAL ( fasta ) + ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + ch_annotation_faa = PYRODIGAL.out.faa + ch_annotation_fna = PYRODIGAL.out.fna + ch_annotation_gff = PYRODIGAL.out.gff + ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK + + } else if ( params.annotation_tool == "prokka" ) { + + PROKKA ( fasta, [], [] ) + ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_annotation_faa = PROKKA.out.faa + ch_annotation_fna = PROKKA.out.fna + ch_annotation_gff = PROKKA.out.gff + ch_annotation_gbk = PROKKA.out.gbk + ch_multiqc_files = PROKKA.out.txt + + } else if ( params.annotation_tool == "bakta" ) { + + // BAKTA prepare download + if ( params.annotation_bakta_db_localpath ) { + ch_bakta_db = Channel + .fromPath( params.annotation_bakta_db_localpath ) + .first() + } else { + BAKTA_BAKTADBDOWNLOAD ( ) + ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions ) + ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db ) + } + + BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) + ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) + ch_annotation_faa = BAKTA_BAKTA.out.faa + ch_annotation_fna = BAKTA_BAKTA.out.fna + ch_annotation_gff = BAKTA_BAKTA.out.gff + ch_annotation_gbk = BAKTA_BAKTA.out.gbff + + } + + emit: + versions = ch_versions + multiqc_files = ch_multiqc_files + faa = ch_annotation_faa // [ [meta], path(faa) ] + fna = ch_annotation_fna // [ [meta], path(fna) ] + gff = ch_annotation_gff // [ [meta], path(gff) ] + gbk = ch_annotation_gbk // [ [meta], path(gbk) ] +} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 0517ae93..1aaee7ed 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -13,7 +13,6 @@ workflow INPUT_CHECK { .csv .splitCsv ( header:true, sep:',' ) .map { create_input_channels(it) } - .dump(tag: "output") emit: contigs // channel: [ val(meta), [ fasta ] ] @@ -32,8 +31,9 @@ def create_input_channels(LinkedHashMap row) { array = [ meta, file(row.fasta), - file(row.protein, checkIfExists: true), - file(row.feature, checkIfExists: true) ] + row.protein ? file(row.protein, checkIfExists: true) : null, + row.feature ? file(row.feature, checkIfExists: true) : null + ] } return array diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index fa6a2164..6c26b3e7 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -73,9 +73,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf
index 0517ae93..1aaee7ed 100644
--- a/subworkflows/local/input_check.nf
+++ b/subworkflows/local/input_check.nf
@@ -13,7 +13,6 @@ workflow INPUT_CHECK {
                 .csv
                 .splitCsv ( header:true, sep:',' )
                 .map { create_input_channels(it) }
-                .dump(tag: "output")

     emit:
     contigs // channel: [ val(meta), [ fasta ] ]
@@ -32,8 +31,9 @@ def create_input_channels(LinkedHashMap row) {
         array = [
             meta,
             file(row.fasta),
-            file(row.protein, checkIfExists: true),
-            file(row.feature, checkIfExists: true) ]
+            row.protein ? file(row.protein, checkIfExists: true) : null,
+            row.feature ? file(row.feature, checkIfExists: true) : null
+            ]
     }

     return array
diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf
index fa6a2164..6c26b3e7 100644
--- a/workflows/funcscan.nf
+++ b/workflows/funcscan.nf
@@ -73,9 +73,10 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil
 //
 include { INPUT_CHECK } from '../subworkflows/local/input_check'

-include { AMP } from '../subworkflows/local/amp'
-include { ARG } from '../subworkflows/local/arg'
-include { BGC } from '../subworkflows/local/bgc'
+include { ANNOTATION } from '../subworkflows/local/annotation'
+include { AMP        } from '../subworkflows/local/amp'
+include { ARG        } from '../subworkflows/local/arg'
+include { BGC        } from '../subworkflows/local/bgc'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -88,18 +89,8 @@ include { BGC } from '../subworkflows/local/bgc'
 //
 include { MULTIQC                     } from '../modules/nf-core/multiqc/main'
 include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'
-include { GUNZIP as GUNZIP_FASTA_PREP } from '../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_FNA        } from '../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_FAA        } from '../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GFF        } from '../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GBK        } from '../modules/nf-core/gunzip/main'
 include { BIOAWK                      } from '../modules/nf-core/bioawk/main'
-include { PROKKA                      } from '../modules/nf-core/prokka/main'
-include { PRODIGAL as PRODIGAL_GFF    } from '../modules/nf-core/prodigal/main'
-include { PRODIGAL as PRODIGAL_GBK    } from '../modules/nf-core/prodigal/main'
-include { PYRODIGAL                   } from '../modules/nf-core/pyrodigal/main'
-include { BAKTA_BAKTADBDOWNLOAD       } from '../modules/nf-core/bakta/baktadbdownload/main'
-include { BAKTA_BAKTA                 } from '../modules/nf-core/bakta/bakta/main'
+include { GUNZIP as GUNZIP_FASTA_PREP } from '../modules/nf-core/gunzip/main'

 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -124,19 +115,28 @@ workflow FUNCSCAN {
     ch_versions = ch_versions.mix(INPUT_CHECK.out.versions)

     // Some tools require uncompressed input
-    fasta_prep = INPUT_CHECK.out.contigs
+    ch_fasta_prep = INPUT_CHECK.out.contigs
+        .map{
+            meta, fasta, protein, feature ->
+            [meta, fasta]
+        }
         .branch {
             compressed: it[1].toString().endsWith('.gz')
             uncompressed: it[1]
         }

+    ch_preannotated_files = INPUT_CHECK.out.contigs.map{
+        meta, fasta, protein, feature ->
+        [meta, protein, feature]
+    }
+
-    GUNZIP_FASTA_PREP ( fasta_prep.compressed )
+    GUNZIP_FASTA_PREP ( ch_fasta_prep.compressed )
     ch_versions = ch_versions.mix(GUNZIP_FASTA_PREP.out.versions)

     // Merge all the already uncompressed and newly compressed FASTAs here into
     // a single input channel for downstream
     ch_prepped_fastas = GUNZIP_FASTA_PREP.out.gunzip
-        .mix(fasta_prep.uncompressed)
+        .mix(ch_fasta_prep.uncompressed)

     // Add to meta the length of longest contig for downstream filtering
     BIOAWK ( ch_prepped_fastas )
@@ -150,76 +150,52 @@ workflow FUNCSCAN {
             meta['longest_contig'] = Integer.parseInt(length)
             [ meta, fasta ]
         }
+        //.dump(tag: "prepped_input")
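BIOAWK is used here only to stamp each sample's meta map with its longest contig length for downstream filtering. A rough, dependency-free Python equivalent of that measurement (illustrative, not the module itself):

```python
def longest_contig_length(fasta_path: str) -> int:
    """Return the length of the longest sequence in an uncompressed FASTA file."""
    longest = current = 0
    with open(fasta_path) as handle:
        for line in handle:
            if line.startswith(">"):
                longest = max(longest, current)
                current = 0
            else:
                current += len(line.strip())
    return max(longest, current)
```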

     /*
         ANNOTATION
     */
+    // Join back prepped fastas with any other additional files (protein, feature)
+    // Then we make specific channels for each context
+    ch_input_for_annotation = ch_prepped_fastas
+        .join(ch_preannotated_files)
+        .dump(tag: "joined")
+        .branch {
+            meta, fasta, protein, feature ->
+                annotated_protein: protein != null
+                annotated_feature: feature != null
+                unannotated: true
+        }

     // Some tools require annotated FASTAs
     // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both)
     if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) {

-        if ( params.annotation_tool == "prodigal" ) {
-            PRODIGAL_GFF ( ch_prepped_input, "gff" )
-            GUNZIP_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
-            GUNZIP_FNA ( PRODIGAL_GFF.out.nucleotide_fasta)
-            GUNZIP_GFF ( PRODIGAL_GFF.out.gene_annotations )
-            ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions)
-            ch_annotation_faa = GUNZIP_FAA.out.gunzip
-            ch_annotation_fna = GUNZIP_FNA.out.gunzip
-            ch_annotation_gff = GUNZIP_GFF.out.gunzip
-            ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive
-
-            if ( params.save_annotations == true ) {
-                PRODIGAL_GBK ( ch_prepped_input, "gbk" )
-                GUNZIP_GBK ( PRODIGAL_GBK.out.gene_annotations)
-                ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions)
-                ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow.
-            }
-        } else if ( params.annotation_tool == "pyrodigal" ) {
-            PYRODIGAL ( ch_prepped_input )
-            ch_versions = ch_versions.mix(PYRODIGAL.out.versions)
-            ch_annotation_faa = PYRODIGAL.out.faa
-            ch_annotation_fna = PYRODIGAL.out.fna
-            ch_annotation_gff = PYRODIGAL.out.gff
-            ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK
-        } else if ( params.annotation_tool == "prokka" ) {
-            PROKKA ( ch_prepped_input, [], [] )
-            ch_versions = ch_versions.mix(PROKKA.out.versions)
-            ch_annotation_faa = PROKKA.out.faa
-            ch_annotation_fna = PROKKA.out.fna
-            ch_annotation_gff = PROKKA.out.gff
-            ch_annotation_gbk = PROKKA.out.gbk
-        } else if ( params.annotation_tool == "bakta" ) {
-
-            // BAKTA prepare download
-            if ( params.annotation_bakta_db_localpath ) {
-                ch_bakta_db = Channel
-                    .fromPath( params.annotation_bakta_db_localpath )
-                    .first()
-            } else {
-                BAKTA_BAKTADBDOWNLOAD ( )
-                ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions )
-                ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db )
-            }
-
-            BAKTA_BAKTA ( ch_prepped_input, ch_bakta_db, [], [] )
-            ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions)
-            ch_annotation_faa = BAKTA_BAKTA.out.faa
-            ch_annotation_fna = BAKTA_BAKTA.out.fna
-            ch_annotation_gff = BAKTA_BAKTA.out.gff
-            ch_annotation_gbk = BAKTA_BAKTA.out.gbff
-        }
+        ANNOTATION( ch_input_for_annotation.unannotated.map{ meta, fasta, protein, feature -> [meta, fasta] }.dump(tag: "unannotated") )
+
+        ch_new_annotation_faa = ANNOTATION.out.faa.dump(tag: "faa")
+        ch_new_annotation_fna = ANNOTATION.out.fna
+        ch_new_annotation_gff = ANNOTATION.out.gff
+        ch_new_annotation_gbk = ANNOTATION.out.gbk

     } else {

-        ch_annotation_faa = Channel.empty()
-        ch_annotation_fna = Channel.empty()
-        ch_annotation_gff = Channel.empty()
-        ch_annotation_gbk = Channel.empty()
+        ch_new_annotation_faa = Channel.empty()
+        ch_new_annotation_fna = Channel.empty()
+        ch_new_annotation_gff = Channel.empty()
+        ch_new_annotation_gbk = Channel.empty()

     }

+    // Join back the pre-annotated FASTAs with newly annotated FASTAs
+    ch_annotation_proteins = ch_input_for_annotation.annotated_protein.map{ meta, fasta, protein, feature -> [meta, protein] }
+    ch_annotation_faa      = ch_new_annotation_faa.mix(ch_annotation_proteins)
+
+    ch_annotation_features = ch_input_for_annotation.annotated_feature.map{ meta, fasta, protein, feature -> [meta, feature] }
+    ch_annotation_gff      = ch_annotation_features.filter { meta, feature -> feature.toString().endsWith('.gff') }.mix(ch_new_annotation_gff)
+    ch_annotation_gbk      = ch_annotation_features.filter { meta, feature -> feature.toString().endsWith('.gbk') }.mix(ch_new_annotation_gbk)
+
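The branch above routes each sample by whether pre-computed annotation files were supplied; only the unannotated branch reaches ANNOTATION, and the pre-annotated files are mixed back in afterwards. A toy model of the routing decision:

```python
def route_annotation(rows: list) -> dict:
    """Toy model of the channel branching above: samples with a pre-supplied
    protein/feature file skip annotation; the rest are annotated fresh."""
    routed = {"unannotated": [], "pre_annotated": []}
    for row in rows:
        key = "pre_annotated" if row.get("protein") or row.get("feature") else "unannotated"
        routed[key].append(row["sample"])
    return routed

rows = [
    {"sample": "s1"},
    {"sample": "s2", "protein": "s2.faa", "feature": "s2.gbk"},
]
print(route_annotation(rows))  # {'unannotated': ['s1'], 'pre_annotated': ['s2']}
```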

     /*
         SCREENING
     */
@@ -266,10 +242,21 @@ workflow FUNCSCAN {
     ch_methods_description = Channel.value(methods_description)

     ch_multiqc_files = Channel.empty()
+
+
+    if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) {
+
+        if ( params.annotation_tool == 'prokka' ) {
+            ch_multiqc_files = ch_multiqc_files.mix(ANNOTATION.out.multiqc_files.map{it[1]})
+        }
+
+    }
+
     ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml'))
     ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect())
+
     MULTIQC (
         ch_multiqc_files.collect(),
         ch_multiqc_config.toList(),
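The long boolean guarding both the ANNOTATION call and the Prokka MultiQC files appears twice in this patch. Written once as a predicate (parameter names copied from the condition, everything else hypothetical):

```python
def annotation_needed(params: dict) -> bool:
    """Mirror of the gating condition: annotate only if an enabled screening
    subworkflow contains at least one tool that consumes annotations."""
    return (
        (params["run_arg_screening"] and not params["arg_skip_deeparg"])
        or (params["run_amp_screening"] and not (
            params["amp_skip_hmmsearch"] and params["amp_skip_amplify"] and params["amp_skip_ampir"]))
        or (params["run_bgc_screening"] and not (
            params["bgc_skip_hmmsearch"] and params["bgc_skip_antismash"]))
    )
```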
From 43ee6155fa9679cc607962542a092aa294bc7975 Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 31 May 2023 14:56:36 +0200
Subject: [PATCH 06/45] Sync latest dev changes from annotation into workflow

---
 subworkflows/local/annotation.nf | 60 ++++++++++++++++++++------------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf
index 4449b233..5ea008cf 100644
--- a/subworkflows/local/annotation.nf
+++ b/subworkflows/local/annotation.nf
@@ -2,16 +2,19 @@
     Run annotation tools
 */

-include { PROKKA } from '../../modules/nf-core/prokka/main'
-include { PRODIGAL as PRODIGAL_GFF } from '../../modules/nf-core/prodigal/main'
-include { PRODIGAL as PRODIGAL_GBK } from '../../modules/nf-core/prodigal/main'
-include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main'
-include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main'
-include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main'
-include { GUNZIP as GUNZIP_FNA } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_FAA } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GFF } from '../../modules/nf-core/gunzip/main'
-include { GUNZIP as GUNZIP_GBK } from '../../modules/nf-core/gunzip/main'
+include { PROKKA                         } from '../../modules/nf-core/prokka/main'
+include { PRODIGAL as PRODIGAL_GFF       } from '../../modules/nf-core/prodigal/main'
+include { PRODIGAL as PRODIGAL_GBK       } from '../../modules/nf-core/prodigal/main'
+include { PYRODIGAL                      } from '../../modules/nf-core/pyrodigal/main'
+include { BAKTA_BAKTADBDOWNLOAD          } from '../../modules/nf-core/bakta/baktadbdownload/main'
+include { BAKTA_BAKTA                    } from '../../modules/nf-core/bakta/bakta/main'
+include { GUNZIP as GUNZIP_PRODIGAL_FNA  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_FAA  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_GFF  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PRODIGAL_GBK  } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main'
+include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main'

 workflow ANNOTATION {
     take:
@@ -21,16 +24,21 @@ workflow ANNOTATION {
     fasta // tuple val(meta), path(contigs)

     main:
     ch_versions      = Channel.empty()
     ch_multiqc_files = Channel.empty()

+
+
     if ( params.annotation_tool == "prodigal" ) {
         PRODIGAL_GFF ( fasta, "gff" )
-        GUNZIP_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
-        GUNZIP_FNA ( PRODIGAL_GFF.out.nucleotide_fasta)
-        GUNZIP_GFF ( PRODIGAL_GFF.out.gene_annotations )
-        ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions)
-        ch_annotation_faa = GUNZIP_FAA.out.gunzip
-        ch_annotation_fna = GUNZIP_FNA.out.gunzip
-        ch_annotation_gff = GUNZIP_GFF.out.gunzip
-        ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive
+        GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta )
+        GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta)
+        GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations )
+        ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GFF.out.versions)
+        ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip
+        ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip
+        ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip
+        ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive

         if ( params.save_annotations == true ) {
             PRODIGAL_GBK ( fasta, "gbk" )
@@ -42,11 +50,17 @@ workflow ANNOTATION {
     } else if ( params.annotation_tool == "pyrodigal" ) {

         PYRODIGAL ( fasta )
-        ch_versions = ch_versions.mix(PYRODIGAL.out.versions)
-        ch_annotation_faa = PYRODIGAL.out.faa
-        ch_annotation_fna = PYRODIGAL.out.fna
-        ch_annotation_gff = PYRODIGAL.out.gff
-        ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK
+        GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa )
+        GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna)
+        GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff )
+        ch_versions = ch_versions.mix(PYRODIGAL.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions)
+        ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GFF.out.versions)
+        ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip
+        ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip
+        ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip
+        ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK

     } else if ( params.annotation_tool == "prokka" ) {

From 183b81a7f069df0ff3eb79534acaa36f549078ed Mon Sep 17 00:00:00 2001
From: James Fellows Yates
Date: Wed, 31 May 2023 15:15:06 +0200
Subject: [PATCH 07/45] Update all modules to get right container version and
 also pyRodigal with gzip support

---
 modules.json                                  | 70 +++++++++----------
 modules/nf-core/abricate/run/main.nf          |  2 +-
 modules/nf-core/ampcombi/main.nf              |  2 +-
 modules/nf-core/ampir/main.nf                 |  5 +-
 modules/nf-core/ampir/meta.yml                |  8 +--
 modules/nf-core/amplify/predict/main.nf       |  2 +-
 modules/nf-core/amrfinderplus/run/main.nf     |  2 +-
 modules/nf-core/amrfinderplus/update/main.nf  |  2 +-
 .../nf-core/antismash/antismashlite/main.nf   |  2 +-
 .../antismashlitedownloaddatabases/main.nf    |  2 +-
 modules/nf-core/bakta/bakta/main.nf           |  2 +-
 modules/nf-core/bakta/baktadbdownload/main.nf |  2 +-
 modules/nf-core/bioawk/main.nf                |  2 +-
 .../custom/dumpsoftwareversions/main.nf       |  2 +-
 .../custom/dumpsoftwareversions/meta.yml      |  2 +
 modules/nf-core/deeparg/downloaddata/main.nf  |  2 +-
 modules/nf-core/deeparg/predict/main.nf       |  2 +-
 modules/nf-core/deepbgc/download/main.nf      |  2 +-
 modules/nf-core/deepbgc/pipeline/main.nf      |  2 +-
 modules/nf-core/fargene/main.nf               |  2 +-
 modules/nf-core/fastqc/main.nf                |  2 +-
 modules/nf-core/gecco/run/main.nf             |  2 +-
 modules/nf-core/gunzip/main.nf                |  2 +-
 modules/nf-core/gunzip/meta.yml               |  1 +
 .../nf-core/hamronization/abricate/main.nf    |  2 +-
 .../hamronization/amrfinderplus/main.nf       |  2 +-
 modules/nf-core/hamronization/deeparg/main.nf |  2 +-
 modules/nf-core/hamronization/fargene/main.nf |  2 +-
 modules/nf-core/hamronization/rgi/main.nf     |  2 +-
 .../nf-core/hamronization/summarize/main.nf   |  2 +-
 modules/nf-core/hmmer/hmmsearch/main.nf       |  2 +-
 modules/nf-core/macrel/contigs/main.nf        |  2 +-
 modules/nf-core/multiqc/main.nf               |  2 +-
 modules/nf-core/multiqc/meta.yml              |  3 +-
 modules/nf-core/prodigal/main.nf              |  2 +-
 modules/nf-core/prodigal/meta.yml             |  4 +-
 modules/nf-core/prokka/main.nf                |  2 +-
 modules/nf-core/pyrodigal/main.nf             | 18 +++--
 modules/nf-core/pyrodigal/meta.yml            | 10 +--
 modules/nf-core/rgi/main/main.nf              |  2 +-
 modules/nf-core/tabix/bgzip/main.nf           |  2 +-
 modules/nf-core/untar/main.nf                 |  2 +-
 modules/nf-core/untar/meta.yml                |  1 +
 subworkflows/local/amp.nf                     |  2 +-
 subworkflows/local/annotation.nf              |  4 +-
 workflows/funcscan.nf                         |  7 +-
 46 files changed, 103 insertions(+), 98 deletions(-)

diff --git a/modules.json b/modules.json
index cfc399b5..48f89af0 100644
--- a/modules.json
+++ b/modules.json
@@ -7,178 +7,178 @@
         "nf-core": {
             "abricate/run": {
                 "branch": "master",
-                "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "ampcombi": {
                 "branch": "master",
-                "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "ampir": {
                 "branch": "master",
-                "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                "git_sha": "6ac776f62fad7360685a87680c5f57f74c3682dc",
                 "installed_by": ["modules"]
             },
             "amplify/predict": {
                 "branch": "master",
-                "git_sha": "5293fc55d4d645cf9daffad835bee270d328ce91",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "amrfinderplus/run": {
                 "branch": "master",
-                "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "amrfinderplus/update": {
                 "branch": "master",
-                "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "antismash/antismashlite": {
                 "branch": "master",
-                "git_sha": "3a8d2761600b789cbde3a7780a211a8328604af0",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "antismash/antismashlitedownloaddatabases": {
                 "branch": "master",
-                "git_sha": "ac07fba2543c5c9b6da9306c2d3ee58bdafb5262",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "bakta/bakta": {
                 "branch": "master",
-                "git_sha": "280c5c86b3da7dfcc92ebd5420584dd6ff26c4a8",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "bakta/baktadbdownload": {
                 "branch": "master",
-                "git_sha": "280c5c86b3da7dfcc92ebd5420584dd6ff26c4a8",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
             "bioawk": {
                 "branch": "master",
-                "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"],
                 "patch": "modules/nf-core/bioawk/bioawk.diff"
             },
             "custom/dumpsoftwareversions": {
                 "branch": "master",
-                "git_sha": "b6d4d476aee074311c89d82a69c1921bd70c8180",
+                "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
                 "installed_by": ["modules"]
             },
["modules"] }, "deeparg/downloaddata": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "deeparg/predict": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "deepbgc/download": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "deepbgc/pipeline": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "fargene": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "fastqc": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gecco/run": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "gunzip": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", "installed_by": ["modules"] }, "hamronization/abricate": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hamronization/amrfinderplus": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hamronization/deeparg": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hamronization/fargene": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hamronization/rgi": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hamronization/summarize": { "branch": "master", - "git_sha": "0f8a77ff00e65eaeebc509b8156eaa983192474b", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "hmmer/hmmsearch": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "macrel/contigs": { "branch": "master", - "git_sha": "a5a5bb12e7cc25658fdc97d810f2d6688cdae169", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "multiqc": { "branch": "master", - "git_sha": "ee80d14721e76e2e079103b8dcd5d57129e584ba", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "prodigal": { "branch": "master", - "git_sha": "8c4f2d2ceeb5e9cc4a49a91c0e9a675b6724c043", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", "installed_by": ["modules"] }, "prokka": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "pyrodigal": { 
"branch": "master", - "git_sha": "93cca9af587f39eaaa357b9e589e3e657d8a0f75", + "git_sha": "1b91efd7ff7b2fb5fe0d78f0d2f6c728afc5e552", "installed_by": ["modules"] }, "rgi/main": { "branch": "master", - "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "tabix/bgzip": { "branch": "master", - "git_sha": "90294980a903ecebd99ac31d8b6c66af48fa8259", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", "installed_by": ["modules"] }, "untar": { "branch": "master", - "git_sha": "cc1f997fab6d8fde5dc0e6e2a310814df5b53ce7", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", "installed_by": ["modules"] } } diff --git a/modules/nf-core/abricate/run/main.nf b/modules/nf-core/abricate/run/main.nf index 05dac446..87ab0df9 100644 --- a/modules/nf-core/abricate/run/main.nf +++ b/modules/nf-core/abricate/run/main.nf @@ -5,7 +5,7 @@ process ABRICATE_RUN { conda "bioconda::abricate=1.0.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/abricate%3A1.0.1--ha8f3691_1': - 'quay.io/biocontainers/abricate:1.0.1--ha8f3691_1' }" + 'biocontainers/abricate:1.0.1--ha8f3691_1' }" input: tuple val(meta), path(assembly) diff --git a/modules/nf-core/ampcombi/main.nf b/modules/nf-core/ampcombi/main.nf index bb9b79f1..9cad25cb 100644 --- a/modules/nf-core/ampcombi/main.nf +++ b/modules/nf-core/ampcombi/main.nf @@ -5,7 +5,7 @@ process AMPCOMBI { conda "bioconda::ampcombi=0.1.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ampcombi:0.1.7--pyhdfd78af_0': - 'quay.io/biocontainers/ampcombi:0.1.7--pyhdfd78af_0' }" + 'biocontainers/ampcombi:0.1.7--pyhdfd78af_0' }" input: tuple val(meta), path(amp_input) diff --git a/modules/nf-core/ampir/main.nf b/modules/nf-core/ampir/main.nf index 19b63f66..50ef1e2e 100644 --- a/modules/nf-core/ampir/main.nf +++ b/modules/nf-core/ampir/main.nf @@ -5,7 +5,7 @@ process AMPIR { conda "conda-forge::r-ampir=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/r-ampir:1.1.0': - 'quay.io/biocontainers/r-ampir:1.1.0' }" + 'biocontainers/r-ampir:1.1.0' }" input: tuple val(meta), path(faa) @@ -24,14 +24,13 @@ process AMPIR { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - min_length = ("${min_length}" == "[]") ? "": " min_len = as.integer(${min_length})," // Fall back to AMPir default value if none specified if ("$faa" == "${prefix}.faa") error "Input and output names are the same, set prefix in module configuration to disambiguate!" 
""" #!/usr/bin/env Rscript library(ampir) input_seqs <- read_faa('${faa}') - prediction <- predict_amps(input_seqs,${min_length} model = '${model}') + prediction <- predict_amps(input_seqs,${min_length},model = '${model}') prediction <- prediction[which(prediction\$prob_AMP >= as.numeric(${min_probability})), ] output_seqs <- input_seqs[row.names(prediction), ] write.table(prediction, file = "${prefix}.tsv", row.names = FALSE, sep = "\t", quote = FALSE, dec = '.') diff --git a/modules/nf-core/ampir/meta.yml b/modules/nf-core/ampir/meta.yml index 7569ca69..9e854448 100644 --- a/modules/nf-core/ampir/meta.yml +++ b/modules/nf-core/ampir/meta.yml @@ -24,17 +24,17 @@ input: description: FASTA file containing amino acid sequences pattern: "*.{faa,fasta}" - model: - type: value + type: string description: Built-in model for AMP prediction pattern: "{precursor,mature}" - min_length: - type: value + type: integer description: Minimum protein length for which predictions will be generated pattern: "[0-9]+" - min_probability: - type: value + type: number description: Cut-off for AMP prediction - pattern: "[0-9][0-9]" + pattern: "[0-9].[0-9]+" output: - meta: diff --git a/modules/nf-core/amplify/predict/main.nf b/modules/nf-core/amplify/predict/main.nf index d47a402d..be5863f1 100644 --- a/modules/nf-core/amplify/predict/main.nf +++ b/modules/nf-core/amplify/predict/main.nf @@ -6,7 +6,7 @@ process AMPLIFY_PREDICT { conda "bioconda::amplify=1.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/amplify:1.1.0--hdfd78af_0': - 'quay.io/biocontainers/amplify:1.1.0--hdfd78af_0' }" + 'biocontainers/amplify:1.1.0--hdfd78af_0' }" input: tuple val(meta), path(faa) diff --git a/modules/nf-core/amrfinderplus/run/main.nf b/modules/nf-core/amrfinderplus/run/main.nf index 6234cbbb..8077bb48 100644 --- a/modules/nf-core/amrfinderplus/run/main.nf +++ b/modules/nf-core/amrfinderplus/run/main.nf @@ -5,7 +5,7 @@ process AMRFINDERPLUS_RUN { conda "bioconda::ncbi-amrfinderplus=3.10.42" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ncbi-amrfinderplus:3.10.42--h6e70893_0': - 'quay.io/biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }" + 'biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/amrfinderplus/update/main.nf b/modules/nf-core/amrfinderplus/update/main.nf index 0fdd61e5..a043a06a 100644 --- a/modules/nf-core/amrfinderplus/update/main.nf +++ b/modules/nf-core/amrfinderplus/update/main.nf @@ -5,7 +5,7 @@ process AMRFINDERPLUS_UPDATE { conda "bioconda::ncbi-amrfinderplus=3.10.42" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
diff --git a/modules/nf-core/amrfinderplus/run/main.nf b/modules/nf-core/amrfinderplus/run/main.nf
index 6234cbbb..8077bb48 100644
--- a/modules/nf-core/amrfinderplus/run/main.nf
+++ b/modules/nf-core/amrfinderplus/run/main.nf
@@ -5,7 +5,7 @@ process AMRFINDERPLUS_RUN {
     conda "bioconda::ncbi-amrfinderplus=3.10.42"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/ncbi-amrfinderplus:3.10.42--h6e70893_0':
-        'quay.io/biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }"
+        'biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }"

     input:
     tuple val(meta), path(fasta)
diff --git a/modules/nf-core/amrfinderplus/update/main.nf b/modules/nf-core/amrfinderplus/update/main.nf
index 0fdd61e5..a043a06a 100644
--- a/modules/nf-core/amrfinderplus/update/main.nf
+++ b/modules/nf-core/amrfinderplus/update/main.nf
@@ -5,7 +5,7 @@ process AMRFINDERPLUS_UPDATE {
     conda "bioconda::ncbi-amrfinderplus=3.10.42"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/ncbi-amrfinderplus:3.10.42--h6e70893_0':
-        'quay.io/biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }"
+        'biocontainers/ncbi-amrfinderplus:3.10.42--h6e70893_0' }"

     output:
     path "amrfinderdb.tar.gz", emit: db
diff --git a/modules/nf-core/antismash/antismashlite/main.nf b/modules/nf-core/antismash/antismashlite/main.nf
index 6a714185..1b551e6e 100644
--- a/modules/nf-core/antismash/antismashlite/main.nf
+++ b/modules/nf-core/antismash/antismashlite/main.nf
@@ -5,7 +5,7 @@ process ANTISMASH_ANTISMASHLITE {
     conda "bioconda::antismash-lite=6.1.1"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/antismash-lite:6.1.1--pyhdfd78af_0' :
-        'quay.io/biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }"
+        'biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }"

     containerOptions {
         workflow.containerEngine == 'singularity' ?
diff --git a/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf b/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf
index 029e9cf6..817db0c2 100644
--- a/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf
+++ b/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf
@@ -4,7 +4,7 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES {
     conda "bioconda::antismash-lite=6.1.1"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/antismash-lite:6.1.1--pyhdfd78af_0' :
-        'quay.io/biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }"
+        'biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }"

     /*
     These files are normally downloaded/created by download-antismash-databases itself, and must be retrieved for input by manually running the command with conda or a standalone installation of antiSMASH. Therefore we do not recommend using this module for production pipelines, but rather require users to specify their own local copy of the antiSMASH database in pipelines. This is solely for use for CI tests of the nf-core/module version of antiSMASH.
diff --git a/modules/nf-core/bakta/bakta/main.nf b/modules/nf-core/bakta/bakta/main.nf
index 5fc10cb1..6399f5d0 100644
--- a/modules/nf-core/bakta/bakta/main.nf
+++ b/modules/nf-core/bakta/bakta/main.nf
@@ -5,7 +5,7 @@ process BAKTA_BAKTA {
     conda "bioconda::bakta=1.7.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bakta:1.7.0--pyhdfd78af_1' :
-        'quay.io/biocontainers/bakta:1.7.0--pyhdfd78af_1' }"
+        'biocontainers/bakta:1.7.0--pyhdfd78af_1' }"

     input:
     tuple val(meta), path(fasta)
diff --git a/modules/nf-core/bakta/baktadbdownload/main.nf b/modules/nf-core/bakta/baktadbdownload/main.nf
index f5e6d111..e62a2bed 100644
--- a/modules/nf-core/bakta/baktadbdownload/main.nf
+++ b/modules/nf-core/bakta/baktadbdownload/main.nf
@@ -4,7 +4,7 @@ process BAKTA_BAKTADBDOWNLOAD {
     conda "bioconda::bakta=1.7.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bakta:1.7.0--pyhdfd78af_1' :
-        'quay.io/biocontainers/bakta:1.7.0--pyhdfd78af_1' }"
+        'biocontainers/bakta:1.7.0--pyhdfd78af_1' }"

     output:
     path "db*" , emit: db
diff --git a/modules/nf-core/bioawk/main.nf b/modules/nf-core/bioawk/main.nf
index 32ded6e0..bd5c82dd 100644
--- a/modules/nf-core/bioawk/main.nf
+++ b/modules/nf-core/bioawk/main.nf
@@ -5,7 +5,7 @@ process BIOAWK {
     conda "bioconda::bioawk=1.0"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/bioawk:1.0--h5bf99c6_6':
-        'quay.io/biocontainers/bioawk:1.0--h5bf99c6_6' }"
+        'biocontainers/bioawk:1.0--h5bf99c6_6' }"

     input:
     tuple val(meta), path(input)
diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf
index 800a6099..ebc87273 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/main.nf
+++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf
@@ -5,7 +5,7 @@ process CUSTOM_DUMPSOFTWAREVERSIONS {
     conda "bioconda::multiqc=1.14"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' :
-        'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }"
+        'biocontainers/multiqc:1.14--pyhdfd78af_0' }"

     input:
     path versions
diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
index 60b546a0..c32657de 100644
--- a/modules/nf-core/custom/dumpsoftwareversions/meta.yml
+++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml
@@ -1,7 +1,9 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json
 name: custom_dumpsoftwareversions
 description: Custom module used to dump software versions within the nf-core pipeline template
 keywords:
   - custom
+  - dump
   - version
 tools:
   - custom:
diff --git a/modules/nf-core/deeparg/downloaddata/main.nf b/modules/nf-core/deeparg/downloaddata/main.nf
index 95bd6480..724a002b 100644
--- a/modules/nf-core/deeparg/downloaddata/main.nf
+++ b/modules/nf-core/deeparg/downloaddata/main.nf
@@ -4,7 +4,7 @@ process DEEPARG_DOWNLOADDATA {
     conda "bioconda::deeparg=1.0.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
         'https://depot.galaxyproject.org/singularity/deeparg:1.0.2--pyhdfd78af_1' :
-        'quay.io/biocontainers/deeparg:1.0.2--pyhdfd78af_1' }"
+        'biocontainers/deeparg:1.0.2--pyhdfd78af_1' }"
     /*
     We have to force singularity to run with -B to allow reading of a problematic file with borked read-write permissions in an upstream dependency (theanos).
     Original report: https://github.com/nf-core/funcscan/issues/23
diff --git a/modules/nf-core/deeparg/predict/main.nf b/modules/nf-core/deeparg/predict/main.nf
index d2183170..11318476 100644
--- a/modules/nf-core/deeparg/predict/main.nf
+++ b/modules/nf-core/deeparg/predict/main.nf
@@ -5,7 +5,7 @@ process DEEPARG_PREDICT {
     conda "bioconda::deeparg=1.0.2"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/deeparg:1.0.2--pyhdfd78af_1' : - 'quay.io/biocontainers/deeparg:1.0.2--pyhdfd78af_1' }" + 'biocontainers/deeparg:1.0.2--pyhdfd78af_1' }" /* We have to force singularity to run with -B to allow reading of a problematic file with borked read-write permissions in an upstream dependency (theanos). Original report: https://github.com/nf-core/funcscan/issues/23 diff --git a/modules/nf-core/deepbgc/download/main.nf b/modules/nf-core/deepbgc/download/main.nf index f3282fc4..e4f0d503 100644 --- a/modules/nf-core/deepbgc/download/main.nf +++ b/modules/nf-core/deepbgc/download/main.nf @@ -4,7 +4,7 @@ process DEEPBGC_DOWNLOAD { conda "bioconda::deepbgc=0.1.30" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/deepbgc:0.1.30--pyhb7b1952_1': - 'quay.io/biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }" + 'biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }" output: path "deepbgc_db/" , emit: db diff --git a/modules/nf-core/deepbgc/pipeline/main.nf b/modules/nf-core/deepbgc/pipeline/main.nf index a0683ef7..e3aefba8 100644 --- a/modules/nf-core/deepbgc/pipeline/main.nf +++ b/modules/nf-core/deepbgc/pipeline/main.nf @@ -5,7 +5,7 @@ process DEEPBGC_PIPELINE { conda "bioconda::deepbgc=0.1.30" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/deepbgc:0.1.30--pyhb7b1952_1': - 'quay.io/biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }" + 'biocontainers/deepbgc:0.1.30--pyhb7b1952_1' }" input: tuple val(meta), path(genome) diff --git a/modules/nf-core/fargene/main.nf b/modules/nf-core/fargene/main.nf index 909170b1..b0f6c183 100644 --- a/modules/nf-core/fargene/main.nf +++ b/modules/nf-core/fargene/main.nf @@ -6,7 +6,7 @@ process FARGENE { conda "bioconda::fargene=0.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fargene:0.1--py27h21c881e_4' : - 'quay.io/biocontainers/fargene:0.1--py27h21c881e_4' }" + 'biocontainers/fargene:0.1--py27h21c881e_4' }" input: // input may be fasta (for genomes or longer contigs) or paired-end fastq (for metagenome), the latter in addition with --meta flag diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf index 9ae58381..07d5e433 100644 --- a/modules/nf-core/fastqc/main.nf +++ b/modules/nf-core/fastqc/main.nf @@ -5,7 +5,7 @@ process FASTQC { conda "bioconda::fastqc=0.11.9" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0' : - 'quay.io/biocontainers/fastqc:0.11.9--0' }" + 'biocontainers/fastqc:0.11.9--0' }" input: tuple val(meta), path(reads) diff --git a/modules/nf-core/gecco/run/main.nf b/modules/nf-core/gecco/run/main.nf index 7405a461..b0a76a71 100644 --- a/modules/nf-core/gecco/run/main.nf +++ b/modules/nf-core/gecco/run/main.nf @@ -5,7 +5,7 @@ process GECCO_RUN { conda "bioconda::gecco=0.9.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/gecco:0.9.2--pyhdfd78af_0': - 'quay.io/biocontainers/gecco:0.9.2--pyhdfd78af_0' }" + 'biocontainers/gecco:0.9.2--pyhdfd78af_0' }" input: tuple val(meta), path(input), path(hmm) diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf index d906034c..e7189d2f 100644 --- a/modules/nf-core/gunzip/main.nf +++ b/modules/nf-core/gunzip/main.nf @@ -5,7 +5,7 @@ process GUNZIP { conda "conda-forge::sed=4.7" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(archive) diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml index 2e0e4054..4cdcdf4c 100644 --- a/modules/nf-core/gunzip/meta.yml +++ b/modules/nf-core/gunzip/meta.yml @@ -3,6 +3,7 @@ description: Compresses and decompresses files. keywords: - gunzip - compression + - decompression tools: - gunzip: description: | diff --git a/modules/nf-core/hamronization/abricate/main.nf b/modules/nf-core/hamronization/abricate/main.nf index 7a526ab9..b73e04be 100644 --- a/modules/nf-core/hamronization/abricate/main.nf +++ b/modules/nf-core/hamronization/abricate/main.nf @@ -5,7 +5,7 @@ process HAMRONIZATION_ABRICATE { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: tuple val(meta), path(report) diff --git a/modules/nf-core/hamronization/amrfinderplus/main.nf b/modules/nf-core/hamronization/amrfinderplus/main.nf index 92a3f5fc..6f4cda34 100644 --- a/modules/nf-core/hamronization/amrfinderplus/main.nf +++ b/modules/nf-core/hamronization/amrfinderplus/main.nf @@ -5,7 +5,7 @@ process HAMRONIZATION_AMRFINDERPLUS { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: tuple val(meta), path(report) diff --git a/modules/nf-core/hamronization/deeparg/main.nf b/modules/nf-core/hamronization/deeparg/main.nf index 8a9a1d7e..193b1ae1 100644 --- a/modules/nf-core/hamronization/deeparg/main.nf +++ b/modules/nf-core/hamronization/deeparg/main.nf @@ -5,7 +5,7 @@ process HAMRONIZATION_DEEPARG { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: tuple val(meta), path(report) diff --git a/modules/nf-core/hamronization/fargene/main.nf b/modules/nf-core/hamronization/fargene/main.nf index 0e003429..79ebcc99 100644 --- a/modules/nf-core/hamronization/fargene/main.nf +++ b/modules/nf-core/hamronization/fargene/main.nf @@ -5,7 +5,7 @@ process HAMRONIZATION_FARGENE { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: tuple val(meta), path(report) diff --git a/modules/nf-core/hamronization/rgi/main.nf b/modules/nf-core/hamronization/rgi/main.nf index 5da71230..bb550ee6 100644 --- a/modules/nf-core/hamronization/rgi/main.nf +++ b/modules/nf-core/hamronization/rgi/main.nf @@ -5,7 +5,7 @@ process HAMRONIZATION_RGI { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: tuple val(meta), path(report) diff --git a/modules/nf-core/hamronization/summarize/main.nf b/modules/nf-core/hamronization/summarize/main.nf index 7660e160..fc58b720 100644 --- a/modules/nf-core/hamronization/summarize/main.nf +++ b/modules/nf-core/hamronization/summarize/main.nf @@ -4,7 +4,7 @@ process HAMRONIZATION_SUMMARIZE { conda "bioconda::hamronization=1.1.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hamronization:1.1.1--pyhdfd78af_0': - 'quay.io/biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" + 'biocontainers/hamronization:1.1.1--pyhdfd78af_0' }" input: path(reports) diff --git a/modules/nf-core/hmmer/hmmsearch/main.nf b/modules/nf-core/hmmer/hmmsearch/main.nf index d7759c1d..d40292d6 100644 --- a/modules/nf-core/hmmer/hmmsearch/main.nf +++ b/modules/nf-core/hmmer/hmmsearch/main.nf @@ -5,7 +5,7 @@ process HMMER_HMMSEARCH { conda "bioconda::hmmer=3.3.2" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/hmmer:3.3.2--h1b792b2_1' : - 'quay.io/biocontainers/hmmer:3.3.2--h1b792b2_1' }" + 'biocontainers/hmmer:3.3.2--h1b792b2_1' }" input: tuple val(meta), path(hmmfile), path(seqdb), val(write_align), val(write_target), val(write_domain) diff --git a/modules/nf-core/macrel/contigs/main.nf b/modules/nf-core/macrel/contigs/main.nf index 1eb1c0a0..df71bdea 100644 --- a/modules/nf-core/macrel/contigs/main.nf +++ b/modules/nf-core/macrel/contigs/main.nf @@ -5,7 +5,7 @@ process MACREL_CONTIGS { conda "bioconda::macrel=1.2.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/macrel:1.2.0--pyh5e36f6f_0': - 'quay.io/biocontainers/macrel:1.2.0--pyh5e36f6f_0' }" + 'biocontainers/macrel:1.2.0--pyh5e36f6f_0' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 4b604749..1fc387be 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -4,7 +4,7 @@ process MULTIQC { conda "bioconda::multiqc=1.14" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/multiqc:1.14--pyhdfd78af_0' : - 'quay.io/biocontainers/multiqc:1.14--pyhdfd78af_0' }" + 'biocontainers/multiqc:1.14--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index ebc29b27..f93b5ee5 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -1,3 +1,4 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json name: MultiQC description: Aggregate results from bioinformatics analyses across many samples into a single report keywords: @@ -37,7 +38,7 @@ output: description: MultiQC report file pattern: "multiqc_report.html" - data: - type: dir + type: directory description: MultiQC data dir pattern: "multiqc_data" - plots: diff --git a/modules/nf-core/prodigal/main.nf b/modules/nf-core/prodigal/main.nf index 4206166d..8cf87a6d 100644 --- a/modules/nf-core/prodigal/main.nf +++ b/modules/nf-core/prodigal/main.nf @@ -5,7 +5,7 @@ process PRODIGAL { conda "bioconda::prodigal=2.6.3 conda-forge::pigz=2.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' : - 'quay.io/biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" + 'biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" input: tuple val(meta), path(genome) diff --git a/modules/nf-core/prodigal/meta.yml b/modules/nf-core/prodigal/meta.yml index 8cb3d12e..30747a90 100644 --- a/modules/nf-core/prodigal/meta.yml +++ b/modules/nf-core/prodigal/meta.yml @@ -1,7 +1,9 @@ name: prodigal description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program keywords: - - sort + - prokaryotes + - gene finding + - microbial tools: - prodigal: description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program diff --git a/modules/nf-core/prokka/main.nf b/modules/nf-core/prokka/main.nf index 048d373f..60fbe232 100644 --- a/modules/nf-core/prokka/main.nf +++ b/modules/nf-core/prokka/main.nf @@ -5,7 +5,7 @@ process PROKKA { conda "bioconda::prokka=1.14.6" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/prokka%3A1.14.6--pl5321hdfd78af_4' : - 'quay.io/biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" + 'biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/pyrodigal/main.nf b/modules/nf-core/pyrodigal/main.nf index 2497cb7d..c6429b9d 100644 --- a/modules/nf-core/pyrodigal/main.nf +++ b/modules/nf-core/pyrodigal/main.nf @@ -4,17 +4,17 @@ process PYRODIGAL { conda "bioconda::pyrodigal=2.1.0" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/pyrodigal:2.1.0--py310h1425a21_0': - 'quay.io/biocontainers/pyrodigal:2.1.0--py310h1425a21_0' }" + 'https://depot.galaxyproject.org/singularity/mulled-v2-2fe9a8ce513c91df34b43a6610df94c3a2eb3bd0:697b3838b186fac6a9ceec198b09d4032162a079-0': + 'biocontainers/mulled-v2-2fe9a8ce513c91df34b43a6610df94c3a2eb3bd0:697b3838b186fac6a9ceec198b09d4032162a079-0' }" input: tuple val(meta), path(fasta) output: - tuple val(meta), path("*.gff") , emit: gff - tuple val(meta), path("*.fna") , emit: fna - tuple val(meta), path("*.faa") , emit: faa - tuple val(meta), path("*.score") , emit: score + tuple val(meta), path("*.gff.gz") , emit: gff + tuple val(meta), path("*.fna.gz") , emit: fna + tuple val(meta), path("*.faa.gz") , emit: faa + tuple val(meta), path("*.score.gz") , emit: score path "versions.yml" , emit: versions when: @@ -24,14 +24,18 @@ process PYRODIGAL { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ + pigz -cdf ${fasta} > pigz_fasta.fna + pyrodigal \\ $args \\ - -i ${fasta} \\ + -i pigz_fasta.fna \\ -o ${prefix}.gff \\ -d ${prefix}.fna \\ -a ${prefix}.faa \\ -s ${prefix}.score + pigz -nm ${prefix}* + cat <<-END_VERSIONS > versions.yml "${task.process}": pyrodigal: \$(echo \$(pyrodigal --version 2>&1 | sed 's/pyrodigal v//')) diff --git a/modules/nf-core/pyrodigal/meta.yml b/modules/nf-core/pyrodigal/meta.yml index cbceb2c8..6553e3c6 100644 --- a/modules/nf-core/pyrodigal/meta.yml +++ b/modules/nf-core/pyrodigal/meta.yml @@ -23,7 +23,7 @@ input: - fasta: type: file description: FASTA file - pattern: "*.{fasta,fa,fna}" + pattern: "*.{fasta.gz,fa.gz,fna.gz}" output: - meta: @@ -38,19 +38,19 @@ output: - gff: type: file description: gene annotations in gff format - pattern: "*.{gff}" + pattern: "*.{gff.gz}" - faa: type: file description: protein translations file - pattern: "*.{faa}" + pattern: "*.{faa.gz}" - fna: type: file description: nucleotide sequences file - pattern: "*.{fna}" + pattern: "*.{fna.gz}" - score: type: file description: all potential genes (with scores) - pattern: "*.{score}" + pattern: "*.{score.gz}" authors: - "@louperelo" diff --git a/modules/nf-core/rgi/main/main.nf b/modules/nf-core/rgi/main/main.nf index ce414691..26be7734 100644 --- a/modules/nf-core/rgi/main/main.nf +++ b/modules/nf-core/rgi/main/main.nf @@ -5,7 +5,7 @@ process RGI_MAIN { conda "bioconda::rgi=5.2.1" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/rgi:5.2.1--pyha8f3691_2': - 'quay.io/biocontainers/rgi:5.2.1--pyha8f3691_2' }" + 'biocontainers/rgi:5.2.1--pyha8f3691_2' }" input: tuple val(meta), path(fasta) diff --git a/modules/nf-core/tabix/bgzip/main.nf b/modules/nf-core/tabix/bgzip/main.nf index 6dd4e202..8c47d9e2 100644 --- a/modules/nf-core/tabix/bgzip/main.nf +++ b/modules/nf-core/tabix/bgzip/main.nf @@ -5,7 +5,7 @@ process TABIX_BGZIP { conda "bioconda::tabix=1.11" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
'https://depot.galaxyproject.org/singularity/tabix:1.11--hdfd78af_0' : - 'quay.io/biocontainers/tabix:1.11--hdfd78af_0' }" + 'biocontainers/tabix:1.11--hdfd78af_0' }" input: tuple val(meta), path(input) diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf index 3384847a..8cd1856c 100644 --- a/modules/nf-core/untar/main.nf +++ b/modules/nf-core/untar/main.nf @@ -5,7 +5,7 @@ process UNTAR { conda "conda-forge::sed=4.7 bioconda::grep=3.4 conda-forge::tar=1.34" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : - 'ubuntu:20.04' }" + 'nf-core/ubuntu:20.04' }" input: tuple val(meta), path(archive) diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml index ea7a3f38..db241a6e 100644 --- a/modules/nf-core/untar/meta.yml +++ b/modules/nf-core/untar/meta.yml @@ -3,6 +3,7 @@ description: Extract files. keywords: - untar - uncompress + - extract tools: - untar: description: | diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index d0caa794..1e641893 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -27,7 +27,7 @@ workflow AMP { // to ensure annotation is executed! ch_faa_for_amplify = faa ch_faa_for_amp_hmmsearch = faa - ch_faa_for_ampir = faa.dump(tag: "amp_faa") + ch_faa_for_ampir = faa ch_faa_for_ampcombi = faa // AMPLIFY diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 5ea008cf..beb9650f 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -24,8 +24,6 @@ workflow ANNOTATION { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - - if ( params.annotation_tool == "prodigal" ) { PRODIGAL_GFF ( fasta, "gff" ) GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) @@ -42,7 +40,7 @@ workflow ANNOTATION { if ( params.save_annotations == true ) { PRODIGAL_GBK ( fasta, "gbk" ) - GUNZIP_GBK ( PRODIGAL_GBK.out.gene_annotations) + GUNZIP_PRODIGAL_GBK ( PRODIGAL_GBK.out.gene_annotations) ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. 
} diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 6c26b3e7..02f3172e 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -150,7 +150,6 @@ workflow FUNCSCAN { meta['longest_contig'] = Integer.parseInt(length) [ meta, fasta ] } - //.dump(tag: "prepped_input") /* ANNOTATION @@ -159,7 +158,6 @@ workflow FUNCSCAN { // Then we make specific channels for each context ch_input_for_annotation = ch_prepped_fastas .join(ch_preannotated_files) - .dump(tag: "joined") .branch { meta, fasta, protein, feature -> annotated_protein: protein != null @@ -171,10 +169,9 @@ workflow FUNCSCAN { // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { - ANNOTATION( ch_input_for_annotation.unannotated.map{meta, fasta, protein, feature -> [meta, fasta]}.dump(tag: "unannotated") ) + ANNOTATION( ch_input_for_annotation.unannotated.map{meta, fasta, protein, feature -> [meta, fasta]}) - ch_new_annotation_faa = ANNOTATION.out.faa.dump(tag: "faa") - ch_new_annotation_fna = ANNOTATION.out.fna + ch_new_annotation_faa = ANNOTATION.out.faa ch_new_annotation_gff = ANNOTATION.out.gff ch_new_annotation_gbk = ANNOTATION.out.gbk From a7b03a787fa060670d9cbe5a4b113f580b74033d Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 7 Feb 2024 10:36:00 +0100 Subject: [PATCH 08/45] Add test nothing config --- conf/test_nothing.config | 48 ++++++++++++++++++++++++++++++++++++++++ nextflow.config | 1 + workflows/funcscan.nf | 3 ++- 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 conf/test_nothing.config diff --git a/conf/test_nothing.config b/conf/test_nothing.config new file mode 100644 index 00000000..7c6195f0 --- /dev/null +++ b/conf/test_nothing.config @@ -0,0 +1,48 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/funcscan -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet.csv' + amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + + annotation_tool = 'prodigal' + + arg_fargene_hmmmodel = 'class_a,class_b_1_2' + + amp_skip_amplify = true + amp_skip_macrel = true + amp_skip_ampir = true + amp_skip_hmmsearch = true + + arg_skip_deeparg = true + arg_skip_fargene = true + arg_skip_rgi = true + arg_skip_amrfinderplus = true + arg_skip_abricate = true + + bgc_skip_antismash = true + bgc_skip_deepbgc = true + bgc_skip_gecco = true + bgc_skip_hmmsearch = true + + +} diff --git a/nextflow.config b/nextflow.config index 15e90338..7a3964fc 100644 --- a/nextflow.config +++ b/nextflow.config @@ -325,6 +325,7 @@ profiles { test_bgc { includeConfig 'conf/test_bgc.config' } test_full { includeConfig 'conf/test_full.config' } test_deeparg { includeConfig 'conf/test_deeparg.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } } // Load igenomes.config if required diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 748207c7..70b87b25 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -170,7 +170,8 @@ workflow FUNCSCAN { // Join back prepped fastas with any other additional files (protein, fasta) // Then we make specific channels for each context ch_input_for_annotation = ch_prepped_fastas - .join(ch_preannotated_files) + .dump(tag: 'fastas') + .join(ch_preannotated_files.dump(tag: 'features')) .branch { meta, fasta, protein, feature -> annotated_protein: protein != null From ac0a25dc45776c84f25b51b6a23c021730eedb84 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 7 Feb 2024 12:30:54 +0100 Subject: [PATCH 09/45] Get back to previous starting point before bad merge removed old changes --- assets/schema_input.json | 18 ++++- subworkflows/local/annotation.nf | 110 ++++++++++++++----------- workflows/funcscan.nf | 32 ++++----- 3 files changed, 85 insertions(+), 75 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 757969c2..615801e3 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -19,7 +19,23 @@ "format": "file-path", "exists": true, "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$", - "errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fasta', '.fas', '.fa' or '.fna' (any of these can be optionally compressed as '.gz')", + "errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fa.gz', '.fna.gz' or '.fasta.gz'", + "unique": true }, + "protein": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.(fasta|fas|fa|faa)(\\.gz)?$", + "errorMessage": "Input file for peptide annotations has incorrect file format.
File must end in .fasta, .faa, or .fa", + "unique": true + }, + "feature": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "^\\S+\\.g(bk|ff)\\.gz$$", + "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gff", "unique": true } }, diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index beb9650f..9fab394f 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -24,73 +24,65 @@ workflow ANNOTATION { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - if ( params.annotation_tool == "prodigal" ) { - PRODIGAL_GFF ( fasta, "gff" ) - GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) - GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta) - GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) - ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GFF.out.versions) - ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive + // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) + if ( params.annotation_tool == "prodigal" ) { + PRODIGAL_GFF ( ch_prepped_input, "gff" ) + GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) + GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta) + GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) + ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) + ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip + ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip + ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive - if ( params.save_annotations == true ) { - PRODIGAL_GBK ( fasta, "gbk" ) - GUNZIP_PRODIGAL_GBK ( PRODIGAL_GBK.out.gene_annotations) - ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) - ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. - } - - } else if ( params.annotation_tool == "pyrodigal" ) { + if ( params.save_annotations == true ) { + PRODIGAL_GBK ( ch_prepped_input, "gbk" ) + ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) + ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. 
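// (Context for the double invocation above — a sketch, assuming the standard Prodigal CLI
// rather than quoting this patch: Prodigal writes a single annotation format per run,
// selected with its `-f` flag (gbk, gff, or sco), which the nf-core module surfaces as
// its second input. Producing both formats, as PROKKA does natively in one pass,
// therefore requires two runs, e.g.:
//   prodigal -i contigs.fna -f gff -o out.gff -a out.faa -d out.fna
//   prodigal -i contigs.fna -f gbk -o out.gbk -a out.faa -d out.fna )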
+ } - PYRODIGAL ( fasta ) - GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) - GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) - GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff ) - ch_versions = ch_versions.mix(PYRODIGAL.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GFF.out.versions) - ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PYRODIGAL_FAA.out.gunzip - ch_annotation_gff = GUNZIP_PYRODIGAL_FAA.out.gunzip - ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK + } else if ( params.annotation_tool == "pyrodigal" ) { - } else if ( params.annotation_tool == "prokka" ) { + PYRODIGAL ( ch_prepped_input ) + GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) + GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) + GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff ) + ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip + ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip + ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK - PROKKA ( fasta, [], [] ) - ch_versions = ch_versions.mix(PROKKA.out.versions) - ch_annotation_faa = PROKKA.out.faa - ch_annotation_fna = PROKKA.out.fna - ch_annotation_gff = PROKKA.out.gff - ch_annotation_gbk = PROKKA.out.gbk - ch_multiqc_files = PROKKA.out.txt + } else if ( params.annotation_tool == "prokka" ) { - } else if ( params.annotation_tool == "bakta" ) { + PROKKA ( ch_prepped_input, [], [] ) + ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_annotation_faa = PROKKA.out.faa + ch_annotation_fna = PROKKA.out.fna + ch_annotation_gff = PROKKA.out.gff + ch_annotation_gbk = PROKKA.out.gbk - // BAKTA prepare download - if ( params.annotation_bakta_db_localpath ) { - ch_bakta_db = Channel - .fromPath( params.annotation_bakta_db_localpath ) - .first() - } else { - BAKTA_BAKTADBDOWNLOAD ( ) - ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions ) - ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db ) - } + } else if ( params.annotation_tool == "bakta" ) { - BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) - ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) - ch_annotation_faa = BAKTA_BAKTA.out.faa - ch_annotation_fna = BAKTA_BAKTA.out.fna - ch_annotation_gff = BAKTA_BAKTA.out.gff - ch_annotation_gbk = BAKTA_BAKTA.out.gbff + // BAKTA prepare download + if ( params.annotation_bakta_db_localpath ) { + ch_bakta_db = Channel + .fromPath( params.annotation_bakta_db_localpath ) + .first() + } else { + BAKTA_BAKTADBDOWNLOAD ( ) + ch_versions = ch_versions.mix( BAKTA_BAKTADBDOWNLOAD.out.versions ) + ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db ) + } - } + BAKTA_BAKTA ( ch_prepped_input, ch_bakta_db, [], [] ) + ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) + ch_annotation_faa = BAKTA_BAKTA.out.faa + ch_annotation_fna = BAKTA_BAKTA.out.fna + ch_annotation_gff = BAKTA_BAKTA.out.gff + ch_annotation_gbk = BAKTA_BAKTA.out.gbff + } emit: versions = ch_versions diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index ec77e85a..a11ce14f 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -111,20 +111,23 @@ workflow FUNCSCAN { ch_input = Channel.fromSamplesheet("input") // Some tools require uncompressed input - ch_fasta_prep = INPUT_CHECK.out.contigs - .map{ - meta, fasta, protein, feature -> - [meta, fasta] - } - .branch { - compressed: it[1].toString().endsWith('.gz') 
- uncompressed: it[1] - } + ch_fasta_prep = ch_input + .dump(tag: 'ch_fasta_prep') + .map{ + meta, fasta, protein, feature -> + [meta, fasta] + } + .branch { + compressed: it[1].toString().endsWith('.gz') + uncompressed: it[1] + } - ch_preannotated_files = INPUT_CHECK.out.contigs.map{ - meta, fasta, protein, feature -> - [meta, protein, feature] - } + ch_preannotated_files = ch_input + .dump(tag: 'ch_preannotated_files') + .map{ + meta, fasta, protein, feature -> + [meta, protein, feature] + } GUNZIP_FASTA_PREP ( ch_fasta_prep.compressed ) ch_versions = ch_versions.mix(GUNZIP_FASTA_PREP.out.versions) @@ -163,7 +166,6 @@ workflow FUNCSCAN { } // Some tools require annotated FASTAs - // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { ANNOTATION( ch_input_for_annotation.unannotated.map{meta, fasta, protein, feature -> [meta, fasta]}) @@ -183,7 +185,7 @@ workflow FUNCSCAN { // Join back the pre-annotated FASTAs with newly annotated FASTAs ch_annotation_proteins = ch_input_for_annotation.annotated_feature.map{meta, fasta, protein, feature -> [meta, feature]} - ch_annotation_faa = ch_new_annotation_faa.mix(ch_annotation_proteins) + ch_annotation_faa = ch_new_annotation_faa.mix(ch_annotation_proteins).dump(tag: 'ch_annotation_faa') ch_annotation_features = ch_input_for_annotation.annotated_feature.map{meta, fasta, protein, feature -> [meta, feature]} ch_annotation_gff = ch_annotation_features.filter { meta, feature -> feature.toString().endsWith('.gff') }.mix(ch_new_annotation_gff) From 437010679b4c8af8e2400b34fb3c0e5f58651a40 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 14 Feb 2024 10:26:10 +0100 Subject: [PATCH 10/45] Refactor - have amp/arg working. Includes better fargene tagging --- assets/schema_input.json | 14 ++- conf/modules.config | 1 + subworkflows/local/amp.nf | 31 ++--- subworkflows/local/annotation.nf | 14 ++- subworkflows/local/arg.nf | 18 +-- workflows/funcscan.nf | 210 ++++++++++++++++++------------- 6 files changed, 163 insertions(+), 125 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 615801e3..032692f5 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -18,7 +18,7 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.(fasta|fas|fa|fna)(\\.gz)?$", + "pattern": "^\\S+\\.(fasta|fas|fna|fa)(\\.gz)?$", "errorMessage": "Fasta file for reads must be provided, cannot contain spaces and must have extension '.fa.gz', '.fna.gz' or '.fasta.gz'", "unique": true }, @@ -26,17 +26,19 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.(fasta|fas|fa|faa)(\\.gz)?$", - "errorMessage": "Input file for peptide annotations has incorrect file format. File must end in .fasta, .faa, or .fa", - "unique": true + "pattern": "^\\S+\\.(faa)(\\.gz)?$", + "errorMessage": "Input file for peptide annotations has incorrect file format. 
File must end in .fasta, .faa", + "unique": true, + "dependentRequired": ["feature"] }, "feature": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.g(bk|ff)\\.gz$$", + "pattern": "^\\S+\\.g(bk|ff)(\\.gz)?$", "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gff", - "unique": true + "unique": true, + "dependentRequired": ["protein"] } }, "required": ["sample", "fasta"] diff --git a/conf/modules.config b/conf/modules.config index 0558176e..103a0258 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -228,6 +228,7 @@ process { } withName: FARGENE { + tag = {"${meta.id}|${hmm_model}"} publishDir = [ [ path: { "${params.outdir}/arg/fargene/${meta.id}" }, diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 1e641893..d03a425e 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -13,8 +13,8 @@ include { TABIX_BGZIP } from '../. workflow AMP { take: - contigs // tuple val(meta), path(contigs) - faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + fastas // tuple val(meta), path(fasta) + faas // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) main: ch_versions = Channel.empty() @@ -25,10 +25,10 @@ workflow AMP { // When adding new tool that requires FAA, make sure to update conditions // in funcscan.nf around annotation and AMP subworkflow execution // to ensure annotation is executed! - ch_faa_for_amplify = faa - ch_faa_for_amp_hmmsearch = faa - ch_faa_for_ampir = faa - ch_faa_for_ampcombi = faa + ch_faa_for_amplify = faas + ch_faa_for_amp_hmmsearch = faas + ch_faa_for_ampir = faas + ch_faa_for_ampcombi = faas // AMPLIFY if ( !params.amp_skip_amplify ) { @@ -39,7 +39,7 @@ workflow AMP { // MACREL if ( !params.amp_skip_macrel ) { - MACREL_CONTIGS ( contigs ) + MACREL_CONTIGS ( fastas ) ch_versions = ch_versions.mix(MACREL_CONTIGS.out.versions) GUNZIP_MACREL_PRED ( MACREL_CONTIGS.out.amp_prediction ) GUNZIP_MACREL_ORFS ( MACREL_CONTIGS.out.all_orfs ) @@ -70,14 +70,15 @@ workflow AMP { [ meta, file ] } - ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch.combine(ch_amp_hmm_models_meta) - .map { - meta_faa, faa, meta_hmm, hmm -> - def meta_new = [:] - meta_new['id'] = meta_faa['id'] - meta_new['hmm_id'] = meta_hmm['id'] - [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] - } + ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch + .combine(ch_amp_hmm_models_meta) + .map { + meta_faa, faa, meta_hmm, hmm -> + def meta_new = [:] + meta_new['id'] = meta_faa['id'] + meta_new['hmm_id'] = meta_hmm['id'] + [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] + } AMP_HMMER_HMMSEARCH ( ch_in_for_amp_hmmsearch ) ch_versions = ch_versions.mix(AMP_HMMER_HMMSEARCH.out.versions) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 9fab394f..3d6daa6a 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -26,7 +26,7 @@ workflow ANNOTATION { // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) if ( params.annotation_tool == "prodigal" ) { - PRODIGAL_GFF ( ch_prepped_input, "gff" ) + PRODIGAL_GFF ( fasta, "gff" ) GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta) GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) @@ -37,14 +37,14 @@ 
workflow ANNOTATION { ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive if ( params.save_annotations == true ) { - PRODIGAL_GBK ( ch_prepped_input, "gbk" ) + PRODIGAL_GBK ( fasta, "gbk" ) ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. } } else if ( params.annotation_tool == "pyrodigal" ) { - PYRODIGAL ( ch_prepped_input ) + PYRODIGAL ( fasta ) GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff ) ch_versions = ch_versions.mix(PYRODIGAL.out.versions) @@ -56,8 +56,10 @@ workflow ANNOTATION { } else if ( params.annotation_tool == "prokka" ) { - PROKKA ( ch_prepped_input, [], [] ) + PROKKA ( fasta, [], [] ) ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_multiqc_files = PROKKA.out.txt + ch_annotation_faa = PROKKA.out.faa ch_annotation_fna = PROKKA.out.fna ch_annotation_gff = PROKKA.out.gff @@ -76,8 +78,10 @@ workflow ANNOTATION { ch_bakta_db = ( BAKTA_BAKTADBDOWNLOAD.out.db ) } - BAKTA_BAKTA ( ch_prepped_input, ch_bakta_db, [], [] ) + BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) + ch_multiqc_files = BAKTA_BAKTA.out.txt + ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna ch_annotation_gff = BAKTA_BAKTA.out.gff diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 17945a7c..b73b28db 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -18,7 +18,7 @@ include { HAMRONIZATION_SUMMARIZE } from '../../modules/nf-core/hamronizati workflow ARG { take: - contigs // tuple val(meta), path(contigs) + fastas // tuple val(meta), path(fastas) annotations // output from prokka main: @@ -40,7 +40,7 @@ workflow ARG { } if ( !params.arg_skip_amrfinderplus ) { - AMRFINDERPLUS_RUN ( contigs, ch_amrfinderplus_db ) + AMRFINDERPLUS_RUN ( fastas, ch_amrfinderplus_db ) ch_versions = ch_versions.mix(AMRFINDERPLUS_RUN.out.versions) // Reporting @@ -54,20 +54,20 @@ workflow ARG { ch_fargene_classes = Channel.fromList( params.arg_fargene_hmmmodel.tokenize(',') ) - ch_fargene_input = contigs + ch_fargene_input = fastas .combine(ch_fargene_classes) .map { - meta, contigs, hmm_class -> + meta, fastas, hmm_class -> def meta_new = meta.clone() meta_new['hmm_class'] = hmm_class - [ meta_new, contigs, hmm_class ] + [ meta_new, fastas, hmm_class ] } .multiMap { - contigs: [ it[0], it[1] ] + fastas: [ it[0], it[1] ] hmmclass: it[2] } - FARGENE ( ch_fargene_input.contigs, ch_fargene_input.hmmclass ) + FARGENE ( ch_fargene_input.fastas, ch_fargene_input.hmmclass ) ch_versions = ch_versions.mix(FARGENE.out.versions) // Reporting @@ -80,7 +80,7 @@ workflow ARG { // RGI run if ( !params.arg_skip_rgi ) { - RGI_MAIN ( contigs ) + RGI_MAIN ( fastas ) ch_versions = ch_versions.mix(RGI_MAIN.out.versions) // Reporting @@ -127,7 +127,7 @@ workflow ARG { // ABRicate run if ( !params.arg_skip_abricate ) { - ABRICATE_RUN ( contigs ) + ABRICATE_RUN ( fastas ) ch_versions = ch_versions.mix(ABRICATE_RUN.out.versions) HAMRONIZATION_ABRICATE ( ABRICATE_RUN.out.report, 'json', '1.0.1', '2021-Mar-27' ) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 1b2bfb1a..4f0016e0 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -79,7 +79,7 @@ include { BGC } from '../subworkflows/local/bgc' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { CUSTOM_DUMPSOFTWAREVERSIONS }
from '../modules/nf-core/custom/dumpsoftwareversions/main' -include { GUNZIP as GUNZIP_FASTA_PREP } from '../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_INPUT_PREP } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../modules/nf-core/gunzip/main' @@ -111,98 +111,128 @@ workflow FUNCSCAN { ch_input = Channel.fromSamplesheet("input") // Some tools require uncompressed input - ch_fasta_prep = ch_input - .dump(tag: 'ch_fasta_prep') - .map{ - meta, fasta, protein, feature -> - [meta, fasta] - } + ch_input_prep = ch_input + .map{meta, fasta, faa, feature -> [meta, [fasta, faa, feature]]} + .transpose() .branch { compressed: it[1].toString().endsWith('.gz') uncompressed: it[1] } - ch_preannotated_files = ch_input - .dump(tag: 'ch_preannotated_files') - .map{ - meta, fasta, protein, feature -> - [meta, protein, feature] - } - - GUNZIP_FASTA_PREP ( ch_fasta_prep.compressed ) - ch_versions = ch_versions.mix(GUNZIP_FASTA_PREP.out.versions) + GUNZIP_INPUT_PREP ( ch_input_prep.compressed ) + ch_versions = ch_versions.mix(GUNZIP_INPUT_PREP.out.versions) // Merge all the already uncompressed and newly compressed FASTAs here into // a single input channel for downstream - ch_prepped_fastas = GUNZIP_FASTA_PREP.out.gunzip - .mix(ch_fasta_prep.uncompressed) + ch_intermediate_input = GUNZIP_INPUT_PREP.out.gunzip + .mix(ch_input_prep.uncompressed) + .groupTuple() + .map{ + meta, files -> + def fasta_found = files.find{it.toString().tokenize('.').last().matches('fasta|fas|fna|fa')} + def faa_found = files.find{it.toString().endsWith('.faa')} + def feature_found = files.find{it.toString().tokenize('.').last().matches('gff|gbk')} + + def fasta = fasta_found != null ? fasta_found : [] + def faa = faa_found != null ? faa_found : [] + def feature = feature_found != null ? 
feature_found : [] + + [meta, fasta, faa, feature] + } + .multiMap { + meta, fasta, faa, feature -> + fastas: [ meta, fasta ] + annotations : [ meta, faa, feature ] + } // Add to meta the length of longest contig for downstream filtering - BIOAWK ( ch_prepped_fastas ) + ch_intermediate_input.fastas + ch_intermediate_input.annotations + + BIOAWK ( ch_intermediate_input.fastas ) ch_versions = ch_versions.mix(BIOAWK.out.versions) - ch_prepped_input = ch_prepped_fastas - .join( BIOAWK.out.longest ) - .map{ - meta, fasta, length -> - def meta_new = meta.clone() - meta['longest_contig'] = Integer.parseInt(length) - [ meta, fasta ] - } + ch_intermediate_input = ch_intermediate_input.fastas + .join(BIOAWK.out.longest) + .join(ch_intermediate_input.annotations) + .map{ + meta, fasta, length, faa, feature -> + def meta_new = [:] + meta_new['longest_contig'] = Integer.parseInt(length) + [ meta + meta_new, fasta, faa, feature ] + } /* ANNOTATION */ - // Join back prepped fastas with any other additional files (protein, fasta) - // Then we make specific channels for each context - ch_input_for_annotation = ch_prepped_fastas - .dump(tag: 'fastas') - .join(ch_preannotated_files.dump(tag: 'features')) + + // Separate pre-annotated FASTAs from those that need annotation + ch_input_for_annotation = ch_intermediate_input .branch { meta, fasta, protein, feature -> - annotated_protein: protein != null - annotated_feature: feature != null + preannotated: protein != [] unannotated: true } // Some tools require annotated FASTAs if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { - ANNOTATION( ch_input_for_annotation.unannotated.map{meta, fasta, protein, feature -> [meta, fasta]}) - - ch_new_annotation_faa = ANNOTATION.out.faa - ch_new_annotation_gff = ANNOTATION.out.gff - ch_new_annotation_gbk = ANNOTATION.out.gbk + ch_unannotated_for_annotation = ch_input_for_annotation.unannotated + .map{ + meta, fasta, protein, feature -> + [meta, fasta] + } + + ANNOTATION( ch_unannotated_for_annotation ) + ch_versions = ch_versions.mix(ANNOTATION.out.versions) + + // Only Bakta and Prokka make GBK, else give empty entry to satisfy downstream cardinality + if ( ['bakta', 'prokka'].contains(params.annotation_tool) ) { + ch_new_annotation = ch_unannotated_for_annotation + .join(ANNOTATION.out.faa) + .join(ANNOTATION.out.gff) + .join(ANNOTATION.out.gbk) + } else { + ch_new_annotation = ch_unannotated_for_annotation + .join(ANNOTATION.out.faa) + .join(ANNOTATION.out.gff) + .map { + meta, fasta, faa, gff -> + [meta, fasta, faa, gff, []] + } + } } else { - - ch_new_annotation_faa = Channel.empty() - ch_new_annotation_fna = Channel.empty() - ch_new_annotation_gff = Channel.empty() - ch_new_annotation_gbk = Channel.empty() - + ch_new_annotation = Channel.empty() } - // Join back the pre-annotated FASTAs with newly annotated FASTAs - ch_annotation_proteins = ch_input_for_annotation.annotated_feature.map{meta, fasta, protein, feature -> [meta, feature]} - ch_annotation_faa = ch_new_annotation_faa.mix(ch_annotation_proteins).dump(tag: 'ch_annotation_faa') - - ch_annotation_features = ch_input_for_annotation.annotated_feature.map{meta, fasta, protein, feature -> [meta, feature]} - ch_annotation_gff = ch_annotation_features.filter { meta, feature -> feature.toString().endsWith('.gff') }.mix(ch_new_annotation_gff) 
- ch_annotation_gbk = ch_annotation_features.filter { meta, feature -> feature.toString().endsWith('.gbk') }.mix(ch_new_annotation_gbk) - + ch_prepped_input = ch_input_for_annotation.preannotated + .map{ + meta, fasta, protein, feature -> + def gff = feature.extension == 'gff' ? feature : [] + def gbk = feature.extension == 'gbk' ? feature : [] + [meta, fasta, protein, gff, gbk] + } + .mix(ch_new_annotation) + .multiMap { + meta, fasta, protein, gff, gbk -> + fastas: [meta, fasta] + faas: [meta, protein] + gffs: [meta, gff] + gbks: [meta, gbk] + } - /* - SCREENING - */ + // /* + // SCREENING + // */ /* AMPs */ if ( params.run_amp_screening ) { AMP ( - ch_prepped_input, - ch_annotation_faa + ch_prepped_input.fastas, + ch_prepped_input.faas .filter { meta, file -> if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") @@ -217,14 +247,14 @@ workflow FUNCSCAN { */ if ( params.run_arg_screening ) { if (params.arg_skip_deeparg) { - ARG ( ch_prepped_input, [] ) + ARG ( ch_prepped_input.fastas, [] ) } else { ARG ( - ch_prepped_input, - ch_annotation_faa + ch_prepped_input.fastas, + ch_prepped_input.faas .filter { meta, file -> - if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") + if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() } ) @@ -232,33 +262,33 @@ workflow FUNCSCAN { ch_versions = ch_versions.mix(ARG.out.versions) } - /* - BGCs - */ - if ( params.run_bgc_screening ) { - BGC ( - ch_prepped_input, - ch_annotation_gff - .filter { - meta, file -> - if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}") - !file.isEmpty() - }, - ch_annotation_faa - .filter { - meta, file -> - if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}") - !file.isEmpty() - }, - ch_annotation_gbk - .filter { - meta, file -> - if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}") - !file.isEmpty() - } - ) - ch_versions = ch_versions.mix(BGC.out.versions) - } + // /* + // BGCs + // */ + // if ( params.run_bgc_screening ) { + // BGC ( + // ch_prepped_input, + // ch_annotation_gff + // .filter { + // meta, file -> + // if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. BGC screening tools requiring this file will not be executed: ${meta.id}") + // !file.isEmpty() + // }, + // ch_annotation_faa + // .filter { + // meta, file -> + // if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}") + // !file.isEmpty() + // }, + // ch_annotation_gbk + // .filter { + // meta, file -> + // if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. 
AMP screening tools requiring this file will not be executed: ${meta.id}") + // !file.isEmpty() + // } + // ) + // ch_versions = ch_versions.mix(BGC.out.versions) + // } CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -287,7 +317,7 @@ workflow FUNCSCAN { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - if(params.annotation_tool=='prokka'){ch_multiqc_files = ch_multiqc_files.mix( PROKKA.out.txt.collect{it[1]}.ifEmpty([])) } + if(['prokka','bakta'].contains(params.annotation_tool)){ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files.collect{it[1]}.ifEmpty([])) } MULTIQC ( From 7eb5bed0a2f24f626f338e6d33fb9763c1129c74 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 14 Feb 2024 11:20:30 +0100 Subject: [PATCH 11/45] Have it working --- conf/modules.config | 5 +++ conf/test_nothing.config | 2 +- subworkflows/local/bgc.nf | 74 ++++++++++++++++++--------------------- workflows/funcscan.nf | 70 +++++++++++++++++++----------------- 4 files changed, 78 insertions(+), 73 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index 103a0258..cf60fdf6 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -53,6 +53,7 @@ process { } withName: PROKKA { + ext.prefix = { "${meta.id}_prokka" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/prokka/" }, mode: params.publish_dir_mode, @@ -88,6 +89,7 @@ process { } withName: BAKTA_BAKTA { + ext.prefix = { "${meta.id}_bakta" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/bakta" }, mode: params.publish_dir_mode, @@ -118,6 +120,7 @@ process { } withName: PRODIGAL_GFF { + ext.prefix = { "${meta.id}_prodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/prodigal/${meta.id}" }, mode: params.publish_dir_mode, @@ -134,6 +137,7 @@ process { } withName: PRODIGAL_GBK { + ext.prefix = { "${meta.id}_prodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/prodigal/${meta.id}" }, mode: params.publish_dir_mode, @@ -150,6 +154,7 @@ process { } withName: PYRODIGAL { + ext.prefix = { "${meta.id}_pyrodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping publishDir = [ path: { "${params.outdir}/annotation/pyrodigal/${meta.id}" }, mode: params.publish_dir_mode, diff --git a/conf/test_nothing.config b/conf/test_nothing.config index bfc89024..51d3d57e 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -22,6 +22,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' + bgc_hmmsearch_models = 
'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' annotation_tool = 'prodigal' @@ -48,5 +49,4 @@ params { bgc_skip_gecco = true bgc_skip_hmmsearch = true - } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 22074d16..2e96758a 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -6,7 +6,8 @@ include { UNTAR as UNTAR_CSS } from '../../modules/nf-core include { UNTAR as UNTAR_DETECTION } from '../../modules/nf-core/untar/main' include { UNTAR as UNTAR_MODULES } from '../../modules/nf-core/untar/main' include { ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES } from '../../modules/nf-core/antismash/antismashlitedownloaddatabases/main' -include { ANTISMASH_ANTISMASHLITE } from '../../modules/nf-core/antismash/antismashlite/main' +include { ANTISMASH_ANTISMASHLITE as ANTISMASH_GBK } from '../../modules/nf-core/antismash/antismashlite/main' +include { ANTISMASH_ANTISMASHLITE as ANTISMASH_GFF } from '../../modules/nf-core/antismash/antismashlite/main' include { GECCO_RUN } from '../../modules/nf-core/gecco/run/main' include { HMMER_HMMSEARCH as BGC_HMMER_HMMSEARCH } from '../../modules/nf-core/hmmer/hmmsearch/main' include { DEEPBGC_DOWNLOAD } from '../../modules/nf-core/deepbgc/download/main' @@ -16,10 +17,10 @@ include { COMBGC } from '../../modules/local/c workflow BGC { take: - fna // tuple val(meta), path(PREPPED_INPUT.out.fna) - gff // tuple val(meta), path(.out.gff) - faa // tuple val(meta), path(.out.faa) - gbk // tuple val(meta), path(.out.gbk) + fastas // tuple val(meta), path(PREPPED_INPUT.out.fna) + faas // tuple val(meta), path(.out.faa) + gffs // tuple val(meta), path(.out.gff) + gbks // tuple val(meta), path(.out.gbk) main: ch_versions = Channel.empty() @@ -28,7 +29,7 @@ workflow BGC { // When adding new tool that requires FAA, make sure to update conditions // in funcscan.nf around annotation and AMP subworkflow execution // to ensure annotation is executed! - ch_faa_for_bgc_hmmsearch = faa + ch_faa_for_bgc_hmmsearch = faas // ANTISMASH if ( !params.bgc_skip_antismash ) { @@ -68,52 +69,45 @@ workflow BGC { } - if ( params.annotation_tool == 'prodigal' || params.annotation_tool == "pyrodigal" ) { + // Exact input combination to antismash depends on whether gff (requires fna) or gbk (just gbk) necessary - ch_antismash_input = fna.join(gff, by: 0) + ch_antismash_gff_input = fastas.join(gffs, by: 0) .filter { - meta, fna, gff -> + meta, fastas, gff -> if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. Antismash will not be run for sample: ${meta.id}." 
meta.longest_contig >= params.bgc_antismash_sampleminlength } .multiMap { - meta, fna, gff -> - fna: [ meta, fna ] - gff: [ gff ] + meta, fastas, gff -> + fastas: [ meta, fastas ] + gffs: [ gff ] } - ANTISMASH_ANTISMASHLITE ( ch_antismash_input.fna, ch_antismash_databases, ch_antismash_directory, ch_antismash_input.gff ) + ANTISMASH_GFF ( ch_antismash_gff_input.fastas, ch_antismash_databases, ch_antismash_directory, ch_antismash_gff_input.gffs ) + ch_versions = ch_versions.mix(ANTISMASH_GFF.out.versions) - } else if ( params.annotation_tool == 'prokka' ) { - - ch_antismash_input = gbk.filter { - meta, files -> - if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. Antismash will not be run for sample: ${meta.id}." - meta.longest_contig >= params.bgc_antismash_sampleminlength - } - - ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] ) - - } else if ( params.annotation_tool == 'bakta' ) { - - ch_antismash_input = gbk.filter { + ch_antismash_gbk_input = gbks.filter { meta, files -> if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. Antismash will not be run for sample: ${meta.id}." meta.longest_contig >= params.bgc_antismash_sampleminlength } - ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] ) - - } - - ch_versions = ch_versions.mix(ANTISMASH_ANTISMASHLITE.out.versions) - ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir - .mix(ANTISMASH_ANTISMASHLITE.out.gbk_input) - .groupTuple() - .map{ - meta, files -> - [meta, files.flatten()] - } + ANTISMASH_GBK ( ch_antismash_gbk_input, ch_antismash_databases, ch_antismash_directory, [] ) + ch_versions = ch_versions.mix(ANTISMASH_GBK.out.versions) + + ch_antismashresults_for_combgc = ANTISMASH_GFF.out.knownclusterblast_dir.dump(tag: 'gff_cluster') + .dump(tag: 'antismash_gff_knownclusterblast_dir') + .mix(ANTISMASH_GFF.out.gbk_input.dump(tag: 'gff_input')) + .dump(tag: 'antismash_gff_gbk_input') + .mix(ANTISMASH_GBK.out.knownclusterblast_dir.dump(tag: 'gbk_cluster')) + .dump(tag: 'antismash_gbk_knownclusterblast_dir') + .mix(ANTISMASH_GBK.out.gbk_input.dump(tag: 'gbk_input')) + .dump(tag: 'antismash_gbk_gbk_input') + .groupTuple() + .map{ + meta, files -> + [meta, files.flatten()] + } ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix(ch_antismashresults_for_combgc) } @@ -130,14 +124,14 @@ workflow BGC { ch_versions = ch_versions.mix(DEEPBGC_DOWNLOAD.out.versions) } - DEEPBGC_PIPELINE ( fna, ch_deepbgc_database) + DEEPBGC_PIPELINE ( fastas, ch_deepbgc_database) ch_versions = ch_versions.mix(DEEPBGC_PIPELINE.out.versions) ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix(DEEPBGC_PIPELINE.out.bgc_tsv) } // GECCO if ( !params.bgc_skip_gecco ) { - ch_gecco_input = fna.groupTuple() + ch_gecco_input = fastas.groupTuple() .multiMap { fna: [ it[0], it[1], [] ] } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 4f0016e0..aa321015 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -110,6 +110,10 @@ workflow FUNCSCAN { ch_input = Channel.fromSamplesheet("input") + /////////////////////// + // INPUT PREPARATION // + /////////////////////// 
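// (Sketch of the preparation pattern that follows, assuming standard Nextflow channel
// semantics rather than quoting this commit verbatim:
//   [meta, [fasta, faa, feature]] | transpose()  => one [meta, file] pair per input file
//   | branch { compressed / uncompressed }       => route only *.gz files through GUNZIP
//   | mix(uncompressed) | groupTuple()           => regroup all files per sample by meta
//   then each file is reassigned to its slot by extension: fasta/fas/fna/fa, .faa,
//   or .gff/.gbk.)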
+
    // Some tools require uncompressed input
    ch_input_prep = ch_input
        .map{meta, fasta, faa, feature -> [meta, [fasta, faa, feature]]}
@@ -162,9 +166,9 @@ workflow FUNCSCAN {
            [ meta + meta_new, fasta, faa, feature ]
        }

-    /*
-        ANNOTATION
-    */
+    ////////////////
+    // ANNOTATION //
+    ////////////////

    // Separate pre-annotated FASTAs from those that need annotation
    ch_input_for_annotation = ch_intermediate_input
@@ -222,20 +226,21 @@ workflow FUNCSCAN {
            gbks: [meta, gbk]
        }

-    // /*
-    //     SCREENING
-    // */
+    ///////////////
+    // SCREENING //
+    ///////////////

    /*
        AMPs
    */
+
    if ( params.run_amp_screening ) {
        AMP (
            ch_prepped_input.fastas,
            ch_prepped_input.faas
                .filter {
                    meta, file ->
-                        if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
                        !file.isEmpty()
                }
        )
@@ -245,6 +250,7 @@ workflow FUNCSCAN {
    /*
        ARGs
    */
+
    if ( params.run_arg_screening ) {
        if (params.arg_skip_deeparg) {
            ARG ( ch_prepped_input.fastas, [] )
@@ -254,7 +260,7 @@ workflow FUNCSCAN {
                ch_prepped_input.faas
                    .filter {
                        meta, file ->
-                            if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}")
+                            if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. ARG screening tools requiring this file will not be executed: ${meta.id}")
                            !file.isEmpty()
                    }
            )
@@ -265,30 +271,30 @@ workflow FUNCSCAN {
    // /*
    //     BGCs
    // */
-    // if ( params.run_bgc_screening ) {
-    //     BGC (
-    //         ch_prepped_input,
-    //         ch_annotation_gff
-    //             .filter {
-    //                 meta, file ->
-    //                     if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. BGC screening tools requiring this file will not be executed: ${meta.id}")
-    //                     !file.isEmpty()
-    //             },
-    //         ch_annotation_faa
-    //             .filter {
-    //                 meta, file ->
-    //                     if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
-    //                     !file.isEmpty()
-    //             },
-    //         ch_annotation_gbk
-    //             .filter {
-    //                 meta, file ->
-    //                     if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
-    //                     !file.isEmpty()
-    //             }
-    //     )
-    //     ch_versions = ch_versions.mix(BGC.out.versions)
-    // }
+    if ( params.run_bgc_screening ) {
+        BGC (
+            ch_prepped_input.fastas,
+            ch_prepped_input.faas
+                .filter {
+                    meta, file ->
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_prepped_input.gffs
+                .filter {
+                    meta, file ->
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty GFF file. 
BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                },
+            ch_prepped_input.gbks
+                .filter {
+                    meta, file ->
+                        if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
+                        !file.isEmpty()
+                }
+        )
+        ch_versions = ch_versions.mix(BGC.out.versions)
+    }

    CUSTOM_DUMPSOFTWAREVERSIONS (
        ch_versions.unique().collectFile(name: 'collated_versions.yml')

From 28a50eee76c055ce16f782f9df208765af64f0b5 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 14 Feb 2024 12:03:03 +0100
Subject: [PATCH 12/45] Add test profile and docs

---
 conf/test_preannotated.config     | 32 +++++++++++++++++++++++++++++++
 conf/test_preannotated_bgc.config | 31 ++++++++++++++++++++++++++++++
 docs/usage.md                     | 32 ++++++++++++++++++++-----------
 nextflow.config                   | 11 ++++++-----
 workflows/funcscan.nf             |  7 +++++--
 5 files changed, 95 insertions(+), 18 deletions(-)
 create mode 100644 conf/test_preannotated.config
 create mode 100644 conf/test_preannotated_bgc.config

diff --git a/conf/test_preannotated.config b/conf/test_preannotated.config
new file mode 100644
index 00000000..e536cfd5
--- /dev/null
+++ b/conf/test_preannotated.config
@@ -0,0 +1,32 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/funcscan -profile test_preannotated,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile - preannotated input'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv'
+    amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm'
+
+    annotation_tool = 'prodigal'
+
+    run_arg_screening    = true
+    arg_fargene_hmmmodel = 'class_a,class_b_1_2'
+
+    run_amp_screening = true
+}

diff --git a/conf/test_preannotated_bgc.config b/conf/test_preannotated_bgc.config
new file mode 100644
index 00000000..29a56281
--- /dev/null
+++ b/conf/test_preannotated_bgc.config
@@ -0,0 +1,31 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test. 
+
+    Use as follows:
+        nextflow run nf-core/funcscan -profile test_preannotated_bgc,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'BGC test profile - preannotated input BGC'
+    config_profile_description = 'Minimal test dataset to check BGC workflow function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input                = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv'
+    bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm'
+
+    annotation_tool = 'prodigal'
+
+    run_arg_screening = false
+    run_amp_screening = false
+    run_bgc_screening = true
+}

diff --git a/docs/usage.md b/docs/usage.md
index 6af8a294..5311910c 100644
--- a/docs/usage.md
+++ b/docs/usage.md
@@ -51,25 +51,35 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s
 --input '[path to samplesheet file]'
 ```

-The input samplesheet has to be a comma-separated file (`.csv`) with 2 columns (`sample`, and `fasta`), and a header row as shown in the examples below.
+The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample` and `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `feature`), and a header row as shown in the examples below.
+
+If you already have annotated contigs, you can supply these to the pipeline using the optional `protein` and `feature` columns. If these two columns are supplied, pipeline annotation will be skipped for the corresponding FASTA file, and the supplied annotation files used instead.
+
+For two columns:
+
+```bash
+sample,fasta
+sample_1,///wastewater_metagenome_contigs_1.fasta.gz
+sample_2,///wastewater_metagenome_contigs_2.fasta.gz
+```
+
+For four columns:

 ```bash
 sample,fasta,protein,feature
-sample_1,///wastewater_metagenome_contigs_1.fasta.gz,,
-sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,
+sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gff
+sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk
 ```

-| Column    | Description                                                                                                                                                 |
-| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fasta`   | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`.    |
-| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`. Leave empty if not available.                   |
-| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotations information of `fasta`. Leave empty if not available.           |
+| Column    | Description                                                                                                                                                                               |
+| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `sample`  | Custom sample name. 
This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | +| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -If you already have annotated contigs, you can supply these to the pipeline using optional `protein` and `feature` columns. If either of the two columns are supplied, pipeline annotation will not be performed for the corresponding FASTA file. - > ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters. ## Notes on screening tools diff --git a/nextflow.config b/nextflow.config index b15a01df..de7dbc5b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -328,11 +328,12 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_bgc { includeConfig 'conf/test_bgc.config' } - test_full { includeConfig 'conf/test_full.config' } - test_deeparg { includeConfig 'conf/test_deeparg.config' } - test_nothing { includeConfig 'conf/test_nothing.config' } + test { includeConfig 'conf/test.config' } + test_bgc { includeConfig 'conf/test_bgc.config' } + test_full { includeConfig 'conf/test_full.config' } + test_deeparg { includeConfig 'conf/test_deeparg.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_preannotated { includeConfig 'conf/test_preannotated.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index aa321015..da43008a 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -323,8 +323,11 @@ workflow FUNCSCAN { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - if(['prokka','bakta'].contains(params.annotation_tool)){ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files.collect{it[1]}.ifEmpty([])) } - + if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { + if( ['prokka','bakta'].contains(params.annotation_tool) ){ + ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files.collect{it[1]}.ifEmpty([])) + } + } MULTIQC ( ch_multiqc_files.collect(), From 9caae3ebab61471e4c1803ba1ae770617df17c81 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates"
Date: Wed, 14 Feb 2024 12:06:08 +0100
Subject: [PATCH 13/45] Include preannotated files in one of the CI runs

---
 .github/workflows/ci.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1f2c2667..1b29ca0a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,9 +27,9 @@ jobs:
           - "23.04.0"
           - "latest-everything"
         parameters:
-          - "--annotation_tool prodigal"
-          - "--annotation_tool prokka"
-          - "--annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions
+          - "--profile docker,test_preannotated --annotation_tool prodigal"
+          - "--profile docker,test --annotation_tool prokka"
+          - "--profile docker,test --annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions

     steps:
       - name: Check out pipeline code
@@ -42,7 +42,7 @@ jobs:

       - name: Run pipeline with test data (AMP and ARG workflows)
         run: |
-          nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results ${{ matrix.parameters }}
+          nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results

  test_bgc:
    name: Run pipeline with test data (BGC workflow)

From 23e3929a9636d66a40c2d0e60813984828a5bf53 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 14 Feb 2024 12:08:34 +0100
Subject: [PATCH 14/45] Fix prettier linting

---
 assets/multiqc_config.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index bd41658b..4926cda5 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -14,8 +14,6 @@ run_modules:
   - prokka
   - custom_content

-prokka_fn_snames: True
-
 table_columns_visible:
   Prokka:
     organism: False

From 663a7e9a64ae7b4126e940e70a664300b8a8d0a6 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 14 Feb 2024 12:09:36 +0100
Subject: [PATCH 15/45] Fix ci command

---
 .github/workflows/ci.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1b29ca0a..f2b1e73d 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -27,9 +27,9 @@ jobs:
           - "23.04.0"
           - "latest-everything"
         parameters:
-          - "--profile docker,test_preannotated --annotation_tool prodigal"
-          - "--profile docker,test --annotation_tool prokka"
-          - "--profile docker,test --annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions
+          - "-profile docker,test_preannotated --annotation_tool prodigal"
+          - "-profile docker,test --annotation_tool prokka"
+          - "-profile docker,test --annotation_tool bakta --annotation_bakta_db_downloadtype light --arg_skip_deeparg --arg_skip_amrfinderplus" # Skip deeparg and amrfinderplus due to otherwise running out of space on GitHub Actions

     steps:
       - name: Check out pipeline code

From 6177c900e3d5217a69f2180f5aad886320fdafdf Mon Sep 17 00:00:00 2001
From: "James A. 
Fellows Yates" Date: Wed, 14 Feb 2024 12:11:44 +0100 Subject: [PATCH 16/45] Fix BAKTA to multiqc channel name --- subworkflows/local/annotation.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 3d6daa6a..40e93672 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -80,7 +80,7 @@ workflow ANNOTATION { BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) - ch_multiqc_files = BAKTA.out.txt + ch_multiqc_files = BAKTA_BAKTA.out.txt ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna From bf0f5721ac00dd918ce96ce87ef3951887684a4e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 14 Feb 2024 12:16:21 +0100 Subject: [PATCH 17/45] Add a preannotated test to BGC workflows --- .github/workflows/ci.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2b1e73d..905c0d55 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ concurrency: jobs: test: - name: Run pipeline with test data (AMP and ARG workflows) + name: Run pipeline with test data (AMP/ARG) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -45,7 +45,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results test_bgc: - name: Run pipeline with test data (BGC workflow) + name: Run pipeline with test data (BGC) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -55,9 +55,9 @@ jobs: - "23.04.0" - "latest-everything" parameters: - - "--annotation_tool prodigal" - - "--annotation_tool prokka" - - "--annotation_tool bakta --annotation_bakta_db_downloadtype light" + - "-profile docker,test_preannotated_bgc --annotation_tool prodigal" + - "-profile docker,test_bgc --annotation_tool prokka" + - "-profile docker,test_bgc --annotation_tool bakta --annotation_bakta_db_downloadtype light" steps: - name: Check out pipeline code @@ -70,4 +70,4 @@ jobs: - name: Run pipeline with test data (BGC workflow) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results --bgc_skip_deepbgc From ea555ab1f7993982794b9da4a6944be16275177e Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates"
Date: Wed, 14 Feb 2024 12:31:36 +0100
Subject: [PATCH 18/45] Make preannotated bgc config accessible

---
 nextflow.config | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/nextflow.config b/nextflow.config
index de7dbc5b..097fe691 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -328,12 +328,13 @@ profiles {
         executor.cpus   = 4
         executor.memory = 8.GB
     }
-    test              { includeConfig 'conf/test.config' }
-    test_bgc          { includeConfig 'conf/test_bgc.config' }
-    test_full         { includeConfig 'conf/test_full.config' }
-    test_deeparg      { includeConfig 'conf/test_deeparg.config' }
-    test_nothing      { includeConfig 'conf/test_nothing.config' }
-    test_preannotated { includeConfig 'conf/test_preannotated.config' }
+    test                  { includeConfig 'conf/test.config' }
+    test_bgc              { includeConfig 'conf/test_bgc.config' }
+    test_full             { includeConfig 'conf/test_full.config' }
+    test_deeparg          { includeConfig 'conf/test_deeparg.config' }
+    test_nothing          { includeConfig 'conf/test_nothing.config' }
+    test_preannotated     { includeConfig 'conf/test_preannotated.config' }
+    test_preannotated_bgc { includeConfig 'conf/test_preannotated_bgc.config' }
 }

 // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile

From 80c5b0c69f4ad316c4fdf01071ee4d365a4f5ac2 Mon Sep 17 00:00:00 2001
From: "James A. Fellows Yates"
Date: Wed, 14 Feb 2024 15:11:25 +0100
Subject: [PATCH 19/45] Install newer version of antismash to see if it'll work
 with the GFF version (as reported on antiSMASH github)

---
 modules.json                                       |  4 +--
 .../antismash/antismashlite/environment.yml        |  2 +-
 .../nf-core/antismash/antismashlite/main.nf        | 36 ++++++++++++++++---
 .../nf-core/antismash/antismashlite/meta.yml       |  2 +-
 .../environment.yml                                |  2 +-
 .../antismashlitedownloaddatabases/main.nf         | 15 ++++----
 6 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/modules.json b/modules.json
index 71a31800..b0415668 100644
--- a/modules.json
+++ b/modules.json
@@ -37,12 +37,12 @@
         },
         "antismash/antismashlite": {
             "branch": "master",
-            "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+            "git_sha": "1c78323903f07d62bb57686914b567fb2018b1e4",
             "installed_by": ["modules"]
         },
         "antismash/antismashlitedownloaddatabases": {
             "branch": "master",
-            "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+            "git_sha": "6c7bac5e7c4c00c43803df47e7db5d0c344b7ac4",
             "installed_by": ["modules"]
         },
         "bakta/bakta": {

diff --git a/modules/nf-core/antismash/antismashlite/environment.yml b/modules/nf-core/antismash/antismashlite/environment.yml
index 76de944b..227b5264 100644
--- a/modules/nf-core/antismash/antismashlite/environment.yml
+++ b/modules/nf-core/antismash/antismashlite/environment.yml
@@ -4,4 +4,4 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - bioconda::antismash-lite=6.1.1
+  - bioconda::antismash-lite=7.1.0

diff --git a/modules/nf-core/antismash/antismashlite/main.nf b/modules/nf-core/antismash/antismashlite/main.nf
index f0a890c4..b306e75d 100644
--- a/modules/nf-core/antismash/antismashlite/main.nf
+++ b/modules/nf-core/antismash/antismashlite/main.nf
@@ -4,14 +4,14 @@ process ANTISMASH_ANTISMASHLITE {

     conda "${moduleDir}/environment.yml"
     container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/antismash-lite:6.1.1--pyhdfd78af_0' : - 'biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/antismash-lite:7.1.0--pyhdfd78af_0' : + 'biocontainers/antismash-lite:7.1.0--pyhdfd78af_0' }" containerOptions { workflow.containerEngine == 'singularity' ? - "-B $antismash_dir:/usr/local/lib/python3.8/site-packages/antismash" : + "-B $antismash_dir:/usr/local/lib/python3.10/site-packages/antismash" : workflow.containerEngine == 'docker' ? - "-v \$PWD/$antismash_dir:/usr/local/lib/python3.8/site-packages/antismash" : + "-v \$PWD/$antismash_dir:/usr/local/lib/python3.10/site-packages/antismash" : '' } @@ -64,7 +64,33 @@ process ANTISMASH_ANTISMASHLITE { cat <<-END_VERSIONS > versions.yml "${task.process}": - antismash-lite: \$(antismash --version | sed 's/antiSMASH //') + antismash-lite: \$(echo \$(antismash --version) | sed 's/antiSMASH //') + END_VERSIONS + """ + + stub: + prefix = task.ext.suffix ? "${meta.id}${task.ext.suffix}" : "${meta.id}" + def VERSION = '7.1.0' // WARN: Version information not provided by tool during stub run. Please update this string when bumping container versions. + """ + mkdir -p ${prefix}/css + mkdir ${prefix}/images + mkdir ${prefix}/js + touch ${prefix}/NZ_CP069563.1.region001.gbk + touch ${prefix}/NZ_CP069563.1.region002.gbk + touch ${prefix}/css/bacteria.css + touch ${prefix}/genome.gbk + touch ${prefix}/genome.json + touch ${prefix}/genome.zip + touch ${prefix}/images/about.svg + touch ${prefix}/index.html + touch ${prefix}/js/antismash.js + touch ${prefix}/js/jquery.js + touch ${prefix}/regions.js + touch ${prefix}/test.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + antismash-lite: $VERSION END_VERSIONS """ } diff --git a/modules/nf-core/antismash/antismashlite/meta.yml b/modules/nf-core/antismash/antismashlite/meta.yml index 2c9ca7f1..21f506bd 100644 --- a/modules/nf-core/antismash/antismashlite/meta.yml +++ b/modules/nf-core/antismash/antismashlite/meta.yml @@ -22,7 +22,7 @@ tools: documentation: "https://docs.antismash.secondarymetabolites.org" tool_dev_url: "https://github.com/antismash/antismash" doi: "10.1093/nar/gkab335" - licence: "['AGPL v3']" + licence: ["AGPL v3"] input: - meta: type: map diff --git a/modules/nf-core/antismash/antismashlitedownloaddatabases/environment.yml b/modules/nf-core/antismash/antismashlitedownloaddatabases/environment.yml index e6288d56..b9323a93 100644 --- a/modules/nf-core/antismash/antismashlitedownloaddatabases/environment.yml +++ b/modules/nf-core/antismash/antismashlitedownloaddatabases/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::antismash-lite=6.1.1 + - bioconda::antismash-lite=7.1.0 diff --git a/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf b/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf index dec16bb3..e63f20d2 100644 --- a/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf +++ b/modules/nf-core/antismash/antismashlitedownloaddatabases/main.nf @@ -3,8 +3,8 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/antismash-lite:6.1.1--pyhdfd78af_0' : - 'biocontainers/antismash-lite:6.1.1--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/antismash-lite:7.1.0--pyhdfd78af_0' : + 'biocontainers/antismash-lite:7.1.0--pyhdfd78af_0' }" /* These files are normally downloaded/created by download-antismash-databases itself, and must be retrieved for input by manually running the command with conda or a standalone installation of antiSMASH. Therefore we do not recommend using this module for production pipelines, but rather require users to specify their own local copy of the antiSMASH database in pipelines. This is solely for use for CI tests of the nf-core/module version of antiSMASH. @@ -14,9 +14,9 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES { containerOptions { workflow.containerEngine == 'singularity' ? - "-B $database_css:/usr/local/lib/python3.8/site-packages/antismash/outputs/html/css,$database_detection:/usr/local/lib/python3.8/site-packages/antismash/detection,$database_modules:/usr/local/lib/python3.8/site-packages/antismash/modules" : + "-B $database_css:/usr/local/lib/python3.10/site-packages/antismash/outputs/html/css,$database_detection:/usr/local/lib/python3.10/site-packages/antismash/detection,$database_modules:/usr/local/lib/python3.10/site-packages/antismash/modules" : workflow.containerEngine == 'docker' ? - "-v \$PWD/$database_css:/usr/local/lib/python3.8/site-packages/antismash/outputs/html/css -v \$PWD/$database_detection:/usr/local/lib/python3.8/site-packages/antismash/detection -v \$PWD/$database_modules:/usr/local/lib/python3.8/site-packages/antismash/modules" : + "-v \$PWD/$database_css:/usr/local/lib/python3.10/site-packages/antismash/outputs/html/css -v \$PWD/$database_detection:/usr/local/lib/python3.10/site-packages/antismash/detection -v \$PWD/$database_modules:/usr/local/lib/python3.10/site-packages/antismash/modules" : '' } @@ -35,7 +35,7 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES { script: def args = task.ext.args ?: '' - cp_cmd = ( session.config.conda && session.config.conda.enabled ) ? "cp -r \$(python -c 'import antismash;print(antismash.__file__.split(\"/__\")[0])') antismash_dir;" : "cp -r /usr/local/lib/python3.8/site-packages/antismash antismash_dir;" + cp_cmd = ( session.config.conda && session.config.conda.enabled ) ? "cp -r \$(python -c 'import antismash;print(antismash.__file__.split(\"/__\")[0])') antismash_dir;" : "cp -r /usr/local/lib/python3.10/site-packages/antismash antismash_dir;" """ download-antismash-databases \\ --database-dir antismash_db \\ @@ -51,7 +51,8 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES { stub: def args = task.ext.args ?: '' - cp_cmd = (session.config.conda && session.config.conda.enabled ) ? "cp -r \$(python -c 'import antismash;print(antismash.__file__.split(\"/__\")[0])') antismash_dir;" : "cp -r /usr/local/lib/python3.8/site-packages/antismash antismash_dir;" + cp_cmd = (session.config.conda && session.config.conda.enabled ) ? "cp -r \$(python -c 'import antismash;print(antismash.__file__.split(\"/__\")[0])') antismash_dir;" : "cp -r /usr/local/lib/python3.10/site-packages/antismash antismash_dir;" + def VERSION = '7.1.0' // WARN: Version information not provided by tool during stub run. Please update this string when bumping container versions. 
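For readers unfamiliar with the `stub:` blocks added in these modules: they let `nextflow run ... -stub-run` exercise the channel wiring without performing heavy work such as the multi-gigabyte antiSMASH database download. A minimal sketch of the convention follows; the `download-toy-db` command and version string are hypothetical, not a real tool:

```nextflow
// Minimal sketch of the nf-core stub convention: the stub fakes the declared
// outputs so `-stub-run` completes instantly, while the real script does the work.
process TOYDB_DOWNLOAD {
    output:
    path "toy_db"      , emit: database
    path "versions.yml", emit: versions

    script:
    """
    download-toy-db --database-dir toy_db

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        toydb: \$(download-toy-db --version)
    END_VERSIONS
    """

    stub:
    def VERSION = '1.0.0' // hypothetical pinned version, mirroring the WARN pattern above
    """
    mkdir toy_db

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        toydb: $VERSION
    END_VERSIONS
    """
}

workflow {
    TOYDB_DOWNLOAD()
}
```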
""" echo "download-antismash-databases --database-dir antismash_db $args" @@ -62,7 +63,7 @@ process ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES { cat <<-END_VERSIONS > versions.yml "${task.process}": - antismash-lite: \$(antismash --version | sed 's/antiSMASH //') + antismash-lite: $VERSION END_VERSIONS """ } From 106b76eb06dd94fae715b20df24ecaf96488ce39 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 14 Feb 2024 15:26:00 +0100 Subject: [PATCH 20/45] Use correct dummy files --- subworkflows/local/bgc.nf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 2e96758a..2625d678 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -48,9 +48,9 @@ workflow BGC { } else { // May need to update on each new version of antismash-lite due to changes to scripts inside these tars - ch_css_for_antismash = "https://github.com/nf-core/test-datasets/raw/91bb8781c576967e23d2c5315dd4d43213575033/data/delete_me/antismash/css.tar.gz" - ch_detection_for_antismash = "https://github.com/nf-core/test-datasets/raw/91bb8781c576967e23d2c5315dd4d43213575033/data/delete_me/antismash/detection.tar.gz" - ch_modules_for_antismash = "https://github.com/nf-core/test-datasets/raw/91bb8781c576967e23d2c5315dd4d43213575033/data/delete_me/antismash/modules.tar.gz" + ch_css_for_antismash = "https://github.com/nf-core/test-datasets/raw/59ddeb5929f89ddddaff292d67f9025812762b87/data/delete_me/antismash/css.tar.gz" + ch_detection_for_antismash = "https://github.com/nf-core/test-datasets/raw/59ddeb5929f89ddddaff292d67f9025812762b87/data/delete_me/antismash/detection.tar.gz" + ch_modules_for_antismash = "https://github.com/nf-core/test-datasets/raw/59ddeb5929f89ddddaff292d67f9025812762b87/data/delete_me/antismash/modules.tar.gz" UNTAR_CSS ( [ [], ch_css_for_antismash ] ) ch_versions = ch_versions.mix(UNTAR_CSS.out.versions) From c25cab1486f7cc6cc3d751d35c7cacd00b8dc19a Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 14 Feb 2024 21:40:02 +0100 Subject: [PATCH 21/45] Add warning about Prokka GBK/GFF --- CHANGELOG.md | 11 ++++++----- docs/usage.md | 41 ++++++++++++++++++++++++++++++++--------- workflows/funcscan.nf | 6 ++++++ 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8147b46..5dd7ced3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,11 +16,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Dependencies` -| Tool | Previous version | New version | -| ------- | ---------------- | ----------- | -| DeepARG | 1.0.2 | 1.0.4 | -| DeepBGC | 0.1.30 | 0.1.31 | -| MultiQC | 1.15 | 1.19 | +| Tool | Previous version | New version | +| --------- | ---------------- | ----------- | +| antiSMASH | 6.1.1 | 7.1.0 | +| DeepARG | 1.0.2 | 1.0.4 | +| DeepBGC | 0.1.30 | 0.1.31 | +| MultiQC | 1.15 | 1.19 | ### `Deprecated` diff --git a/docs/usage.md b/docs/usage.md index 5311910c..ceb3f4e4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -80,7 +80,9 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -> ⚠️ We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters. 
+:::warning
+We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters.
+:::

 ## Notes on screening tools

@@ -92,11 +94,19 @@ antiSMASH has a minimum contig parameter, in which only contigs of a certain len

 To prevent entire pipeline failures due to a single 'bad sample', nf-core/funcscan will filter out any input sample in which none of the contigs reach the minimum contig length in bp specified with `--bgc_antismash_sampleminlength` (default: 1000).

-> ⚠️ If a sample does not reach this contig length threshold, you will receive a warning in your console and in the `.nextflow.log` file, and no result files will exist for this sample in your results directory for this tool.
+:::warning
+If a sample does not reach this contig length threshold, you will receive a warning in your console and in the `.nextflow.log` file, and no result files will exist for this sample in your results directory for this tool.
+:::

 When the annotation is run with Prokka, the resulting `.gbk` file passed to antiSMASH may produce the error `translation longer than location allows` and end the pipeline run. This Prokka bug has been reported before (see [discussion on GitHub](https://github.com/antismash/antismash/discussions/450)) and is not likely to be fixed soon.

-> ⚠️ If antiSMASH is run for BGC detection, we recommend to **not** run Prokka for annotation but instead use the default annotation tool (Pyrodigal) or switch to Prodigal, or (for bacteria only!) Bakta.
+:::warning
+Prokka-generated GFF files [appear to be incompatible with antiSMASH](https://github.com/antismash/antismash/issues/364), and will likely fail! We recommend running or supplying Prodigal or Pyrodigal annotations instead.
+:::
+
+:::warning
+If antiSMASH is run for BGC detection, we recommend to **not** run Prokka for annotation but instead use the default annotation tool (Pyrodigal) or switch to Prodigal, or (for bacteria only!) Bakta.
+:::

 ## Databases and reference files

@@ -106,7 +116,10 @@ nf-core/funcscan offers the functionality to auto-download databases for you, an

 We **highly recommend** allowing the pipeline to download these databases for you on a first run, saving these to your results directory with `--save_databases`, then moving these to a different location (in case you wish to delete the results directory of this first run). An exception to this is HMM files where no auto-downloading functionality is possible.

-> ⚠️ We generally do not recommend downloading the databases yourself, as this can often be non-trivial to do!
+:::warning
+We generally do not recommend downloading the databases yourself, as this can often be non-trivial to do!
+:::

 As a reference, we will describe below where and how you can obtain databases and reference files used for tools included in the pipeline.

@@ -128,7 +141,9 @@ And then passed to the pipeline with:

 --annotation_bakta_db_localpath ///db/
 ```

-> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future.
+:::info
+The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. 
+::: ### hmmsearch @@ -186,7 +201,9 @@ The downloaded database folder contains the AMR related files: 2. Supply the database directory path to the pipeline as described above. -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### DeepARG @@ -215,7 +232,9 @@ You can then supply the path to resulting database directory with: Note that if you supply your own database that is not downloaded by the pipeline, make sure to also supply `--arg_deeparg_data_version` along with the version number so hAMRonization will correctly display the database version in the summary report. -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: ### antiSMASH @@ -238,9 +257,13 @@ To supply the database directories to the pipeline: Note that the names of the supplied folders must differ from each other (e.g. `antismash_db` and `antismash_dir`). If they are not provided, the databases will be auto-downloaded upon each BGC screening run of the pipeline. -> ℹ️ The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +:::info +The flag `--save_databases` saves the pipeline-downloaded databases in your results directory. You can then move these to a central cache directory of your choice for re-use in the future. +::: -> ℹ️ If installing with conda, the installation directory will be `lib/python3.8/site-packages/antismash` from the base directory of your conda install or conda environment directory. +:::info +If installing with conda, the installation directory will be `lib/python3.8/site-packages/antismash` from the base directory of your conda install or conda environment directory. +::: ### DeepBGC diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index da43008a..ab69738d 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -137,6 +137,12 @@ workflow FUNCSCAN { def faa_found = files.find{it.toString().endsWith('.faa')} def feature_found = files.find{it.toString().tokenize('.').last().matches('gff|gbk')} + // https://github.com/antismash/antismash/issues/364 + if ( params.run_bgc_screening && !params.bgc_skip_antismash && feature_found != null ) { + log.warn("[nf-core/funcscan] antiSMASH screening requested and pre-annotated files given.") + log.warn("Be aware that Prokka generated GFF or GBK files will likely fail with antiSMASH!") + log.warn("See usage docs. File: " + feature_found.name) } + def fasta = fasta_found != null ? fasta_found : [] def faa = faa_found != null ? faa_found : [] def feature = feature_found != null ? 
feature_found : [] From f9f808dca3e6e77049ff3f8c838fc17d22962eb7 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Thu, 4 Apr 2024 16:57:44 +0200 Subject: [PATCH 22/45] Wrapping my head around it --- .github/workflows/ci.yml | 12 ++--- CHANGELOG.md | 1 + subworkflows/local/amp.nf | 34 ++++++------ subworkflows/local/annotation.nf | 64 +++++++++++----------- subworkflows/local/arg.nf | 14 ++--- subworkflows/local/bgc.nf | 89 ++++++++++++++++-------------- subworkflows/local/taxa_class.nf | 4 +- workflows/funcscan.nf | 92 +++++++++++++++++--------------- 8 files changed, 164 insertions(+), 146 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 677735e5..a7385b6f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -76,10 +76,10 @@ jobs: - name: Run pipeline with test data (BGC workflow) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_bgc,docker --outdir ./results ${{ matrix.parameters }} --bgc_skip_deepbgc + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results --bgc_skip_deepbgc test_taxonomy: - name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows) + name: Run pipeline with test data (AMP, ARG and BGC taxonomy) # Only run on push if this is the nf-core dev branch (merged PRs) if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/funcscan') }}" runs-on: ubuntu-latest @@ -89,9 +89,9 @@ jobs: - "23.04.0" - "latest-everything" parameters: - - "--annotation_tool prodigal" - - "--annotation_tool prokka" - - "--annotation_tool bakta --annotation_bakta_db_downloadtype light" + - "-profile docker,test_taxonomy --annotation_tool prodigal" # TODO: Add test_taxonomy_preannotated.config + - "-profile docker,test_taxonomy --annotation_tool prokka" + - "-profile docker,test_taxonomy --annotation_tool bakta --annotation_bakta_db_downloadtype light" steps: - name: Check out pipeline code @@ -107,4 +107,4 @@ jobs: - name: Run pipeline with test data (AMP, ARG and BGC taxonomy workflows) run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_taxonomy,docker --outdir ./results ${{ matrix.parameters }} + nextflow run ${GITHUB_WORKSPACE} ${{ matrix.parameters }} --outdir ./results diff --git a/CHANGELOG.md b/CHANGELOG.md index 41f396ab..559f4fe3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#332](https://github.com/nf-core/funcscan/pull/332) & [#327](https://github.com/nf-core/funcscan/pull/327) Merged pipeline template of nf-core/tools version 2.12.1 (by @jfy133, @jasmezz) - [#338](https://github.com/nf-core/funcscan/pull/338) Set `--meta` parameter to default for Bakta, with singlemode optional. (by @jasmezz) - [#343](https://github.com/nf-core/funcscan/pull/343) Added contig taxonomic classification using [MMseqs2](https://github.com/soedinglab/MMseqs2/). (by @darcy220606) +- [#340](https://github.com/nf-core/funcscan/pull/340) Added support for supplying pre-annotated sequences to the pipeline. (by @jfy133, @jasmezz) ### `Fixed` diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index b2cb5573..7489f73d 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -14,9 +14,9 @@ include { MERGE_TAXONOMY_AMPCOMBI } from '.. 
workflow AMP { take: - contigs // tuple val(meta), path(contigs) - faa // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + fastas // tuple val(meta), path(contigs) + faas // tuple val(meta), path(PROKKA/PRODIGAL.out.faa) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -27,11 +27,10 @@ workflow AMP { // When adding new tool that requires FAA, make sure to update conditions // in funcscan.nf around annotation and AMP subworkflow execution // to ensure annotation is executed! - - ch_faa_for_amplify = faa - ch_faa_for_amp_hmmsearch = faa - ch_faa_for_ampir = faa - ch_faa_for_ampcombi = faa + ch_faa_for_amplify = faas + ch_faa_for_amp_hmmsearch = faas + ch_faa_for_ampir = faas + ch_faa_for_ampcombi = faas // AMPLIFY if ( !params.amp_skip_amplify ) { @@ -42,7 +41,7 @@ workflow AMP { // MACREL if ( !params.amp_skip_macrel ) { - MACREL_CONTIGS ( contigs ) + MACREL_CONTIGS ( fastas ) ch_versions = ch_versions.mix( MACREL_CONTIGS.out.versions ) GUNZIP_MACREL_PRED ( MACREL_CONTIGS.out.amp_prediction ) GUNZIP_MACREL_ORFS ( MACREL_CONTIGS.out.all_orfs ) @@ -72,14 +71,15 @@ workflow AMP { [ meta, file ] } - ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch.combine( ch_amp_hmm_models_meta ) - .map { - meta_faa, faa, meta_hmm, hmm -> - def meta_new = [:] - meta_new['id'] = meta_faa['id'] - meta_new['hmm_id'] = meta_hmm['id'] - [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] - } + ch_in_for_amp_hmmsearch = ch_faa_for_amp_hmmsearch + .combine( ch_amp_hmm_models_meta ) + .map { + meta_faa, faa, meta_hmm, hmm -> + def meta_new = [:] + meta_new['id'] = meta_faa['id'] + meta_new['hmm_id'] = meta_hmm['id'] + [ meta_new, hmm, faa, params.amp_hmmsearch_savealignments, params.amp_hmmsearch_savetargets, params.amp_hmmsearch_savedomains ] + } AMP_HMMER_HMMSEARCH ( ch_in_for_amp_hmmsearch ) ch_versions = ch_versions.mix( AMP_HMMER_HMMSEARCH.out.versions ) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 40e93672..27c0bfba 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -30,40 +30,40 @@ workflow ANNOTATION { GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta) GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) - ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) - ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive + ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) + ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip + ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip + ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive if ( params.save_annotations == true ) { PRODIGAL_GBK ( fasta, "gbk" ) - ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) - ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. 
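The `combine`/`map` pairing used for hmmsearch in the AMP subworkflow above (pairing every FAA with every HMM model and merging their meta maps) can be tried in isolation. A minimal standalone sketch with dummy tuples, omitting the trailing hmmsearch save flags:

```nextflow
// Minimal sketch of the combine + meta-merge pattern from the AMP subworkflow:
// each protein FASTA is paired with each HMM model (cartesian product), and a
// fresh meta map records both the sample id and the model id. Dummy values only.
workflow {
    ch_faa = Channel.of( [ [id:'sample_1'], 'sample_1.faa' ] )
    ch_hmm = Channel.of( [ [id:'modelA'], 'modelA.hmm' ], [ [id:'modelB'], 'modelB.hmm' ] )

    ch_faa
        .combine( ch_hmm )                        // [ meta_faa, faa, meta_hmm, hmm ]
        .map { meta_faa, faa, meta_hmm, hmm ->
            def meta_new = [:]
            meta_new['id']     = meta_faa['id']
            meta_new['hmm_id'] = meta_hmm['id']
            [ meta_new, hmm, faa ]
        }
        .view()                                   // one emission per sample-model pair
}
```

Carrying `hmm_id` in the meta map keeps output file names unique when the same sample is searched against several models.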
+ ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) + ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. } } else if ( params.annotation_tool == "pyrodigal" ) { - PYRODIGAL ( fasta ) + PYRODIGAL ( fasta, "gbk" ) GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) - GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.gff ) - ch_versions = ch_versions.mix(PYRODIGAL.out.versions) - ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK + GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.annotations ) + ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip + ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip + ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK } else if ( params.annotation_tool == "prokka" ) { PROKKA ( fasta, [], [] ) - ch_versions = ch_versions.mix(PROKKA.out.versions) - ch_multiqc_files = PROKKA.out.txt + ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_multiqc_files = PROKKA.out.txt - ch_annotation_faa = PROKKA.out.faa - ch_annotation_fna = PROKKA.out.fna - ch_annotation_gff = PROKKA.out.gff - ch_annotation_gbk = PROKKA.out.gbk + ch_annotation_faa = PROKKA.out.faa + ch_annotation_fna = PROKKA.out.fna + ch_annotation_gff = PROKKA.out.gff + ch_annotation_gbk = PROKKA.out.gbk } else if ( params.annotation_tool == "bakta" ) { @@ -79,20 +79,20 @@ workflow ANNOTATION { } BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) - ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) - ch_multiqc_files = BAKTA_BAKTA.out.txt + ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) + ch_multiqc_files = BAKTA_BAKTA.out.txt - ch_annotation_faa = BAKTA_BAKTA.out.faa - ch_annotation_fna = BAKTA_BAKTA.out.fna - ch_annotation_gff = BAKTA_BAKTA.out.gff - ch_annotation_gbk = BAKTA_BAKTA.out.gbff + ch_annotation_faa = BAKTA_BAKTA.out.faa + ch_annotation_fna = BAKTA_BAKTA.out.fna + ch_annotation_gff = BAKTA_BAKTA.out.gff + ch_annotation_gbk = BAKTA_BAKTA.out.gbff } emit: - versions = ch_versions - multiqc_files = ch_multiqc_files - faa = ch_annotation_faa // [ [meta], path(faa) ] - fna = ch_annotation_fna // [ [meta], path(fna) ] - gff = ch_annotation_gff // [ [meta], path(gff) ] - gbk = ch_annotation_gbk // [ [meta], path(gbk) ] + versions = ch_versions + multiqc_files = ch_multiqc_files + faa = ch_annotation_faa // [ [meta], path(faa) ] + fna = ch_annotation_fna // [ [meta], path(fna) ] + gff = ch_annotation_gff // [ [meta], path(gff) ] + gbk = ch_annotation_gbk // [ [meta], path(gbk) ] } diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 416aa9e7..5b9d276b 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -22,9 +22,9 @@ include { MERGE_TAXONOMY_HAMRONIZATION } from '../../modules/local/merge_t workflow ARG { take: - contigs // tuple val(meta), path(contigs) + fastas // tuple val(meta), path(contigs) annotations - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -45,7 +45,7 @@ workflow ARG { } if ( !params.arg_skip_amrfinderplus ) { - AMRFINDERPLUS_RUN ( contigs, ch_amrfinderplus_db ) + AMRFINDERPLUS_RUN ( fastas, ch_amrfinderplus_db ) 
ch_versions = ch_versions.mix( AMRFINDERPLUS_RUN.out.versions ) // Reporting @@ -59,7 +59,7 @@ workflow ARG { ch_fargene_classes = Channel.fromList( params.arg_fargene_hmmmodel.tokenize(',') ) - ch_fargene_input = contigs + ch_fargene_input = fastas .combine( ch_fargene_classes ) .map { meta, fastas, hmm_class -> @@ -72,7 +72,7 @@ workflow ARG { hmmclass: it[2] } - FARGENE ( ch_fargene_input.contigs, ch_fargene_input.hmmclass ) + FARGENE ( ch_fargene_input.fastas, ch_fargene_input.hmmclass ) ch_versions = ch_versions.mix( FARGENE.out.versions ) // Reporting @@ -91,7 +91,7 @@ workflow ARG { RGI_CARDANNOTATION ( UNTAR.out.untar.map{ it[1] } ) ch_versions = ch_versions.mix( RGI_CARDANNOTATION.out.versions ) - RGI_MAIN ( contigs, RGI_CARDANNOTATION.out.db, [] ) + RGI_MAIN ( fastas, RGI_CARDANNOTATION.out.db, [] ) ch_versions = ch_versions.mix( RGI_MAIN.out.versions ) // Reporting @@ -138,7 +138,7 @@ workflow ARG { // ABRicate run if ( !params.arg_skip_abricate ) { - ABRICATE_RUN ( contigs ) + ABRICATE_RUN ( fastas ) ch_versions = ch_versions.mix( ABRICATE_RUN.out.versions ) HAMRONIZATION_ABRICATE ( ABRICATE_RUN.out.report, 'json', '1.0.1', '2021-Mar-27' ) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index afb6f5e7..203098dc 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -19,11 +19,11 @@ include { MERGE_TAXONOMY_COMBGC } from '../../modules/local/m workflow BGC { take: - fna // tuple val(meta), path(PREPPED_INPUT.out.fna) - gff // tuple val(meta), path(.out.gff) - faa // tuple val(meta), path(.out.faa) - gbk // tuple val(meta), path(.out.gbk) - tsv // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) + fastas // tuple val(meta), path(PREPPED_INPUT.out.fna) + faas // tuple val(meta), path(.out.faa) + gffs // tuple val(meta), path(.out.gff) + gbks // tuple val(meta), path(.out.gbk) + tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) main: ch_versions = Channel.empty() @@ -74,40 +74,49 @@ workflow BGC { // Exact input combination to antismash depends on whether gff (requires fna) or gbk (just gbk) necessary - ch_antismash_gff_input = fastas.join(gffs, by: 0) - .filter { - meta, fastas, gff -> - if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. Antismash will not be run for sample: ${meta.id}." - meta.longest_contig >= params.bgc_antismash_sampleminlength - } - .multiMap { - meta, fastas, gff -> - fastas: [ meta, fastas ] - gffs: [ gff ] - } - - ANTISMASH_GFF ( ch_antismash_gff_input.fastas, ch_antismash_databases, ch_antismash_directory, ch_antismash_gff_input.gffs ) - ch_versions = ch_versions.mix(ANTISMASH_GFF.out.versions) - - ch_antismash_gbk_input = gbks.filter { - meta, files -> - if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. Antismash will not be run for sample: ${meta.id}." 
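The fARGene fan-out above follows a common tokenize/combine/multiMap pattern: a comma-separated CLI parameter is expanded so each sample is screened once per HMM class. A standalone sketch with placeholder values (the class string mirrors the test profile default):

```nextflow
// Minimal sketch of the fARGene fan-out: one FASTA, two HMM classes,
// two screening jobs. Files and ids are dummies.
workflow {
    ch_fastas  = Channel.of( [ [id:'sample_1'], 'sample_1.fasta' ] )
    ch_classes = Channel.fromList( 'class_a,class_b_1_2'.tokenize(',') )

    ch_input = ch_fastas
        .combine( ch_classes )                    // [ meta, fasta, hmm_class ]
        .map { meta, fasta, hmm_class ->
            def meta_new = [:]
            meta_new['id'] = meta['id']
            [ meta_new, fasta, hmm_class ]
        }
        .multiMap {
            fastas:   [ it[0], it[1] ]            // channel of [ meta, fasta ]
            hmmclass: it[2]                       // parallel channel of class names
        }

    ch_input.fastas.view()
    ch_input.hmmclass.view()
}
```

The `multiMap` keeps the two channels index-aligned, so the process receives matching fasta/class pairs.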
- meta.longest_contig >= params.bgc_antismash_sampleminlength - } - - ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] ) - - } - - ch_versions = ch_versions.mix( ANTISMASH_ANTISMASHLITE.out.versions ) - ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir - .mix( ANTISMASH_ANTISMASHLITE.out.gbk_input ) - .groupTuple() - .map{ - meta, files -> - [meta, files.flatten()] - } - ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) + // TODO: if ( annotation input is "gff" ) do this: + ch_antismash_gff_input = fastas.join(gffs, by: 0) + .filter { + meta, fastas, gff -> + if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. AntiSMASH will not be run for sample: ${meta.id}." + meta.longest_contig >= params.bgc_antismash_sampleminlength + } + .multiMap { + meta, fastas, gff -> + fastas: [ meta, fastas ] + gffs: [ gff ] + } + + ANTISMASH_GFF ( ch_antismash_gff_input.fastas, ch_antismash_databases, ch_antismash_directory, ch_antismash_gff_input.gffs ) + ch_versions = ch_versions.mix(ANTISMASH_GFF.out.versions) + + // TODO: else if ( antismash input is "gbk") do this: + ch_antismash_gbk_input = gbks + .filter { + meta, files -> + if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. AntiSMASH will not be run for sample: ${meta.id}." + meta.longest_contig >= params.bgc_antismash_sampleminlength + } + + ANTISMASH_GBK ( ch_antismash_gbk_input, ch_antismash_databases, ch_antismash_directory, [] ) + ch_versions = ch_versions.mix(ANTISMASH_GBK.out.versions) + + // TODO: Fix below + // ch_antismashresults_for_combgc = ANTISMASH_GFF.out.knownclusterblast_dir.dump(tag: 'gff_cluster') + // .dump(tag: 'antismash_gff_knownclusterblast_dir') + // .mix(ANTISMASH_GFF.out.gbk_input.dump(tag: 'gff_input')) + // .dump(tag: 'antismash_gff_gbk_input') + // .mix(ANTISMASH_GBK.out.knownclusterblast_dir.dump(tag: 'gbk_cluster')) + // .dump(tag: 'antismash_gbk_knownclusterblast_dir') + // .mix(ANTISMASH_GBK.out.gbk_input.dump(tag: 'gbk_input')) + // .dump(tag: 'antismash_gbk_gbk_input') + // .groupTuple() + // .map{ + // meta, files -> + // [meta, files.flatten()] + // } + + // ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) } // DEEPBGC @@ -123,7 +132,7 @@ workflow BGC { ch_versions = ch_versions.mix( DEEPBGC_DOWNLOAD.out.versions ) } - DEEPBGC_PIPELINE ( fna, ch_deepbgc_database) + DEEPBGC_PIPELINE ( fastas, ch_deepbgc_database) ch_versions = ch_versions.mix( DEEPBGC_PIPELINE.out.versions ) ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( DEEPBGC_PIPELINE.out.bgc_tsv ) } diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index ec9f273a..8f3fb88d 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -9,7 +9,7 @@ include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' workflow TAXA_CLASS { take: - contigs // tuple val(meta), path(contigs) + fastas // tuple val(meta), path(contigs) main: ch_versions = Channel.empty() @@ -34,7 +34,7 @@ workflow TAXA_CLASS { // Create db for query contigs, assign taxonomy and convert to table 
format // MMSEQS_CREATEDB - MMSEQS_CREATEDB ( contigs ) + MMSEQS_CREATEDB ( fastas ) ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 4b79dcf6..bfc82491 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -30,9 +30,9 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? fil // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // include { ANNOTATION } from '../subworkflows/local/annotation' -include { AMP } from '../subworkflows/local/amp' -include { ARG } from '../subworkflows/local/arg' -include { BGC } from '../subworkflows/local/bgc' +include { AMP } from '../subworkflows/local/amp' +include { ARG } from '../subworkflows/local/arg' +include { BGC } from '../subworkflows/local/bgc' include { TAXA_CLASS } from '../subworkflows/local/taxa_class' /* @@ -110,7 +110,7 @@ workflow FUNCSCAN { // https://github.com/antismash/antismash/issues/364 if ( params.run_bgc_screening && !params.bgc_skip_antismash && feature_found != null ) { log.warn("[nf-core/funcscan] antiSMASH screening requested and pre-annotated files given.") - log.warn("Be aware that Prokka generated GFF or GBK files will likely fail with antiSMASH!") + log.warn("Be aware that Prokka-generated GFF or GBK files will likely fail with antiSMASH!") log.warn("See usage docs. File: " + feature_found.name) } def fasta = fasta_found != null ? fasta_found : [] @@ -142,25 +142,13 @@ workflow FUNCSCAN { [ meta + meta_new, fasta, faa, feature ] } - /* - TAXONOMIC CLASSIFICATION - */ - - // The final subworkflow reports need taxonomic classification. - // This can be either on NT or AA level depending on annotation. - // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. - if ( params.run_taxa_classification ) { - TAXA_CLASS ( ch_prepped_input ) - ch_versions = ch_versions.mix( TAXA_CLASS.out.versions ) - ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy - - } else { - - ch_mmseqs_db = Channel.empty() - ch_taxonomy_querydb = Channel.empty() - ch_taxonomy_querydb_taxdb = Channel.empty() - ch_taxonomy_tsv = Channel.empty() - } + // Separate pre-annotated FASTAs from those that need annotation + ch_input_for_annotation = ch_intermediate_input + .branch { + meta, fasta, protein, feature -> + preannotated: protein != [] + unannotated: true + } /* ANNOTATION @@ -214,6 +202,26 @@ workflow FUNCSCAN { gbks: [meta, gbk] } + + /* + TAXONOMIC CLASSIFICATION + */ + + // The final subworkflow reports need taxonomic classification. + // This can be either on NT or AA level depending on annotation. + // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. 
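The pre-annotated/unannotated split introduced above relies on Nextflow's `branch` operator, where the first matching criterion wins and `true` acts as the catch-all. A standalone sketch with dummy samplesheet rows:

```nextflow
// Minimal sketch of the branch pattern: rows that already carry a protein
// file skip annotation, the rest fall through to the default branch.
// Tuples are placeholders standing in for parsed samplesheet rows.
workflow {
    ch_rows = Channel.of(
        [ [id:'sample_1'], 'c1.fasta', [],       []       ],
        [ [id:'sample_2'], 'c2.fasta', 'c2.faa', 'c2.gbk' ]
    )

    ch_branched = ch_rows.branch { meta, fasta, protein, feature ->
        preannotated: protein != []
        unannotated: true                 // catch-all, evaluated last
    }

    ch_branched.preannotated.view { it -> "skip annotation: ${it[0].id}" }
    ch_branched.unannotated.view  { it -> "run annotation:  ${it[0].id}" }
}
```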
+ if ( params.run_taxa_classification ) {
+ TAXA_CLASS ( ch_prepped_input.fastas )
+ ch_versions = ch_versions.mix( TAXA_CLASS.out.versions )
+ ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy
+
+ } else {
+ ch_mmseqs_db = Channel.empty()
+ ch_taxonomy_querydb = Channel.empty()
+ ch_taxonomy_querydb_taxdb = Channel.empty()
+ ch_taxonomy_tsv = Channel.empty()
+ }
+
 ///////////////
 // SCREENING //
 ///////////////
@@ -236,17 +244,17 @@ workflow FUNCSCAN {
 ch_versions = ch_versions.mix(AMP.out.versions)
 } else if ( params.run_amp_screening && params.run_taxa_classification ) {
 AMP (
- ch_prepped_input,
- ch_annotation_faa
+ ch_prepped_input.fastas,
+ ch_prepped_input.faas
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+ if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
 !file.isEmpty()
 }
 )
@@ -259,7 +267,7 @@ workflow FUNCSCAN {
 if ( params.run_arg_screening && !params.run_taxa_classification ) {
 if ( params.arg_skip_deeparg ) {
 ARG (
- ch_prepped_input,
+ ch_prepped_input.fastas,
 [],
 ch_taxonomy_tsv
 )
@@ -279,28 +287,28 @@ workflow FUNCSCAN {
 } else if ( params.run_arg_screening && params.run_taxa_classification ) {
 if ( params.arg_skip_deeparg ) {
 ARG (
- ch_prepped_input,
+ ch_prepped_input.fastas,
 [],
 ch_taxonomy_tsv
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
 !file.isEmpty()
 }
 )
 } else {
 ARG (
- ch_prepped_input,
- ch_annotation_faa
+ ch_prepped_input.fastas,
+ ch_prepped_input.faas
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. 
Taxonomy merging will not be executed: ${meta.id}")
 !file.isEmpty()
 }
 )
@@ -313,23 +321,23 @@ workflow FUNCSCAN {
 */
 if ( params.run_bgc_screening && !params.run_taxa_classification ) {
 BGC (
- ch_prepped_input,
- ch_annotation_gff
+ ch_prepped_input.fastas,
+ ch_prepped_input.gffs
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
- ch_annotation_faa
+ ch_prepped_input.faas
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
- ch_annotation_gbk
+ ch_prepped_input.gbks
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Annotation of following sample produced produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
@@ -359,7 +367,7 @@ workflow FUNCSCAN {
 ch_taxonomy_tsv
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Taxonomy classification of the following sample produced an empty TSV file. Taxonomy merging will not be executed: ${meta.id}")
 !file.isEmpty()
 }
 )

From 9b483aca97492dd8a335c0be12936b36062febcb Mon Sep 17 00:00:00 2001
From: jasmezz
Date: Fri, 5 Apr 2024 16:08:02 +0200
Subject: [PATCH 23/45] Excluded GFF support, fixed multiqc report, updated variables etc.
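antiSMASH now only ever receives GBK input, so the separate GFF entry point and
its extra channel handling can be dropped. The input wiring this converges on is
roughly the following sketch (condensed from the bgc.nf hunk below, not verbatim
pipeline code):

```nextflow
// Keep only samples whose longest contig passes the antiSMASH length cut-off,
// warn for the rest, then run the single GBK-based antiSMASH module.
ch_antismash_input = gbks
    .filter { meta, files ->
        if ( meta.longest_contig < params.bgc_antismash_sampleminlength )
            log.warn "[nf-core/funcscan] Sample does not reach --bgc_antismash_sampleminlength. AntiSMASH will not be run for sample: ${meta.id}"
        meta.longest_contig >= params.bgc_antismash_sampleminlength
    }

ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] )
```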
--- subworkflows/local/annotation.nf | 39 +++++---------- subworkflows/local/bgc.nf | 56 ++++++--------------- workflows/funcscan.nf | 83 +++++++------------------------- 3 files changed, 44 insertions(+), 134 deletions(-) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 27c0bfba..7b00b2f6 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -3,18 +3,16 @@ */ include { PROKKA } from '../../modules/nf-core/prokka/main' -include { PRODIGAL as PRODIGAL_GFF } from '../../modules/nf-core/prodigal/main' -include { PRODIGAL as PRODIGAL_GBK } from '../../modules/nf-core/prodigal/main' +include { PRODIGAL } from '../../modules/nf-core/prodigal/main' include { PYRODIGAL } from '../../modules/nf-core/pyrodigal/main' include { BAKTA_BAKTADBDOWNLOAD } from '../../modules/nf-core/bakta/baktadbdownload/main' include { BAKTA_BAKTA } from '../../modules/nf-core/bakta/bakta/main' include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PRODIGAL_GBK } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../../modules/nf-core/gunzip/main' include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../../modules/nf-core/gunzip/main' +include { GUNZIP as GUNZIP_PYRODIGAL_GBK } from '../../modules/nf-core/gunzip/main' workflow ANNOTATION { take: @@ -24,45 +22,35 @@ workflow ANNOTATION { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // For prodigal: run twice, once for gff and once for gbk generation, (for parity with PROKKA which produces both) if ( params.annotation_tool == "prodigal" ) { - PRODIGAL_GFF ( fasta, "gff" ) - GUNZIP_PRODIGAL_FAA ( PRODIGAL_GFF.out.amino_acid_fasta ) - GUNZIP_PRODIGAL_FNA ( PRODIGAL_GFF.out.nucleotide_fasta) - GUNZIP_PRODIGAL_GFF ( PRODIGAL_GFF.out.gene_annotations ) - ch_versions = ch_versions.mix(PRODIGAL_GFF.out.versions) + + PRODIGAL ( fasta, "gbk" ) + GUNZIP_PRODIGAL_FAA ( PRODIGAL.out.amino_acid_fasta ) + GUNZIP_PRODIGAL_FNA ( PRODIGAL.out.nucleotide_fasta) + GUNZIP_PYRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) + ch_versions = ch_versions.mix(PRODIGAL.out.versions) ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Prodigal GBK and GFF output are mutually exclusive - - if ( params.save_annotations == true ) { - PRODIGAL_GBK ( fasta, "gbk" ) - ch_versions = ch_versions.mix(PRODIGAL_GBK.out.versions) - ch_annotation_gbk = PRODIGAL_GBK.out.gene_annotations // Prodigal GBK output stays zipped because it is currently not used by any downstream subworkflow. 
- } + ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip } else if ( params.annotation_tool == "pyrodigal" ) { PYRODIGAL ( fasta, "gbk" ) GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) - GUNZIP_PYRODIGAL_GFF ( PYRODIGAL.out.annotations ) + GUNZIP_PYRODIGAL_GBK ( PYRODIGAL.out.annotations ) ch_versions = ch_versions.mix(PYRODIGAL.out.versions) ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip - ch_annotation_gff = GUNZIP_PYRODIGAL_GFF.out.gunzip - ch_annotation_gbk = Channel.empty() // Pyrodigal doesn't produce GBK + ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip } else if ( params.annotation_tool == "prokka" ) { PROKKA ( fasta, [], [] ) ch_versions = ch_versions.mix(PROKKA.out.versions) - ch_multiqc_files = PROKKA.out.txt - + ch_multiqc_files = PROKKA.out.txt.collect{it[1]}.ifEmpty([]) ch_annotation_faa = PROKKA.out.faa ch_annotation_fna = PROKKA.out.fna - ch_annotation_gff = PROKKA.out.gff ch_annotation_gbk = PROKKA.out.gbk } else if ( params.annotation_tool == "bakta" ) { @@ -81,10 +69,8 @@ workflow ANNOTATION { BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) ch_multiqc_files = BAKTA_BAKTA.out.txt - ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna - ch_annotation_gff = BAKTA_BAKTA.out.gff ch_annotation_gbk = BAKTA_BAKTA.out.gbff } @@ -93,6 +79,5 @@ workflow ANNOTATION { multiqc_files = ch_multiqc_files faa = ch_annotation_faa // [ [meta], path(faa) ] fna = ch_annotation_fna // [ [meta], path(fna) ] - gff = ch_annotation_gff // [ [meta], path(gff) ] gbk = ch_annotation_gbk // [ [meta], path(gbk) ] } diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 203098dc..156014c3 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -6,8 +6,7 @@ include { UNTAR as UNTAR_CSS } from '../../modules/nf-core include { UNTAR as UNTAR_DETECTION } from '../../modules/nf-core/untar/main' include { UNTAR as UNTAR_MODULES } from '../../modules/nf-core/untar/main' include { ANTISMASH_ANTISMASHLITEDOWNLOADDATABASES } from '../../modules/nf-core/antismash/antismashlitedownloaddatabases/main' -include { ANTISMASH_ANTISMASHLITE as ANTISMASH_GBK } from '../../modules/nf-core/antismash/antismashlite/main' -include { ANTISMASH_ANTISMASHLITE as ANTISMASH_GFF } from '../../modules/nf-core/antismash/antismashlite/main' +include { ANTISMASH_ANTISMASHLITE } from '../../modules/nf-core/antismash/antismashlite/main' include { GECCO_RUN } from '../../modules/nf-core/gecco/run/main' include { HMMER_HMMSEARCH as BGC_HMMER_HMMSEARCH } from '../../modules/nf-core/hmmer/hmmsearch/main' include { DEEPBGC_DOWNLOAD } from '../../modules/nf-core/deepbgc/download/main' @@ -21,7 +20,6 @@ workflow BGC { take: fastas // tuple val(meta), path(PREPPED_INPUT.out.fna) faas // tuple val(meta), path(.out.faa) - gffs // tuple val(meta), path(.out.gff) gbks // tuple val(meta), path(.out.gbk) tsvs // tuple val(meta), path(MMSEQS_CREATETSV.out.tsv) @@ -72,51 +70,25 @@ workflow BGC { } - // Exact input combination to antismash depends on whether gff (requires fna) or gbk (just gbk) necessary - - // TODO: if ( annotation input is "gff" ) do this: - ch_antismash_gff_input = fastas.join(gffs, by: 0) - .filter { - meta, fastas, gff -> - if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. 
length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. AntiSMASH will not be run for sample: ${meta.id}." - meta.longest_contig >= params.bgc_antismash_sampleminlength - } - .multiMap { - meta, fastas, gff -> - fastas: [ meta, fastas ] - gffs: [ gff ] - } - - ANTISMASH_GFF ( ch_antismash_gff_input.fastas, ch_antismash_databases, ch_antismash_directory, ch_antismash_gff_input.gffs ) - ch_versions = ch_versions.mix(ANTISMASH_GFF.out.versions) - - // TODO: else if ( antismash input is "gbk") do this: - ch_antismash_gbk_input = gbks + ch_antismash_input = gbks .filter { meta, files -> if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. AntiSMASH will not be run for sample: ${meta.id}." meta.longest_contig >= params.bgc_antismash_sampleminlength } - ANTISMASH_GBK ( ch_antismash_gbk_input, ch_antismash_databases, ch_antismash_directory, [] ) - ch_versions = ch_versions.mix(ANTISMASH_GBK.out.versions) - - // TODO: Fix below - // ch_antismashresults_for_combgc = ANTISMASH_GFF.out.knownclusterblast_dir.dump(tag: 'gff_cluster') - // .dump(tag: 'antismash_gff_knownclusterblast_dir') - // .mix(ANTISMASH_GFF.out.gbk_input.dump(tag: 'gff_input')) - // .dump(tag: 'antismash_gff_gbk_input') - // .mix(ANTISMASH_GBK.out.knownclusterblast_dir.dump(tag: 'gbk_cluster')) - // .dump(tag: 'antismash_gbk_knownclusterblast_dir') - // .mix(ANTISMASH_GBK.out.gbk_input.dump(tag: 'gbk_input')) - // .dump(tag: 'antismash_gbk_gbk_input') - // .groupTuple() - // .map{ - // meta, files -> - // [meta, files.flatten()] - // } - - // ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) + ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] ) + ch_versions = ch_versions.mix(ANTISMASH_ANTISMASHLITE.out.versions) + + ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir + .mix(ANTISMASH_ANTISMASHLITE.out.gbk_input) + .groupTuple() + .map{ + meta, files -> + [meta, files.flatten()] + } + + ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) } // DEEPBGC diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index bfc82491..81545dfb 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -45,22 +45,9 @@ include { TAXA_CLASS } from '../subworkflows/local/taxa_class' // MODULE: Installed directly from nf-core/modules // -include { MULTIQC } from '../modules/nf-core/multiqc/main' -include { GUNZIP as GUNZIP_INPUT_PREP } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_FNA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_FAA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PRODIGAL_GFF } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_FNA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_FAA } from '../modules/nf-core/gunzip/main' -include { GUNZIP as GUNZIP_PYRODIGAL_GFF } from '../modules/nf-core/gunzip/main' -include { BIOAWK } from '../modules/nf-core/bioawk/main' -include { PROKKA } from '../modules/nf-core/prokka/main' -include { PRODIGAL as PRODIGAL_GFF } from '../modules/nf-core/prodigal/main' -include { PRODIGAL as PRODIGAL_GBK } from '../modules/nf-core/prodigal/main' -include { PYRODIGAL as PYRODIGAL_GBK } from 
'../modules/nf-core/pyrodigal/main'
-include { PYRODIGAL as PYRODIGAL_GFF } from '../modules/nf-core/pyrodigal/main'
-include { BAKTA_BAKTADBDOWNLOAD } from '../modules/nf-core/bakta/baktadbdownload/main'
-include { BAKTA_BAKTA } from '../modules/nf-core/bakta/bakta/main'
+include { BIOAWK } from '../modules/nf-core/bioawk/main'
+include { GUNZIP as GUNZIP_INPUT_PREP } from '../modules/nf-core/gunzip/main'
+include { MULTIQC } from '../modules/nf-core/multiqc/main'

 /*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -105,17 +92,10 @@ workflow FUNCSCAN {
 meta, files ->
 def fasta_found = files.find{it.toString().tokenize('.').last().matches('fasta|fas|fna|fa')}
 def faa_found = files.find{it.toString().endsWith('.faa')}
- def feature_found = files.find{it.toString().tokenize('.').last().matches('gff|gbk')}
-
- // https://github.com/antismash/antismash/issues/364
- if ( params.run_bgc_screening && !params.bgc_skip_antismash && feature_found != null ) {
- log.warn("[nf-core/funcscan] antiSMASH screening requested and pre-annotated files given.")
- log.warn("Be aware that Prokka-generated GFF or GBK files will likely fail with antiSMASH!")
- log.warn("See usage docs. File: " + feature_found.name)
- }
-
- def fasta = fasta_found != null ? fasta_found : []
- def faa = faa_found != null ? faa_found : []
- def feature = feature_found != null ? feature_found : []
+ def feature_found = files.find{it.toString().tokenize('.').last().matches('gbk')}
+ def fasta = fasta_found != null ? fasta_found : []
+ def faa = faa_found != null ? faa_found : []
+ def feature = feature_found != null ? feature_found : []

 [meta, fasta, faa, feature]
 }
@@ -165,22 +145,12 @@ workflow FUNCSCAN {

 ANNOTATION( ch_unannotated_for_annotation )
 ch_versions = ch_versions.mix(ANNOTATION.out.versions)
+ ch_multiqc_files = ch_multiqc_files.mix(ANNOTATION.out.multiqc_files)

 // Only Bakta and Prokka make GBK, else give empty entry to satisfy downstream cardinality
- if ( ['bakta', 'prokka'].contains(params.annotation_tool) ) {
- ch_new_annotation = ch_unannotated_for_annotation
- .join(ANNOTATION.out.faa)
- .join(ANNOTATION.out.gff)
- .join(ANNOTATION.out.gbk)
- } else {
- ch_new_annotation = ch_unannotated_for_annotation
- .join(ANNOTATION.out.faa)
- .join(ANNOTATION.out.gff)
- .map {
- meta, fasta, faa, gff ->
- [meta, fasta, faa, gff, []]
- }
- }
+ ch_new_annotation = ch_unannotated_for_annotation
+ .join(ANNOTATION.out.faa)
+ .join(ANNOTATION.out.gbk)

 } else {
 ch_new_annotation = Channel.empty()
@@ -189,16 +159,14 @@ workflow FUNCSCAN {
 ch_prepped_input = ch_input_for_annotation.preannotated
 .map{
 meta, fasta, protein, feature ->
- def gff = feature.extension == 'gff' ? feature : []
 def gbk = feature.extension == 'gbk' ? feature : []
- [meta, fasta, protein, gff, gbk]
+ [meta, fasta, protein, gbk]
 }
 .mix(ch_new_annotation)
 .multiMap {
- meta, fasta, protein, gff, gbk ->
+ meta, fasta, protein, gbk ->
 fastas: [meta, fasta]
 faas: [meta, protein]
- gffs: [meta, gff]
 gbks: [meta, gbk]
 }

@@ -302,7 +270,7 @@ workflow FUNCSCAN {
 ch_prepped_input.faas
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. 
ARG screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
@@ -322,22 +290,16 @@ workflow FUNCSCAN {
 if ( params.run_bgc_screening && !params.run_taxa_classification ) {
 BGC (
 ch_prepped_input.fastas,
- ch_prepped_input.gffs
- .filter {
- meta, file ->
- if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GFF file. AMP screening tools requiring this file will not be executed: ${meta.id}")
- !file.isEmpty()
- },
 ch_prepped_input.faas
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_prepped_input.gbks
 .filter {
 meta, file ->
- if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
@@ -352,16 +314,10 @@ workflow FUNCSCAN {
 if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
- ch_prepped_input.gffs
- .filter {
- meta, file ->
- if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GFF file. BGC screening tools requiring this file will not be executed: ${meta.id}")
- !file.isEmpty()
- },
 ch_prepped_input.gbks
 .filter {
 meta, file ->
- if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. AMP screening tools requiring this file will not be executed: ${meta.id}")
+ if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of the following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
 !file.isEmpty()
 },
 ch_taxonomy_tsv
@@ -389,15 +345,12 @@ workflow FUNCSCAN {
 ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty()
 ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty()
 summary_params = paramsSummaryMap( workflow, parameters_schema: "nextflow_schema.json" )
- ch_workflow_summary = Channel.value( paramsSummaryMultiqc(summary_params) )
+ ch_workflow_summary = Channel.value( paramsSummaryMultiqc( summary_params ) )
 ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) ch_methods_description = Channel.value( methodsDescriptionText( ch_multiqc_custom_methods_description )) ch_multiqc_files = ch_multiqc_files.mix( ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml') ) ch_multiqc_files = ch_multiqc_files.mix( ch_collated_versions ) ch_multiqc_files = ch_multiqc_files.mix( ch_methods_description.collectFile(name: 'methods_description_mqc.yaml') ) - if( params.annotation_tool=='prokka' ) { - ch_multiqc_files = ch_multiqc_files.mix( PROKKA.out.txt.collect{it[1]}.ifEmpty([]) ) - } MULTIQC ( ch_multiqc_files.collect(), From 43e77cfdd44baf016dc87d2babc24ec64112ed93 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Fri, 5 Apr 2024 16:32:13 +0200 Subject: [PATCH 24/45] Update usage docs and samplesheet --- assets/samplesheet.csv | 7 ++++--- docs/usage.md | 18 +++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 22583f22..644f0a74 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,4 @@ -sample,fasta -sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz -sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz +sample,fasta,protein,feature +sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk +sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz +sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta \ No newline at end of file diff --git a/docs/usage.md b/docs/usage.md index 6e874622..6ed6308a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -52,24 +52,24 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s --input '[path to samplesheet file]' ``` -The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample`, and `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `feature`), and a header row as shown in the examples below. +The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample`, `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `feature`), and a header row as shown in the examples below. -If you already have annotated contigs, you can supply these to the pipeline using optional `protein` and `feature` columns. If these two columns are supplied, pipeline annotation will be skipped the corresponding FASTA file, and the corresponding annotation files used instead. +If you already have annotated contigs, you can supply these to the pipeline using the optional `protein` and `feature` columns. If these two columns are supplied, pipeline annotation (i.e. with bakta, prodigal, pyrodigal or prokka) will be skipped and the corresponding annotation files used instead. 
-For two column +For two columns: -```bash +```csv title="samplesheet.csv" sample,fasta sample_1,///wastewater_metagenome_contigs_1.fasta.gz sample_2,///wastewater_metagenome_contigs_2.fasta.gz ``` -For four column: +For four columns: -```bash +```csv title="samplesheet.csv" sample,fasta,protein,feature -sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gff -sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gbk +sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gbk +sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk ``` | Column | Description | @@ -77,7 +77,7 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast | `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | | `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | | `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | -| `feature` | Optional path to a pre-generated annotation file (`.gbk` or `.gff`) containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` also given. | +| `feature` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
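Internally, the optional `protein` and `feature` columns drive a single split in the workflow: rows that already carry a pre-generated protein FASTA bypass the annotation subworkflow, while all other rows are annotated from scratch. A minimal sketch of that split, reusing the channel and branch names from the funcscan.nf diffs in this series (an illustration, not the verbatim pipeline code):

```nextflow
// Rows with a supplied .faa are treated as pre-annotated; everything else
// is sent to the chosen annotation tool via the ANNOTATION subworkflow.
ch_input_for_annotation = ch_intermediate_input
    .branch { meta, fasta, protein, feature ->
        preannotated: protein != []
        unannotated: true
    }
```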
From 311f77c5aa8eb636abd2bafe4237847ecfef9ef3 Mon Sep 17 00:00:00 2001
From: jasmezz
Date: Fri, 5 Apr 2024 16:43:56 +0200
Subject: [PATCH 25/45] Update modules.config, fix linting, variable typos

---
 assets/samplesheet.csv | 2 +-
 conf/modules.config | 23 +++--------------------
 subworkflows/local/amp.nf | 2 +-
 subworkflows/local/annotation.nf | 10 ++++++++--
 subworkflows/local/bgc.nf | 2 +-
 5 files changed, 14 insertions(+), 25 deletions(-)

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
index 644f0a74..4645a661 100644
--- a/assets/samplesheet.csv
+++ b/assets/samplesheet.csv
@@ -1,4 +1,4 @@
 sample,fasta,protein,feature
 sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk
 sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz
-sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta
\ No newline at end of file
+sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta
diff --git a/conf/modules.config b/conf/modules.config
index 0a96eec4..5c981595 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -161,30 +161,13 @@ process {
 ].join(' ').trim()
 }

- withName: PRODIGAL_GFF {
+ withName: PRODIGAL {
 ext.prefix = { "${meta.id}_prodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
 publishDir = [
 path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
 mode: params.publish_dir_mode,
 enabled: params.save_annotations,
- pattern: "*.{faa,fna,gff,faa.gz,faa.gz,fna.gz,gff.gz}",
- saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
- ]
- ext.args = [
- params.annotation_prodigal_singlemode ? "-p single" : "-p meta",
- params.annotation_prodigal_closed ? "-c" : "",
- params.annotation_prodigal_forcenonsd ? "-n" : "",
- "-g ${params.annotation_prodigal_transtable}"
- ].join(' ').trim()
- }
-
- withName: PRODIGAL_GBK {
- ext.prefix = { "${meta.id}_prodigal" } // to prevent pigz symlink problems of input files if already uncompressed during post-annotation gzipping
- publishDir = [
- path: { "${params.outdir}/annotation/prodigal/${meta.id}" },
- mode: params.publish_dir_mode,
- enabled: params.save_annotations,
- pattern: "*.gbk.gz",
+ pattern: "*.{faa,fna,gbk,faa.gz,fna.gz,gbk.gz}",
 saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
 ]
 ext.args = [
@@ -201,7 +184,7 @@ process {
 path: { "${params.outdir}/annotation/pyrodigal/${meta.id}" },
 mode: params.publish_dir_mode,
 enabled: params.save_annotations,
- pattern: "*.{faa,fna,gff,score}.gz",
+ pattern: "*.{faa,fna,gbk,score}.gz",
 saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] ext.args = [ diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index 7489f73d..a3260bbc 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -116,7 +116,7 @@ workflow AMP { // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_AMPCOMBI(ch_ampcombi_summaries, ch_mmseqs_taxonomy_list) ch_versions = ch_versions.mix(MERGE_TAXONOMY_AMPCOMBI.out.versions) diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 7b00b2f6..9955d3a8 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -27,11 +27,14 @@ workflow ANNOTATION { PRODIGAL ( fasta, "gbk" ) GUNZIP_PRODIGAL_FAA ( PRODIGAL.out.amino_acid_fasta ) GUNZIP_PRODIGAL_FNA ( PRODIGAL.out.nucleotide_fasta) - GUNZIP_PYRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) + GUNZIP_PRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) ch_versions = ch_versions.mix(PRODIGAL.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip + ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip } else if ( params.annotation_tool == "pyrodigal" ) { @@ -40,6 +43,9 @@ workflow ANNOTATION { GUNZIP_PYRODIGAL_FNA ( PYRODIGAL.out.fna) GUNZIP_PYRODIGAL_GBK ( PYRODIGAL.out.annotations ) ch_versions = ch_versions.mix(PYRODIGAL.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FAA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_FNA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PYRODIGAL_GBK.out.versions) ch_annotation_faa = GUNZIP_PYRODIGAL_FAA.out.gunzip ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 156014c3..53359185 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -87,7 +87,7 @@ workflow BGC { meta, files -> [meta, files.flatten()] } - + ch_bgcresults_for_combgc = ch_bgcresults_for_combgc.mix( ch_antismashresults_for_combgc ) } From 2ac179d6a2b2d9d1cd2b6d2ad56209f395b2873c Mon Sep 17 00:00:00 2001 From: jasmezz Date: Mon, 8 Apr 2024 11:37:42 +0200 Subject: [PATCH 26/45] Fix variable typos, fix multiqc channel for bakta --- nextflow_schema.json | 2 +- subworkflows/local/annotation.nf | 2 +- subworkflows/local/arg.nf | 2 +- subworkflows/local/bgc.nf | 2 +- workflows/funcscan.nf | 23 +++++++++++------------ 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 31678cd0..04202137 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -253,7 +253,7 @@ "default": "?", "enum": ["+", "-", "?"], "description": "Specify the type of bacteria to be annotated to detect signaling peptides.", - "help_text": "Specify the type of bacteria expected in the input dataset for correct annotation of the signal peptide predictions. 
More details can be found in the [documentation](https://github.com/oschwengers/bakta/blob/main/README.md#usage).\n\n> Modifies tool parameter(s):\n> - BAKTA: `--gram`", + "help_text": "Specify the type of bacteria expected in the input dataset for correct annotation of the signal peptide predictions. Gram types: +/-/?\nMore details can be found in the [documentation](https://github.com/oschwengers/bakta/blob/main/README.md#usage).\n\n> Modifies tool parameter(s):\n> - BAKTA: `--gram`", "fa_icon": "far fa-plus-square" }, "annotation_bakta_complete": { diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index 9955d3a8..dbf4562e 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -74,7 +74,7 @@ workflow ANNOTATION { BAKTA_BAKTA ( fasta, ch_bakta_db, [], [] ) ch_versions = ch_versions.mix(BAKTA_BAKTA.out.versions) - ch_multiqc_files = BAKTA_BAKTA.out.txt + ch_multiqc_files = BAKTA_BAKTA.out.txt.collect{it[1]}.ifEmpty([]) ch_annotation_faa = BAKTA_BAKTA.out.faa ch_annotation_fna = BAKTA_BAKTA.out.fna ch_annotation_gbk = BAKTA_BAKTA.out.gbff diff --git a/subworkflows/local/arg.nf b/subworkflows/local/arg.nf index 5b9d276b..c43db0e8 100644 --- a/subworkflows/local/arg.nf +++ b/subworkflows/local/arg.nf @@ -159,7 +159,7 @@ workflow ARG { // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_HAMRONIZATION( HAMRONIZATION_SUMMARIZE.out.tsv, ch_mmseqs_taxonomy_list ) ch_versions = ch_versions.mix( MERGE_TAXONOMY_HAMRONIZATION.out.versions ) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 53359185..518b28b1 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -168,7 +168,7 @@ workflow BGC { // MERGE_TAXONOMY if ( params.run_taxa_classification ) { - ch_mmseqs_taxonomy_list = tsv.map{ it[1] }.collect() + ch_mmseqs_taxonomy_list = tsvs.map{ it[1] }.collect() MERGE_TAXONOMY_COMBGC( ch_combgc_summaries, ch_mmseqs_taxonomy_list ) ch_versions = ch_versions.mix( MERGE_TAXONOMY_COMBGC.out.versions ) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 81545dfb..0db4d3f5 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -81,12 +81,12 @@ workflow FUNCSCAN { } GUNZIP_INPUT_PREP ( ch_input_prep.compressed ) - ch_versions = ch_versions.mix(GUNZIP_INPUT_PREP.out.versions) + ch_versions = ch_versions.mix( GUNZIP_INPUT_PREP.out.versions ) // Merge all the already uncompressed and newly compressed FASTAs here into // a single input channel for downstream ch_intermediate_input = GUNZIP_INPUT_PREP.out.gunzip - .mix(ch_input_prep.uncompressed) + .mix( ch_input_prep.uncompressed ) .groupTuple() .map{ meta, files -> @@ -110,11 +110,11 @@ workflow FUNCSCAN { ch_intermediate_input.annotations BIOAWK ( ch_intermediate_input.fastas ) - ch_versions = ch_versions.mix(BIOAWK.out.versions) + ch_versions = ch_versions.mix( BIOAWK.out.versions ) ch_intermediate_input = ch_intermediate_input.fastas - .join(BIOAWK.out.longest) - .join(ch_intermediate_input.annotations) + .join( BIOAWK.out.longest ) + .join( ch_intermediate_input.annotations ) .map{ meta, fasta, length, faa, feature -> def meta_new = [:] @@ -122,7 +122,7 @@ workflow FUNCSCAN { [ meta + meta_new, fasta, faa, feature ] } - // Separate pre-annotated FASTAs from those that need annotation + // Separate pre-annotated FASTAs from those that need annotation ch_input_for_annotation = ch_intermediate_input .branch 
{ meta, fasta, protein, feature -> @@ -144,13 +144,12 @@ workflow FUNCSCAN { } ANNOTATION( ch_unannotated_for_annotation ) - ch_versions = ch_versions.mix(ANNOTATION.out.versions) - ch_multiqc_files = ch_multiqc_files.mix(ANNOTATION.out.multiqc_files) + ch_versions = ch_versions.mix( ANNOTATION.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files ) - // Only Bakta and Prokka make GBK, else give empty entry to satisfy downstream cardinality ch_new_annotation = ch_unannotated_for_annotation - .join(ANNOTATION.out.faa) - .join(ANNOTATION.out.gbk) + .join( ANNOTATION.out.faa ) + .join( ANNOTATION.out.gbk ) } else { ch_new_annotation = Channel.empty() @@ -162,7 +161,7 @@ workflow FUNCSCAN { def gbk = feature.extension == 'gbk' ? feature : [] [meta, fasta, protein, gbk] } - .mix(ch_new_annotation) + .mix( ch_new_annotation ) .multiMap { meta, fasta, protein, gbk -> fastas: [meta, fasta] From 212ce0c0dd728fa150957730a86010b26d42da15 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Mon, 8 Apr 2024 11:47:52 +0200 Subject: [PATCH 27/45] Fix linting --- docs/usage.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 6ed6308a..0a63486f 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,11 +72,11 @@ sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wast sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk ``` -| Column | Description | -| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | -| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | -| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | +| Column | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | | `feature` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. From 15f9fbf8a528dd269b7c39fe393fded8c38b408a Mon Sep 17 00:00:00 2001 From: jasmezz Date: Mon, 8 Apr 2024 14:15:59 +0200 Subject: [PATCH 28/45] Prefer pyrodigal in tests, add warning when prodigal + antismash are selected. 
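The prodigal/antiSMASH incompatibility is handled by a single guard in front of
the annotation dispatch, roughly as follows (condensed from the annotation.nf
hunk below):

```nextflow
// Prodigal GBK output cannot be parsed by antiSMASH, so fall back to
// pyrodigal whenever that combination is requested and tell the user why.
if ( params.annotation_tool == "prodigal" && params.run_bgc_screening && !params.bgc_skip_antismash ) {
    log.warn("[nf-core/funcscan] Switching annotation tool to: pyrodigal. This is because prodigal annotations (in GBK format) are incompatible with antiSMASH. If you specifically wish to run prodigal instead, please skip antiSMASH or provide a pre-annotated GBK file in the samplesheet.")
}
```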
--- conf/test.config | 2 +- conf/test_bgc.config | 2 +- conf/test_nothing.config | 2 +- conf/test_preannotated.config | 2 +- conf/test_preannotated_bgc.config | 2 +- conf/test_taxonomy.config | 2 +- subworkflows/local/annotation.nf | 36 ++++++++++--------- .../utils_nfcore_funcscan_pipeline/main.nf | 2 +- 8 files changed, 27 insertions(+), 23 deletions(-) diff --git a/conf/test.config b/conf/test.config index 9e95a491..d558feaf 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,7 +23,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = true arg_fargene_hmmmodel = 'class_a,class_b_1_2' diff --git a/conf/test_bgc.config b/conf/test_bgc.config index 89228579..d1419d86 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -23,7 +23,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_reduced.csv' bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = false run_amp_screening = false diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 8c57427d..9a3118a2 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -24,7 +24,7 @@ params { amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = false run_amp_screening = false diff --git a/conf/test_preannotated.config b/conf/test_preannotated.config index e536cfd5..09ccb0cf 100644 --- a/conf/test_preannotated.config +++ b/conf/test_preannotated.config @@ -23,7 +23,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv' amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = true arg_fargene_hmmmodel = 'class_a,class_b_1_2' diff --git a/conf/test_preannotated_bgc.config b/conf/test_preannotated_bgc.config index 29a56281..e56d6519 100644 --- a/conf/test_preannotated_bgc.config +++ b/conf/test_preannotated_bgc.config @@ -23,7 +23,7 @@ params { input = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/samplesheet_preannotated.csv' bgc_hmmsearch_models = 'https://raw.githubusercontent.com/antismash/antismash/fd61de057e082fbf071732ac64b8b2e8883de32f/antismash/detection/hmm_detection/data/ToyB.hmm' - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' run_arg_screening = false run_amp_screening = false diff --git a/conf/test_taxonomy.config b/conf/test_taxonomy.config index ad477b3c..2e0cab02 100644 --- a/conf/test_taxonomy.config +++ b/conf/test_taxonomy.config @@ -25,7 +25,7 @@ params { amp_hmmsearch_models = 'https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/hmms/mybacteriocin.hmm' run_taxa_classification = true - annotation_tool = 'prodigal' + annotation_tool = 'pyrodigal' 
run_arg_screening = true arg_skip_deeparg = true diff --git a/subworkflows/local/annotation.nf b/subworkflows/local/annotation.nf index dbf4562e..74bf9666 100644 --- a/subworkflows/local/annotation.nf +++ b/subworkflows/local/annotation.nf @@ -22,21 +22,11 @@ workflow ANNOTATION { ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - if ( params.annotation_tool == "prodigal" ) { + if ( params.annotation_tool == "pyrodigal" || ( params.annotation_tool == "prodigal" && params.run_bgc_screening == true && !params.bgc_skip_antismash ) ) { // Need to use pyrodigal for antiSMASH because prodigal GBK annotation format is incompatible with antiSMASH. - PRODIGAL ( fasta, "gbk" ) - GUNZIP_PRODIGAL_FAA ( PRODIGAL.out.amino_acid_fasta ) - GUNZIP_PRODIGAL_FNA ( PRODIGAL.out.nucleotide_fasta) - GUNZIP_PRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) - ch_versions = ch_versions.mix(PRODIGAL.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) - ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) - ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip - ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip - ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip - - } else if ( params.annotation_tool == "pyrodigal" ) { + if ( params.annotation_tool == "prodigal" && params.run_bgc_screening == true && !params.bgc_skip_antismash ) { + log.warn("[nf-core/funcscan] Switching annotation tool to: pyrodigal. This is because prodigal annotations (in GBK format) are incompatible with antiSMASH. If you specifically wish to run prodigal instead, please skip antiSMASH or provide a pre-annotated GBK file in the samplesheet.") + } PYRODIGAL ( fasta, "gbk" ) GUNZIP_PYRODIGAL_FAA ( PYRODIGAL.out.faa ) @@ -50,7 +40,21 @@ workflow ANNOTATION { ch_annotation_fna = GUNZIP_PYRODIGAL_FNA.out.gunzip ch_annotation_gbk = GUNZIP_PYRODIGAL_GBK.out.gunzip - } else if ( params.annotation_tool == "prokka" ) { + } else if ( params.annotation_tool == "prodigal" ) { + + PRODIGAL ( fasta, "gbk" ) + GUNZIP_PRODIGAL_FAA ( PRODIGAL.out.amino_acid_fasta ) + GUNZIP_PRODIGAL_FNA ( PRODIGAL.out.nucleotide_fasta) + GUNZIP_PRODIGAL_GBK ( PRODIGAL.out.gene_annotations ) + ch_versions = ch_versions.mix(PRODIGAL.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FAA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_FNA.out.versions) + ch_versions = ch_versions.mix(GUNZIP_PRODIGAL_GBK.out.versions) + ch_annotation_faa = GUNZIP_PRODIGAL_FAA.out.gunzip + ch_annotation_fna = GUNZIP_PRODIGAL_FNA.out.gunzip + ch_annotation_gbk = GUNZIP_PRODIGAL_GBK.out.gunzip + + } else if ( params.annotation_tool == "prokka" ) { PROKKA ( fasta, [], [] ) ch_versions = ch_versions.mix(PROKKA.out.versions) @@ -59,7 +63,7 @@ workflow ANNOTATION { ch_annotation_fna = PROKKA.out.fna ch_annotation_gbk = PROKKA.out.gbk - } else if ( params.annotation_tool == "bakta" ) { + } else if ( params.annotation_tool == "bakta" ) { // BAKTA prepare download if ( params.annotation_bakta_db_localpath ) { diff --git a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf index 6ce90033..80438ee9 100644 --- a/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_funcscan_pipeline/main.nf @@ -148,7 +148,7 @@ def validateInputParameters() { // 3. 
Give warning if not using container system assuming conda - if ( params.run_bgc_screening && ( !params.bgc_antismash_databases || !params.bgc_antismash_installationdirectory ) && !params.bgc_skip_antismash && ( session.config.conda && session.config.conda.enabled ) ) { log.warn "[nf-core/funcscan] Running antiSMASH download database module, and detected conda has been enabled. Assuming using conda for pipeline run, check config if this is not expected!" } + if ( params.run_bgc_screening && ( !params.bgc_antismash_databases || !params.bgc_antismash_installationdirectory ) && !params.bgc_skip_antismash && ( session.config.conda && session.config.conda.enabled ) ) { log.warn "[nf-core/funcscan] Running antiSMASH download database module, and detected conda has been enabled. Assuming using conda for pipeline run. Check config if this is not expected!" } } From a8716f8893a499b803542da73fff3d8574663ee1 Mon Sep 17 00:00:00 2001 From: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:47:19 +0000 Subject: [PATCH 29/45] Apply suggestions from code review Co-authored-by: James A. Fellows Yates --- assets/schema_input.json | 4 ++-- docs/usage.md | 20 ++++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 032692f5..4d161949 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -35,8 +35,8 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.g(bk|ff)(\\.gz)?$", - "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gff", + "pattern": "^\\S+\\.gbk(\\.gz)?$", + "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk", "unique": true, "dependentRequired": ["protein"] } diff --git a/docs/usage.md b/docs/usage.md index 0a63486f..4d8a1761 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -52,11 +52,11 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s --input '[path to samplesheet file]' ``` -The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample`, `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `feature`), and a header row as shown in the examples below. +The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample`, `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `gbk`), and a header row as shown in the examples below. -If you already have annotated contigs, you can supply these to the pipeline using the optional `protein` and `feature` columns. If these two columns are supplied, pipeline annotation (i.e. with bakta, prodigal, pyrodigal or prokka) will be skipped and the corresponding annotation files used instead. +If you already have annotated contigs with peptide sequences and an annotation file in `gbk` format, you can supply these to the pipeline using the optional `protein` and `gbk` columns. If these additional columns are supplied, pipeline annotation (i.e. with bakta, prodigal, pyrodigal or prokka) will be skipped and the corresponding annotation files used instead. 
-For two columns: +For two columns (without pre-annotated data): ```csv title="samplesheet.csv" sample,fasta @@ -64,10 +64,10 @@ sample_1,///wastewater_metagenome_contigs_1.fasta.gz sample_2,///wastewater_metagenome_contigs_2.fasta.gz ``` -For four columns: +For four columns (with pre-annotated data): ```csv title="samplesheet.csv" -sample,fasta,protein,feature +sample,fasta,protein,gbk sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wastewater_metagenome_contigs_1.faa,///wastewater_metagenome_contigs_1.fasta.gbk sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk ``` @@ -77,12 +77,12 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast | `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | | `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | | `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | -| `feature` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` also given. | +| `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. :::warning -We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools if none of the contigs in a FASTA file reach certain thresholds. Check parameter documentation for relevant minimum contig parameters. +We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools, or the pipeline may even crash, if none of the contigs in a FASTA file reach certain thresholds for different tools. Check parameter documentation for relevant minimum contig parameters. ::: ## Notes on screening tools and taxonomic classification @@ -117,10 +117,6 @@ If a sample does not reach this contig length threshold, you will receive a warn When the annotation is run with Prokka, the resulting `.gbk` file passed to antiSMASH may produce the error `translation longer than location allows` and end the pipeline run. This Prokka bug has been reported before (see [discussion on GitHub](https://github.com/antismash/antismash/discussions/450)) and is not likely to be fixed soon. -:::warning -Prokka GFF generated files [appears to be incompatible with antiSMASH](https://github.com/antismash/antismash/issues/364), and will likely fail! We recommend running or supplying Prodigal or Pyrodigal annotations instead. -::: - :::warning If antiSMASH is run for BGC detection, we recommend to **not** run Prokka for annotation but instead use the default annotation tool (Pyrodigal) or switch > to Prodigal, or (for bacteria only!) Bakta. :::warning @@ -131,7 +127,7 @@ Various tools of nf-core/funcscan use databases and reference files to operate. 
nf-core/funcscan offers the functionality to auto-download databases for you, and as these databases can be very large, and we suggest to store these files in a central place from where you can reuse them across pipeline runs. -We **highly recommend** allowing the pipeline to download these databases for you on a first run, saving these to your results directory with `--save_databases`, then moving these to a different location (in case you wish to delete the results directory of this first run). An exception to this is HMM files where no auto-downloading functionality is possible. +If your infrastructure has internet access (particularly on compute nodes), we **highly recommend** allowing the pipeline to download these databases for you on a first run, saving these to your results directory with `--save_databases`, then moving these to a different location (in case you wish to delete the results directory of this first run). An exception to this is HMM files where no auto-downloading functionality is possible. :::warning From fd52feef24d4d20f5be3776f77135a66c8061bab Mon Sep 17 00:00:00 2001 From: jasmezz Date: Wed, 10 Apr 2024 13:55:16 +0200 Subject: [PATCH 30/45] Apply suggestions from code review, fix linting --- docs/usage.md | 12 ++++++------ workflows/funcscan.nf | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 4d8a1761..04133e59 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -72,12 +72,12 @@ sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wast sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk ``` -| Column | Description | -| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | -| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | -| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | -| `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | +| Column | Description | +| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | +| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | +| `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. 
Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 0db4d3f5..79b344e2 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -106,9 +106,6 @@ workflow FUNCSCAN { } // Add to meta the length of longest contig for downstream filtering - ch_intermediate_input.fastas - ch_intermediate_input.annotations - BIOAWK ( ch_intermediate_input.fastas ) ch_versions = ch_versions.mix( BIOAWK.out.versions ) @@ -169,7 +166,6 @@ workflow FUNCSCAN { gbks: [meta, gbk] } - /* TAXONOMIC CLASSIFICATION */ From 4e0a61fcfcffb66e3cb610e70288a5278459e492 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Wed, 10 Apr 2024 15:57:21 +0200 Subject: [PATCH 31/45] Change feature to gbk, remove gff from docs --- assets/samplesheet.csv | 2 +- assets/schema_input.json | 2 +- docs/output.md | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 4645a661..791912cd 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,4 +1,4 @@ -sample,fasta,protein,feature +sample,fasta,protein,gbk sample_1,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_1.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.faa,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_1.gbk sample_2,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_2.fasta.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.faa.gz,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs_prokka_2.gbk.gz sample_3,https://raw.githubusercontent.com/nf-core/test-datasets/funcscan/wastewater_metagenome_contigs.fasta diff --git a/assets/schema_input.json b/assets/schema_input.json index 4d161949..f956c79c 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -31,7 +31,7 @@ "unique": true, "dependentRequired": ["feature"] }, - "feature": { + "gbk": { "type": "string", "format": "file-path", "exists": true, diff --git a/docs/output.md b/docs/output.md index f20d1cd2..e2b9f270 100644 --- a/docs/output.md +++ b/docs/output.md @@ -125,7 +125,6 @@ Output Summaries: - `prodigal/` - `/`: - - `*.gff`: annotation in GFF3 format, containing both sequences and annotations - `*.fna`: nucleotide FASTA file of the input contig sequences - `*.faa`: protein FASTA file of the translated CDS sequences - `*.gbk`: annotation in GBK format, containing both sequences and annotations @@ -143,9 +142,10 @@ Output Summaries: - `pyrodigal/` - `/`: - - `*.gff`: annotation in GFF3 format, containing both sequences and annotations - - `*.fna`: nucleotide FASTA file of the input contig sequences + - `*.gbk`: annotation in GBK format, containing both sequences and annotations + - `*.fna`: nucleotide FASTA file of the annotated CDS sequences - `*.faa`: protein FASTA file of the translated CDS sequences + - `*.score.gz`: all potential genes (with scores) > Descriptions taken from the [Pyrodigal documentation](https://pyrodigal.readthedocs.io/) From b5fc8f40ab058afe840cec737114e18bf0d0ddf9 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Wed, 10 Apr 2024 16:51:09 +0200 Subject: [PATCH 32/45] Fix "feature" renaming to "gbk" --- assets/schema_input.json | 2 +- docs/usage.md | 
2 +- workflows/funcscan.nf | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index f956c79c..2f75d80e 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -29,7 +29,7 @@ "pattern": "^\\S+\\.(faa)(\\.gz)?$", "errorMessage": "Input file for peptide annotations has incorrect file format. File must end in .fasta, .faa", "unique": true, - "dependentRequired": ["feature"] + "dependentRequired": ["gbk"] }, "gbk": { "type": "string", diff --git a/docs/usage.md b/docs/usage.md index 6db46a55..f1fc90cd 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -76,7 +76,7 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | | `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | -| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `feature` also given. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | | `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 79b344e2..4c02b3e2 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -73,7 +73,7 @@ workflow FUNCSCAN { // Some tools require uncompressed input ch_input_prep = ch_input - .map{meta, fasta, faa, feature -> [meta, [fasta, faa, feature]]} + .map{meta, fasta, faa, gbk -> [meta, [fasta, faa, gbk]]} .transpose() .branch { compressed: it[1].toString().endsWith('.gz') @@ -92,17 +92,17 @@ workflow FUNCSCAN { meta, files -> def fasta_found = files.find{it.toString().tokenize('.').last().matches('fasta|fas|fna|fa')} def faa_found = files.find{it.toString().endsWith('.faa')} - def feature_found = files.find{it.toString().tokenize('.').last().matches('gbk')} + def gbk_found = files.find{it.toString().tokenize('.').last().matches('gbk')} def fasta = fasta_found != null ? fasta_found : [] def faa = faa_found != null ? faa_found : [] - def feature = feature_found != null ? feature_found : [] + def gbk = gbk_found != null ? 
gbk_found : [] - [meta, fasta, faa, feature] + [meta, fasta, faa, gbk] } .multiMap { - meta, fasta, faa, feature -> + meta, fasta, faa, gbk -> fastas: [ meta, fasta ] - annotations : [ meta, faa, feature ] + annotations : [ meta, faa, gbk ] } // Add to meta the length of longest contig for downstream filtering @@ -113,16 +113,16 @@ workflow FUNCSCAN { .join( BIOAWK.out.longest ) .join( ch_intermediate_input.annotations ) .map{ - meta, fasta, length, faa, feature -> + meta, fasta, length, faa, gbk -> def meta_new = [:] meta_new['longest_contig'] = Integer.parseInt(length) - [ meta + meta_new, fasta, faa, feature ] + [ meta + meta_new, fasta, faa, gbk ] } // Separate pre-annotated FASTAs from those that need annotation ch_input_for_annotation = ch_intermediate_input .branch { - meta, fasta, protein, feature -> + meta, fasta, protein, gbk -> preannotated: protein != [] unannotated: true } @@ -136,7 +136,7 @@ workflow FUNCSCAN { ch_unannotated_for_annotation = ch_input_for_annotation.unannotated .map{ - meta, fasta, protein, feature -> + meta, fasta, protein, gbk -> [meta, fasta] } @@ -154,9 +154,9 @@ workflow FUNCSCAN { ch_prepped_input = ch_input_for_annotation.preannotated .map{ - meta, fasta, protein, feature -> - def gbk = feature.extension == 'gbk' ? feature : [] - [meta, fasta, protein, gbk] + meta, fasta, protein, gbk -> + def gbk_format = gbk.extension == 'gbk' ? gbk : [] + [meta, fasta, protein, gbk_format] } .mix( ch_new_annotation ) .multiMap { From 95f8fb55ae776bde014dedbad2e6ce85d8a15703 Mon Sep 17 00:00:00 2001 From: jasmezz Date: Wed, 10 Apr 2024 16:53:34 +0200 Subject: [PATCH 33/45] Fix linting --- docs/usage.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index f1fc90cd..fffec87d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -76,7 +76,7 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | | `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | -| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | +| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | | `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
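To make the samplesheet rules above concrete — two mandatory columns, plus `protein` and `gbk` which are optional but mutually required (the schema's `dependentRequired` clauses) — here is a minimal, hypothetical Python sketch of the same checks. It is an illustration only, not pipeline code: the helper names are invented, and the suffix lists are assumptions based on the documentation table at this point in the series (`.gbff` support arrives in a later patch).

```python
import csv
from pathlib import Path

FASTA_EXTS = (".fasta", ".fna", ".fa")
PROTEIN_EXTS = (".faa",)
GBK_EXTS = (".gbk",)

def has_ext(path: str, exts: tuple) -> bool:
    """True if the filename ends in one of exts, optionally suffixed with .gz."""
    name = Path(path).name
    return any(name.endswith(ext) or name.endswith(ext + ".gz") for ext in exts)

def validate_row(row: dict) -> None:
    assert row.get("sample"), "sample name is required"
    assert has_ext(row["fasta"], FASTA_EXTS), f"unrecognised fasta suffix: {row['fasta']}"
    protein = row.get("protein", "")
    gbk = row.get("gbk", "")
    # protein and gbk are optional, but each requires the other (dependentRequired)
    assert bool(protein) == bool(gbk), "protein and gbk must be supplied together"
    if protein:
        assert has_ext(protein, PROTEIN_EXTS), f"unrecognised protein suffix: {protein}"
        assert has_ext(gbk, GBK_EXTS), f"unrecognised gbk suffix: {gbk}"

# Usage sketch, assuming a samplesheet.csv as in the examples above exists locally:
with open("samplesheet.csv", newline="") as handle:
    for row in csv.DictReader(handle):
        validate_row(row)
```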
From bf430497aa0fe396eeb1034874ffcbd3bd6601bf Mon Sep 17 00:00:00 2001 From: jasmezz Date: Mon, 22 Apr 2024 14:27:47 +0200 Subject: [PATCH 34/45] Fix variables --- workflows/funcscan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index f87ffce7..22b858e0 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -129,10 +129,10 @@ workflow FUNCSCAN { !fasta.isEmpty() } - ch_intermediate_input = ch_intermediate_input_long + ch_intermediate_input = ch_intermediate_input_long.mix( ch_intermediate_input_short ) // Separate pre-annotated FASTAs from those that need annotation - ch_input_for_annotation = ch_intermediate_input_long + ch_input_for_annotation = ch_intermediate_input .branch { meta, fasta, faa, gbk -> preannotated: faa != [] unannotated: true } From 0d2ef7c36d359b5936f3ded1020e1b86694be6c5 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 24 Apr 2024 11:53:12 +0200 Subject: [PATCH 35/45] Fix channels, missing warning/docs about no splitting for preanno --- subworkflows/local/bgc.nf | 9 +----- workflows/funcscan.nf | 65 +++++++++++++++++++-------------------- 2 files changed, 32 insertions(+), 42 deletions(-) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 8740677a..9d1312fb 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -70,14 +70,7 @@ workflow BGC { } - ch_antismash_input = gbks - .filter { - meta, files -> - if ( meta.longest_contig < params.bgc_antismash_sampleminlength ) log.warn "[nf-core/funcscan] Sample does not have any contig reaching min. length threshold of --bgc_antismash_sampleminlength ${params.bgc_antismash_sampleminlength}. AntiSMASH will not be run for sample: ${meta.id}." meta.longest_contig >= params.bgc_antismash_sampleminlength } - - ANTISMASH_ANTISMASHLITE ( ch_antismash_input, ch_antismash_databases, ch_antismash_directory, [] ) + ANTISMASH_ANTISMASHLITE ( gbk, ch_antismash_databases, ch_antismash_directory, [] ) ch_versions = ch_versions.mix( ANTISMASH_ANTISMASHLITE.out.versions ) ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir .mix( ANTISMASH_ANTISMASHLITE.out.gbk_input ) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 22b858e0..21aa7db5 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -102,16 +102,17 @@ workflow FUNCSCAN { [meta, fasta, faa, gbk] } - .multiMap { meta, fasta, faa, gbk -> fastas: [ meta, fasta ] annotations : [ meta, faa, gbk ] } + .branch { meta, fasta, faa, gbk -> preannotated: gbk != [] fastas: true } // Split each FASTA into long and short contigs to // speed up BGC workflow with BGC-compatible contig lengths only - SEQKIT_SEQ_LONG ( ch_intermediate_input.fastas ) - SEQKIT_SEQ_SHORT ( ch_intermediate_input.fastas ) + ch_intermediate_fasta_for_split = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } + SEQKIT_SEQ_LONG ( ch_intermediate_fasta_for_split ) + SEQKIT_SEQ_SHORT ( ch_intermediate_fasta_for_split ) ch_versions = ch_versions.mix( SEQKIT_SEQ_LONG.out.versions ) ch_versions = ch_versions.mix( SEQKIT_SEQ_SHORT.out.versions ) @@ -129,15 +130,7 @@ workflow FUNCSCAN { !fasta.isEmpty() } - ch_intermediate_input = ch_intermediate_input_long.mix( ch_intermediate_input_short ) - - // Separate pre-annotated FASTAs from those that need annotation - ch_input_for_annotation = ch_intermediate_input .branch { meta, fasta, faa, gbk -> preannotated: faa != [] unannotated: true } + ch_input_for_annotation =
ch_intermediate_input_long.mix( ch_intermediate_input_short ) /* ANNOTATION */ @@ -146,17 +139,11 @@ workflow FUNCSCAN { // Some tools require annotated FASTAs if ( ( params.run_arg_screening && !params.arg_skip_deeparg ) || ( params.run_amp_screening && ( !params.amp_skip_hmmsearch || !params.amp_skip_amplify || !params.amp_skip_ampir ) ) || ( params.run_bgc_screening && ( !params.bgc_skip_hmmsearch || !params.bgc_skip_antismash ) ) ) { - ch_unannotated_for_annotation = ch_input_for_annotation.unannotated - .map{ - meta, fasta, faa, gbk -> - [meta, fasta] - } - - ANNOTATION( ch_unannotated_for_annotation ) + ANNOTATION( ch_input_for_annotation ) ch_versions = ch_versions.mix( ANNOTATION.out.versions ) ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files ) - ch_new_annotation = ch_unannotated_for_annotation + ch_new_annotation = ch_input_for_annotation .join( ANNOTATION.out.faa ) .join( ANNOTATION.out.gbk ) @@ -169,20 +151,30 @@ workflow FUNCSCAN { ch_new_annotation = Channel.empty() } - ch_prepped_input = ch_input_for_annotation.preannotated - .map{ - meta, fasta, protein, gbk -> - def gbk_format = gbk.extension == 'gbk' ? gbk : [] - [meta, fasta, protein, gbk_format] - } + ch_prepped_input = ch_intermediate_input.preannotated .mix( ch_new_annotation ) + .dump(tag: 'final_for_screening_all') .multiMap { meta, fasta, faa, gbk -> fastas: [meta, fasta] faas: [meta, faa] gbks: [meta, gbk] } + ch_prepped_input_long = ch_new_annotation + .filter{ + meta, fasta, faa, gbk -> + meta.length == "long" + } + .mix(ch_intermediate_input.preannotated) + .dump(tag: 'final_for_screening_long') + .multiMap { + meta, fasta, faa, gbk -> + fastas: [meta, fasta] + faas: [meta, faa] + gbks: [meta, gbk] + } + /* TAXONOMIC CLASSIFICATION */ @@ -296,14 +293,14 @@ */ if ( params.run_bgc_screening && !params.run_taxa_classification ) { BGC ( - ch_prepped_input.fastas, - ch_prepped_input.faas + ch_prepped_input_long.fastas, + ch_prepped_input_long.faas .filter { meta, file -> if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty FAA file. BGC screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() }, - ch_prepped_input.gbks + ch_prepped_input_long.gbks .filter { meta, file -> if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}") From 7ed3594b18eab571905a65cd5bb33242aff2fb5d Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 24 Apr 2024 12:16:04 +0200 Subject: [PATCH 36/45] Use correct GBK channel --- subworkflows/local/bgc.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/local/bgc.nf b/subworkflows/local/bgc.nf index 9d1312fb..089f57f7 100644 --- a/subworkflows/local/bgc.nf +++ b/subworkflows/local/bgc.nf @@ -70,7 +70,7 @@ workflow BGC { } - ANTISMASH_ANTISMASHLITE ( gbk, ch_antismash_databases, ch_antismash_directory, [] ) + ANTISMASH_ANTISMASHLITE ( gbks, ch_antismash_databases, ch_antismash_directory, [] ) ch_versions = ch_versions.mix( ANTISMASH_ANTISMASHLITE.out.versions ) ch_antismashresults_for_combgc = ANTISMASH_ANTISMASHLITE.out.knownclusterblast_dir .mix( ANTISMASH_ANTISMASHLITE.out.gbk_input ) From 619479a75dd033425db468ffab622bd7ca716d10 Mon Sep 17 00:00:00 2001 From: "James A.
Fellows Yates" Date: Wed, 24 Apr 2024 12:36:47 +0200 Subject: [PATCH 37/45] Add log warning when BGC and preannotated input --- docs/usage.md | 8 +++++--- workflows/funcscan.nf | 14 +++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index e45f6a5a..76ac5061 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -81,10 +81,12 @@ sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wast An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -:::warning -We highly recommend performing quality control on input contigs before running the pipeline. You may not receive results for some tools, or the pipeline may even crash, if none of the contigs in a FASTA file reach certain thresholds for different tools. Check parameter documentation for relevant minimum contig parameters. +:::danger +We highly recommend performing quality control on input contigs before running the pipeline. + +For example, **for un-annotated** input nf-core/funcscan will by default filter for the BGC subworkflow to screen only contigs with at least 3,000 bp length or more (see `--contig_qc_lengththreshold`). -For example, by default BGC screening requires contigs of at least 3,000 bp (see `--contig_qc_lengththreshold`). +In contrast, no such filtering is done for the **pre-annotated** input sent to the BGC screening subworkflow! If you have pre-annotated contigs, make sure they and the annotation files contain sufficiently high quality and length for the type of molecule to be screened for. ::: ## Notes on screening tools and taxonomic classification diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 21aa7db5..0d2d0abe 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -83,9 +83,6 @@ workflow FUNCSCAN { GUNZIP_INPUT_PREP ( ch_input_prep.compressed ) ch_versions = ch_versions.mix( GUNZIP_INPUT_PREP.out.versions ) - // ch _unzipped_fastas = GUNZIP_FASTA_PREP.out.gunzip - // .mix( fasta_prep.uncompressed ) - // Merge all the already uncompressed and newly compressed FASTAs here into // a single input channel for downstream ch_intermediate_input = GUNZIP_INPUT_PREP.out.gunzip @@ -130,6 +127,7 @@ workflow FUNCSCAN { !fasta.isEmpty() } + // Now they are split, can annotated together for efficiency ch_input_for_annotation = ch_intermediate_input_long.mix( ch_intermediate_input_short ) /* @@ -151,9 +149,11 @@ workflow FUNCSCAN { ch_new_annotation = Channel.empty() } + // Mix back the preannotated samples with the newly annotated ones, + // but also have dedicated channel for subworkflows that should only use + // for long contigs ch_prepped_input = ch_intermediate_input.preannotated .mix( ch_new_annotation ) - .dump(tag: 'final_for_screening_all') .multiMap { meta, fasta, faa, gbk -> fastas: [meta, fasta] @@ -167,7 +167,11 @@ workflow FUNCSCAN { meta.length == "long" } .mix(ch_intermediate_input.preannotated) - .dump(tag: 'final_for_screening_long') + .map { + meta, fasta, faa, gbk -> + if ( params.run_bgc_screening && meta.length == null ) { log.warn("[nf-core/funcscan] Pre-annotated input will not be filtered to long contigs for BGC screening! Expect long-run times and/or possible crashes if includes very short contigs") } + [meta, fasta, faa, gbk] + } .multiMap { meta, fasta, faa, gbk -> fastas: [meta, fasta] From 85c43596e6d3d79e30f2e8d3de034f1749667190 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Mon, 6 May 2024 14:11:19 +0200 Subject: [PATCH 38/45] Start trying to fix taxonomy, not working yet as MMSEQS_TAXONOMYDB not executing --- .gitignore | 1 + subworkflows/local/amp.nf | 4 ++-- subworkflows/local/taxa_class.nf | 14 ++++++++++---- workflows/funcscan.nf | 12 +++++++++--- 4 files changed, 22 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 5124c9ac..2eef655b 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ results/ testing/ testing* *.pyc +.nf-test* diff --git a/subworkflows/local/amp.nf b/subworkflows/local/amp.nf index a3260bbc..47dec041 100644 --- a/subworkflows/local/amp.nf +++ b/subworkflows/local/amp.nf @@ -108,9 +108,9 @@ workflow AMP { //AMPCOMBI concatenation if ( !params.run_taxa_classification ) { - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true ) + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.csv', storeDir: "${params.outdir}/reports/ampcombi",keepHeader:true ) } else { - ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.tsv', keepHeader:true ) + ch_ampcombi_summaries = AMPCOMBI.out.csv.map{ it[1] }.collectFile( name: 'ampcombi_complete_summary.csv', keepHeader:true ) } // MERGE_TAXONOMY diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index 8f3fb88d..c73eaa79 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -29,22 +29,28 @@ workflow TAXA_CLASS { } else { MMSEQS_DATABASES ( params.taxa_classification_mmseqs_databases_id ) ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) - ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) + ch_mmseqs_db = MMSEQS_DATABASES.out.database } // Create db for query contigs, assign taxonomy and convert to table format // MMSEQS_CREATEDB MMSEQS_CREATEDB ( fastas ) ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) - ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db // MMSEQS_TAXONOMY MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions ) - ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy + + ch_taxonomy_input_for_createtsv = MMSEQS_CREATEDB.out.db.dump(tag: 'db') + .join(MMSEQS_TAXONOMY.out.db_taxonomy.dump(tag: 'db_taxonomy')) + .dump(tag: 'post_join') + .multiMap { meta, db, db_taxonomy -> + db: [ meta,db ] + db_taxonomy: [ meta,db_taxonomy ] + } // MMSEQS_CREATETSV - MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb ) + MMSEQS_CREATETSV ( ch_taxonomy_input_for_createtsv.db, [[:],[]], ch_taxonomy_input_for_createtsv.db_taxonomy ) ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions ) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 0d2d0abe..0874bb62 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -92,7 +92,7 @@ workflow FUNCSCAN { meta, files -> def fasta_found = files.find{it.toString().tokenize('.').last().matches('fasta|fas|fna|fa')} def faa_found = files.find{it.toString().endsWith('.faa')} - def gbk_found = files.find{it.toString().tokenize('.').last().matches('gbk')} + def gbk_found = files.find{it.toString().tokenize('.').last().matches('gbk|gbff')} def fasta = fasta_found != null ? fasta_found : [] def faa = faa_found != null ? faa_found : [] def gbk = gbk_found != null ? 
gbk_found : [] @@ -185,9 +185,15 @@ workflow FUNCSCAN { // The final subworkflow reports need taxonomic classification. // This can be either on NT or AA level depending on annotation. - // TODO: Only NT at the moment. AA tax. classification will be added only when its PR is merged. if ( params.run_taxa_classification ) { - TAXA_CLASS ( ch_prepped_input.fastas ) + + if ( params.run_bgc_screening && !params.run_amp_screening && !params.run_arg_screening ) { + ch_input_for_taxonomy = ch_prepped_input_long.fastas.dump(tag: 'ch_prepped_input_long') + } else { + ch_input_for_taxonomy = ch_prepped_input.fastas.dump(tag: 'ch_prepped_input') + } + + TAXA_CLASS ( ch_input_for_taxonomy ) ch_versions = ch_versions.mix( TAXA_CLASS.out.versions ) ch_taxonomy_tsv = TAXA_CLASS.out.sample_taxonomy From b11bed10e83c81924a1116934a8fd8363d596a35 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 15 May 2024 11:02:46 +0200 Subject: [PATCH 39/45] Add more GBK/GBFF updates --- assets/schema_input.json | 4 ++-- docs/usage.md | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/assets/schema_input.json b/assets/schema_input.json index 2f75d80e..25efc523 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -35,8 +35,8 @@ "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.gbk(\\.gz)?$", - "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk", + "pattern": "^\\S+\\.(gbk|gbff)(\\.gz)?$", + "errorMessage": "Input file for feature annotations has incorrect file format. File must end in .gbk or .gbff", "unique": true, "dependentRequired": ["protein"] } diff --git a/docs/usage.md b/docs/usage.md index 76ac5061..bce08da6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -54,7 +54,7 @@ nf-core/funcscan takes FASTA files as input, typically contigs or whole genome s The input samplesheet has to be a comma-separated file (`.csv`) with 2 (`sample`, `fasta`) or 4 columns (`sample`, `fasta`, `protein`, `gbk`), and a header row as shown in the examples below. -If you already have annotated contigs with peptide sequences and an annotation file in `gbk` format, you can supply these to the pipeline using the optional `protein` and `gbk` columns. If these additional columns are supplied, pipeline annotation (i.e. with bakta, prodigal, pyrodigal or prokka) will be skipped and the corresponding annotation files used instead. +If you already have annotated contigs with peptide sequences and an annotation file in GenBank format (`.gbk` or `.gbff`), you can supply these to the pipeline using the optional `protein` and `gbk` columns. If these additional columns are supplied, pipeline annotation (i.e. with Bakta, Prodigal, Pyrodigal, or Prokka) will be skipped and the corresponding annotation files used instead. @@ -72,12 +72,12 @@ sample_1,///wastewater_metagenome_contigs_1.fasta.gz,///wast sample_2,///wastewater_metagenome_contigs_2.fasta.gz,///wastewater_metagenome_contigs_2.faa,///wastewater_metagenome_contigs_2.fasta.gbk ``` -| Column | Description | -| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`). | -| `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | -| `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | -| `gbk` | Optional path to a pre-generated annotation file in `.gbk` format containing annotations information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | +| Column | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This will be used to name all output files from the pipeline. Spaces in sample names are automatically converted to underscores (`_`).
| | `fasta` | Path or URL to a gzipped or uncompressed FASTA file. Accepted file suffixes are: `.fasta`, `.fna`, or `.fa`, or any of these with `.gz`, e.g. `.fa.gz`. | | `protein` | Optional path to a pre-generated amino acid FASTA file (`.faa`) containing protein annotations of `fasta`, optionally gzipped. Required to be supplied if `gbk` also given. | | `gbk` | Optional path to a pre-generated annotation file in GenBank format (`.gbk` or `.gbff`) containing annotation information of `fasta`, optionally gzipped. Required to be supplied if `protein` is also given. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. From 800dff9dc63c5f16569d1a54af06a5047cbcf273 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 15 May 2024 14:09:09 +0200 Subject: [PATCH 40/45] Remove dumps --- workflows/funcscan.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 0874bb62..4944a655 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -188,9 +188,9 @@ workflow FUNCSCAN { if ( params.run_bgc_screening && !params.run_amp_screening && !params.run_arg_screening ) { - ch_input_for_taxonomy = ch_prepped_input_long.fastas.dump(tag: 'ch_prepped_input_long') + ch_input_for_taxonomy = ch_prepped_input_long.fastas } else { - ch_input_for_taxonomy = ch_prepped_input.fastas.dump(tag: 'ch_prepped_input') + ch_input_for_taxonomy = ch_prepped_input.fastas } TAXA_CLASS ( ch_input_for_taxonomy ) From 50aa0761333c7de057f1a4dde23fe703fbccecfc Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 22 May 2024 11:11:19 +0200 Subject: [PATCH 41/45] Only do splitting when BGC workflow executed --- docs/output.md | 2 +- docs/usage.md | 4 ++-- workflows/funcscan.nf | 55 ++++++++++++++++++++++++++----------------- 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/docs/output.md b/docs/output.md index 69cbdfc6..ae542df3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -15,7 +15,7 @@ Similarly, all downloaded databases are saved (i.e. from [MMseqs2](https://githu Furthermore, for reproducibility, versions of all software used in the run are presented in a [MultiQC](http://multiqc.info) report. :::info -Note that (unannotated) input contigs will be split into two categories per sample: long and short.
Each sample will thus get two sets of results for each ARG/AMP screening (suffixed with `_long` and `_short` respectively, assuming contigs remain above/below the threshold), whereas for BGC results only `_long` will exist. This is because BGCs can only be reliably screened with longer contigs. +Note that if running the BGC subworkflow, (unannotated) input contigs will be split into two categories per sample: long and short. Each sample will thus get two sets of results for each ARG/AMP screening (suffixed with `_long` and `_short` respectively, assuming contigs remain above/below the threshold), whereas for BGC results only `_long` will exist. This is because BGCs can only be reliably screened with longer contigs. The threshold for the separation can be adjusted with `--contig_qc_lengththreshold`. ::: diff --git a/docs/usage.md b/docs/usage.md index b598d282..bb9814d2 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -84,9 +84,9 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p :::danger We highly recommend performing quality control on input contigs before running the pipeline. -For example, **for un-annotated** input nf-core/funcscan will by default filter for the BGC subworkflow to screen only contigs with at least 3,000 bp length or more (see `--contig_qc_lengththreshold`). +For example, **for un-annotated** input, if running the BGC screening subworkflow, nf-core/funcscan will by default filter the input so that only contigs of at least 3,000 bp length are screened (see `--contig_qc_lengththreshold`). This will split the input contigs into two files: one with contigs of sufficient length for BGC screening and one with contigs below the threshold. Only the former goes on to BGC screening, whereas both short and long contigs are used for AMP and ARG screening. Thus, when running the BGC subworkflow, all output files will be labelled with the suffix `_long` or `_short` to indicate the length of the contigs. -In contrast, no such filtering is done for the **pre-annotated** input sent to the BGC screening subworkflow! If you have pre-annotated contigs, make sure they and the annotation files contain sufficiently high quality and length for the type of molecule to be screened for. +In contrast, no such filtering or relabelling is performed for the **pre-annotated** input sent to the BGC screening subworkflow! If you have pre-annotated contigs, make sure they, and the corresponding annotation files, are of sufficiently high quality and length for the type of molecule to be screened for. ::: ## Notes on screening tools and taxonomic classification
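As a gloss on the long/short split documented above: it amounts to partitioning each input FASTA by a single length threshold. The pipeline delegates this to the `SEQKIT_SEQ_LONG`/`SEQKIT_SEQ_SHORT` module calls shown in the workflow diffs; the following is only a minimal, hypothetical Python sketch of the same partitioning, with the threshold value and function name as illustrative assumptions:

```python
from pathlib import Path

THRESHOLD = 3000  # mirrors the default --contig_qc_lengththreshold

def split_by_length(fasta: Path) -> tuple[dict, dict]:
    """Partition FASTA records into long (>= THRESHOLD bp) and short (< THRESHOLD bp)."""
    records: dict[str, str] = {}
    header = None
    chunks: list[str] = []
    for line in fasta.read_text().splitlines():
        if line.startswith(">"):
            if header is not None:
                records[header] = "".join(chunks)
            header, chunks = line[1:].strip(), []
        else:
            chunks.append(line.strip())
    if header is not None:
        records[header] = "".join(chunks)
    long_recs = {name: seq for name, seq in records.items() if len(seq) >= THRESHOLD}
    short_recs = {name: seq for name, seq in records.items() if len(seq) < THRESHOLD}
    return long_recs, short_recs
```

Both partitions are used for ARG/AMP screening, while only the long partition is passed to the BGC subworkflow.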
diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 0f0d2383..93b50cb4 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -107,28 +107,34 @@ workflow FUNCSCAN { // Split each FASTA into long and short contigs to // speed up BGC workflow with BGC-compatible contig lengths only - ch_intermediate_fasta_for_split = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } - SEQKIT_SEQ_LONG ( ch_intermediate_fasta_for_split ) - SEQKIT_SEQ_SHORT ( ch_intermediate_fasta_for_split ) - ch_versions = ch_versions.mix( SEQKIT_SEQ_LONG.out.versions ) - ch_versions = ch_versions.mix( SEQKIT_SEQ_SHORT.out.versions ) - - ch_intermediate_input_long = SEQKIT_SEQ_LONG.out.fastx - .map{ meta, file -> [ meta + [id: meta.id + '_long', length: "long" ], file ] } - .filter{ meta, fasta -> !fasta.isEmpty() } - - ch_intermediate_input_short = SEQKIT_SEQ_SHORT.out.fastx - .map{ meta, file -> [ meta + [id: meta.id + '_short', length: "short" ], file ] } - .filter{ meta, fasta -> !fasta.isEmpty() } - - // Now they are split, can be annotated together for efficiency - ch_input_for_annotation = ch_intermediate_input_long.mix( ch_intermediate_input_short ) + // Only if BGC screening is enabled! + if ( params.run_bgc_screening) { + + ch_intermediate_fasta_for_split = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } + SEQKIT_SEQ_LONG ( ch_intermediate_fasta_for_split ) + SEQKIT_SEQ_SHORT ( ch_intermediate_fasta_for_split ) + ch_versions = ch_versions.mix( SEQKIT_SEQ_LONG.out.versions ) + ch_versions = ch_versions.mix( SEQKIT_SEQ_SHORT.out.versions ) + + ch_intermediate_input_long = SEQKIT_SEQ_LONG.out.fastx + .map{ meta, file -> [ meta + [id: meta.id + '_long', length: "long" ], file ] } + .filter{ meta, fasta -> !fasta.isEmpty() } + + ch_intermediate_input_short = SEQKIT_SEQ_SHORT.out.fastx + .map{ meta, file -> [ meta + [id: meta.id + '_short', length: "short" ], file ] } + .filter{ meta, fasta -> !fasta.isEmpty() } + + // Now they are split, can be annotated together for efficiency + ch_input_for_annotation = ch_intermediate_input_long.mix( ch_intermediate_input_short ) + } else { + ch_input_for_annotation = ch_intermediate_input.fastas.map{ meta, fasta, faa, gbk -> [ meta, fasta ] } + } /* ANNOTATION */ @@ -161,6 +167,9 @@ workflow FUNCSCAN { gbks: [meta, gbk] } + // Generate the long-contigs-only channel only when BGC screening is enabled + if ( params.run_bgc_screening) { + ch_prepped_input_long = ch_new_annotation .filter{ meta, fasta, faa, gbk -> meta.length == "long" } .mix(ch_intermediate_input.preannotated) .map { meta, fasta, faa, gbk -> if ( params.run_bgc_screening && meta.length == null ) { log.warn("[nf-core/funcscan] Pre-annotated input will not be filtered to long contigs for BGC screening! Expect long run times and/or possible crashes if it includes very short contigs") } [meta, fasta, faa, gbk] } .multiMap { meta, fasta, faa, gbk -> fastas: [meta, fasta] faas: [meta, faa] gbks: [meta, gbk] } + } /* TAXONOMIC CLASSIFICATION */ From c419ce9cca8709c1b45d8542e3e48bcdf360f2ab Mon Sep 17 00:00:00 2001 From: "James A.
Fellows Yates" Date: Wed, 22 May 2024 11:12:03 +0200 Subject: [PATCH 42/45] Fix taxonomy workflow from possibly getting async between two input channels for CREATETSV --- subworkflows/local/taxa_class.nf | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/subworkflows/local/taxa_class.nf b/subworkflows/local/taxa_class.nf index c73eaa79..898f570f 100644 --- a/subworkflows/local/taxa_class.nf +++ b/subworkflows/local/taxa_class.nf @@ -9,7 +9,7 @@ include { MMSEQS_CREATETSV } from '../../modules/nf-core/mmseqs/createtsv/main' workflow TAXA_CLASS { take: - fastas // tuple val(meta), path(contigs) + contigs // tuple val(meta), path(contigs) main: ch_versions = Channel.empty() @@ -29,28 +29,27 @@ workflow TAXA_CLASS { } else { MMSEQS_DATABASES ( params.taxa_classification_mmseqs_databases_id ) ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions ) - ch_mmseqs_db = MMSEQS_DATABASES.out.database + ch_mmseqs_db = ( MMSEQS_DATABASES.out.database ) } // Create db for query contigs, assign taxonomy and convert to table format // MMSEQS_CREATEDB - MMSEQS_CREATEDB ( fastas ) + MMSEQS_CREATEDB ( contigs ) ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions ) // MMSEQS_TAXONOMY - MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db ) + MMSEQS_TAXONOMY ( MMSEQS_CREATEDB.out.db, ch_mmseqs_db ) ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions ) - ch_taxonomy_input_for_createtsv = MMSEQS_CREATEDB.out.db.dump(tag: 'db') - .join(MMSEQS_TAXONOMY.out.db_taxonomy.dump(tag: 'db_taxonomy')) - .dump(tag: 'post_join') - .multiMap { meta, db, db_taxonomy -> - db: [ meta,db ] - db_taxonomy: [ meta,db_taxonomy ] - } + ch_taxonomy_input_for_createtsv = MMSEQS_CREATEDB.out.db + .join(MMSEQS_TAXONOMY.out.db_taxonomy) + .multiMap { meta, db, db_taxonomy -> + db: [ meta,db ] + taxdb: [ meta, db_taxonomy ] + } // MMSEQS_CREATETSV - MMSEQS_CREATETSV ( ch_taxonomy_input_for_createtsv.db, [[:],[]], ch_taxonomy_input_for_createtsv.db_taxonomy ) + MMSEQS_CREATETSV ( ch_taxonomy_input_for_createtsv.taxdb, [[:],[]], ch_taxonomy_input_for_createtsv.db ) ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions ) ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv } From d1d0177d9caa5148b6a896106ea0733dc32c7d92 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 22 May 2024 12:10:18 +0200 Subject: [PATCH 43/45] Fix prokka annotation MQC collection --- workflows/funcscan.nf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 93b50cb4..6ea9f28c 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -398,9 +398,7 @@ workflow FUNCSCAN { ) ) - if( params.annotation_tool=='prokka' ) { - ch_multiqc_files = ch_multiqc_files.mix( PROKKA.out.txt.collect{it[1]}.ifEmpty([]) ) - } + ch_multiqc_files = ch_multiqc_files.mix( ANNOTATION.out.multiqc_files.collect{it[1]}.ifEmpty([]) ) MULTIQC ( ch_multiqc_files.collect(), From 8f1c7ba245a57fbd4af9ad23f7d0f0ec0ea32c20 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 22 May 2024 12:21:52 +0200 Subject: [PATCH 44/45] Fix linting --- conf/base.config | 5 ----- 1 file changed, 5 deletions(-) diff --git a/conf/base.config b/conf/base.config index c3d2523f..32c67616 100644 --- a/conf/base.config +++ b/conf/base.config @@ -79,11 +79,6 @@ process { time = { check_max( 8.h * task.attempt, 'time' ) } } - withName: PRODIGAL_GFF { - memory = { check_max( 2.GB * task.attempt, 'memory' ) } - cpus = 1 - } - withName: PRODIGAL_GBK { memory = { check_max( 2.GB * task.attempt, 'memory' ) } cpus = 1 From 2d8b238308c4b3b61df1bd773ee48b25dfbb8a0b Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Wed, 22 May 2024 16:00:39 +0200 Subject: [PATCH 45/45] Make it so deepBGC actually produces otutput, and START send only long fastas taxonom results to BGC --- conf/test_bgc.config | 4 ++++ workflows/funcscan.nf | 14 +++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/conf/test_bgc.config b/conf/test_bgc.config index d1419d86..c5e816ee 100644 --- a/conf/test_bgc.config +++ b/conf/test_bgc.config @@ -28,4 +28,8 @@ params { run_arg_screening = false run_amp_screening = false run_bgc_screening = true + + // Set scores so deepBGC can actually find a hit so comBGC is actually executed + bgc_deepbgc_score = 0.1 + } diff --git a/workflows/funcscan.nf b/workflows/funcscan.nf index 6ea9f28c..1e33e9ef 100644 --- a/workflows/funcscan.nf +++ b/workflows/funcscan.nf @@ -120,6 +120,7 @@ workflow FUNCSCAN { .map{ meta, file -> [ meta + [id: meta.id + '_long', length: "long" ], file ] } .filter{ meta, fasta -> + if ( fasta.isEmpty() ) { log.warn("[nf-core/funcscan] The following sample did not contain contigs longer than ${params.contig_qc_lengththreshold} BGC screening will not be executed: ${meta.id}") } !fasta.isEmpty() } @@ -178,7 +179,7 @@ workflow FUNCSCAN { .mix(ch_intermediate_input.preannotated) .map { meta, fasta, faa, gbk -> - if ( params.run_bgc_screening && meta.length == null ) { log.warn("[nf-core/funcscan] Pre-annotated input will not be filtered to long contigs for BGC screening! Expect long-run times and/or possible crashes if includes very short contigs") } + if ( params.run_bgc_screening && meta.length == null ) { log.warn("[nf-core/funcscan] Pre-annotated input will not be filtered to long contigs for BGC screening! Expect long-run times and/or possible crashes if includes very short contigs. Sample: ${meta.id}") } [meta, fasta, faa, gbk] } .multiMap { @@ -313,6 +314,9 @@ workflow FUNCSCAN { BGCs */ if ( params.run_bgc_screening && !params.run_taxa_classification ) { + + ch_filtered_taxonomytsv_for_bgc = ch_taxonomy_tsv.dump(tag: 'ch_taxonomy_tsv_for_bgc.tsv') + BGC ( ch_prepped_input_long.fastas, ch_prepped_input_long.faas @@ -327,19 +331,19 @@ workflow FUNCSCAN { if ( file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() }, - ch_taxonomy_tsv + ch_filtered_taxonomytsv_for_bgc ) ch_versions = ch_versions.mix( BGC.out.versions ) } else if ( params.run_bgc_screening && params.run_taxa_classification ) { BGC ( - ch_prepped_input.fastas, - ch_prepped_input.faas + ch_prepped_input_long.fastas, + ch_prepped_input_long.faas .filter { meta, file -> if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced produced an empty FAA file. 
BGC screening tools requiring this file will not be executed: ${meta.id}") !file.isEmpty() }, - ch_prepped_input.gbks + ch_prepped_input_long.gbks .filter { meta, file -> if ( file != [] && file.isEmpty() ) log.warn("[nf-core/funcscan] Annotation of following sample produced an empty GBK file. BGC screening tools requiring this file will not be executed: ${meta.id}")
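The `.filter { ... }` guards in the BGC hunks above all follow one pattern: warn once per sample, then drop the sample from the channel rather than let a downstream tool fail on an empty annotation file. As a minimal, hypothetical Python rendering of that guard — logger setup, function name, and file names are all illustrative, not pipeline code:

```python
import logging
from pathlib import Path

logging.basicConfig(level=logging.WARNING)
log = logging.getLogger("funcscan")

def keep_for_bgc_screening(sample_id: str, annotation: Path) -> bool:
    """Warn and drop samples whose annotation file is missing or empty."""
    if not annotation.exists() or annotation.stat().st_size == 0:
        log.warning(
            "Annotation of sample %s produced an empty file; "
            "BGC screening tools requiring it will not be executed.",
            sample_id,
        )
        return False
    return True

# Usage sketch: keep only samples whose FAA files are non-empty.
faa_files = {"sample_1": Path("sample_1.faa"), "sample_2": Path("sample_2.faa")}
to_screen = {s: f for s, f in faa_files.items() if keep_for_bgc_screening(s, f)}
```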