Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SubWorkflow: mmseqs_contig_taxonomy #5524

Merged
merged 16 commits into from
May 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions subworkflows/nf-core/mmseqs_contig_taxonomy/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
include { MMSEQS_CREATEDB } from '../../../modules/nf-core/mmseqs/createdb/main'
include { MMSEQS_DATABASES } from '../../../modules/nf-core/mmseqs/databases/main'
include { MMSEQS_TAXONOMY } from '../../../modules/nf-core/mmseqs/taxonomy/main'
include { MMSEQS_CREATETSV } from '../../../modules/nf-core/mmseqs/createtsv/main'

//
// Assign taxonomy to contigs with MMseqs2.
// Steps: resolve the reference db (local path or MMSEQS_DATABASES download),
// build a query db per contig sample (MMSEQS_CREATEDB), classify it
// (MMSEQS_TAXONOMY) and convert the result to a table (MMSEQS_CREATETSV).
//
workflow MMSEQS_CONTIG_TAXONOMY {

take:
contigs // channel: [ val(meta), path(contigs) ]
mmseqs_databases // value: path to a local mmseqs2 db, or an empty value (e.g. []) to download one. NOTE(review): consumed via `.empty` and Channel.fromPath below, so a channel is presumably not accepted here - confirm
databases_id // mmseqs2 database ID handed to MMSEQS_DATABASES (e.g. 'Kalamari'); only used when no local db is supplied

main:

ch_versions = Channel.empty()
ch_mmseqs_db = Channel.empty()
ch_taxonomy_querydb = Channel.empty()
ch_taxonomy_querydb_taxdb = Channel.empty()
ch_taxonomy_tsv = Channel.empty()

// Use the local reference db when one is supplied, otherwise download it
// MMSEQS_DATABASES
if ( !mmseqs_databases.empty ) {
// Wrap the supplied path in a channel and keep only its first item
// (presumably so the single db can be reused for every query db - see the `first` operator docs)
ch_mmseqs_db = Channel
.fromPath( mmseqs_databases )
.first()
} else {
MMSEQS_DATABASES ( databases_id )
ch_versions = ch_versions.mix( MMSEQS_DATABASES.out.versions )
ch_mmseqs_db = ( MMSEQS_DATABASES.out.database )
}

// Create db for query contigs, assign taxonomy and convert to table format
// MMSEQS_CREATEDB
MMSEQS_CREATEDB ( contigs )
ch_versions = ch_versions.mix( MMSEQS_CREATEDB.out.versions )
ch_taxonomy_querydb = MMSEQS_CREATEDB.out.db

// MMSEQS_TAXONOMY
MMSEQS_TAXONOMY ( ch_taxonomy_querydb, ch_mmseqs_db )
ch_versions = ch_versions.mix( MMSEQS_TAXONOMY.out.versions )
ch_taxonomy_querydb_taxdb = MMSEQS_TAXONOMY.out.db_taxonomy

// MMSEQS_CREATETSV
// The second argument is an empty [ meta, path ] placeholder - TODO confirm against the module which db slot it fills
MMSEQS_CREATETSV ( ch_taxonomy_querydb_taxdb, [[:],[]], ch_taxonomy_querydb )
ch_versions = ch_versions.mix( MMSEQS_CREATETSV.out.versions )
ch_taxonomy_tsv = MMSEQS_CREATETSV.out.tsv

emit:
taxonomy = ch_taxonomy_tsv // channel: [ val(meta), tsv ]
db_mmseqs = ch_mmseqs_db // channel: [ mmseqs_database ] (bare path, no meta - the test reads it with .get(0) only)
db_taxonomy = ch_taxonomy_querydb_taxdb // channel: [ val(meta), db_taxonomy ]
db_contig = ch_taxonomy_querydb // channel: [ val(meta), db ]
versions = ch_versions // channel: [ versions.yml ]
}

68 changes: 68 additions & 0 deletions subworkflows/nf-core/mmseqs_contig_taxonomy/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "mmseqs_contig_taxonomy"
description: Assign taxonomy to contigs using the MMseqs2 workflow.
keywords:
- metagenomics
- database
- contigs
- mmseqs2
- taxonomy
components:
- mmseqs/databases
- mmseqs/createdb
- mmseqs/taxonomy
- mmseqs/createtsv
input:
- contigs:
type: file
description: |
Channel containing each fasta in nucleotide format as a distinct element with meta.
Structure: [ val(meta), path(fasta) ]
pattern: "*.{fasta,fa,fna}"
- mmseqs_databases:
type: string
description: |
Path to a local database created by mmseqs2 databases. Pass an empty list ([]) to have the database named by databases_id downloaded instead.
Structure: path(mmseqsdb)
pattern: "*/mmseqs/database"
- databases_id:
type: string
description: |
Channel containing the ID of a database made available by developers of mmseqs2. Please refer to https://github.com/soedinglab/MMseqs2/wiki#downloading-databases for possible IDs to use.
Structure: [ val(id) ]
output:
- taxonomy:
type: file
description: |
Channel containing the tab-separated file with all assigned taxonomy.
Structure: [ val(meta), path(tsv) ]
pattern: "*.tsv"
- db_mmseqs:
type: directory
description: |
Channel containing the mmseqs database directory. Useful when the database is downloaded in the pipeline.
Structure: [ path(outputdir) ]
pattern: "*/mmseqs_database"
- db_taxonomy:
type: directory
description: |
Channel containing the database containing the taxonomic classification for each input fasta file.
Structure: [ val(meta), path(outputdir) ]
pattern: "*/sample_taxonomy"
- db_contig:
type: directory
description: |
Channel containing the database containing the mmseqs format of each input fasta file.
Structure: [ val(meta), path(outputdir) ]
pattern: "*/sample_db"
- versions:
type: file
description: |
File containing software versions
Structure: [ path(versions.yml) ]
pattern: "versions.yml"

authors:
- "@darcy220606"
maintainers:
- "@darcy220606"
60 changes: 60 additions & 0 deletions subworkflows/nf-core/mmseqs_contig_taxonomy/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
// nf-test spec for the MMSEQS_CONTIG_TAXONOMY subworkflow.
// Runs the subworkflow on a gunzipped Bacteroides fragilis genome without a
// local reference db, so the 'Kalamari' database is downloaded by the workflow.
nextflow_workflow {

    name "Test Subworkflow MMSEQS_CONTIG_TAXONOMY"
    script "../main.nf"
    workflow "MMSEQS_CONTIG_TAXONOMY"
    config './nextflow.config'

    tag "subworkflows"
    tag "subworkflows_nfcore"
    tag "subworkflows/mmseqs_contig_taxonomy"
    tag "gunzip"
    tag "mmseqs/createdb"
    tag "mmseqs/databases"
    tag "mmseqs/taxonomy"
    tag "mmseqs/createtsv"

    test("mmseqs_contig_taxonomy - bacteroides_fragilis - contig") {

        // Decompress the test genome first; its output feeds the subworkflow.
        setup {
            run("GUNZIP") {
                script "modules/nf-core/gunzip/main.nf"
                process {
                    """
                    input[0] = Channel.of([
                        [id:'bacteroides_fragilis'],
                        file(params.test_data['bacteroides_fragilis']['genome']['genome_fna_gz'], checkIfExists: true)
                    ]
                    )
                    """
                }
            }
        }

        // input[1] is left empty so that the subworkflow takes its download
        // branch and fetches the database named by input[2].
        when {
            workflow {
                """
                input[0] = GUNZIP.out.gunzip
                input[1] = []
                input[2] = 'Kalamari'
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                // The contig db is emitted as [ meta, path ]; the path is named after the sample id.
                { assert workflow.out.db_contig.get(0).get(1) ==~ ".*/bacteroides_fragilis" },
                // Snapshot only stable facts: version files, presence of the expected
                // contig id in the first TSV line, and file/directory names.
                { assert snapshot (
                    workflow.out.versions,
                    file(workflow.out.taxonomy[0][1]).readLines()[0].contains("NZ_CP069563.1"),
                    file(workflow.out.db_taxonomy.get(0).get(1)).list().sort(),
                    file(workflow.out.db_contig.get(0).get(1)).name,
                    file(workflow.out.db_mmseqs.get(0)).name,
                    ).match()
                },
            )
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"mmseqs_contig_taxonomy - bacteroides_fragilis - contig": {
"content": [
[
"versions.yml:md5,394dfc0a2af83eb4c8ec9e180cb44b37",
"versions.yml:md5,49890601bcc79306ea202d0901d0578e",
"versions.yml:md5,50d8f191f53c3da260e67aa3ca64fd77",
"versions.yml:md5,b74ec13e6b0d418f76a233963be3c61c"
],
true,
[
"bacteroides_fragilis.0",
"bacteroides_fragilis.1",
"bacteroides_fragilis.dbtype",
"bacteroides_fragilis.index"
],
"bacteroides_fragilis",
"mmseqs_database"
],
"meta": {
"nf-test": "0.8.4",
"nextflow": "23.10.1"
},
"timestamp": "2024-05-17T11:25:44.10208454"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Process-level overrides. NOTE(review): this appears to be the
// './nextflow.config' loaded by the subworkflow test - confirm the file path.
process {

// Settings applied only to the MMSEQS_TAXONOMY process
withName: MMSEQS_TAXONOMY {
// Extra arguments exposed to the process as ext.args
// (see the MMseqs2 documentation for the meaning of --search-type values)
ext.args = '--search-type 2'
// Memory request for this process
memory = 7.GB
}

}
2 changes: 2 additions & 0 deletions subworkflows/nf-core/mmseqs_contig_taxonomy/tests/tags.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
subworkflows/mmseqs_contig_taxonomy:
- subworkflows/nf-core/mmseqs_contig_taxonomy/**
Loading