diff --git a/.github/workflows/pytest-workflow.yml b/.github/workflows/pytest-workflow.yml index a5d9c2872a..3ffff196ef 100644 --- a/.github/workflows/pytest-workflow.yml +++ b/.github/workflows/pytest-workflow.yml @@ -51,6 +51,8 @@ jobs: tags: snpeff - profile: "conda" tags: vep + - profile: "conda" + tags: concatenate_vcfs - profile: "singularity" tags: merge env: diff --git a/CHANGELOG.md b/CHANGELOG.md index 99b4f6d855..a3866ede27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - [#864](https://github.com/nf-core/sarek/pull/864) - Added possibilities to export assembled haplotypes and locally realigned reads +- [#792](https://github.com/nf-core/sarek/pull/792) - Added the option `--concatenate_vcfs` for concatenating the germline vcf-files. Per default, the resulting vcf-files will be placed under `/variant_calling/concat`. ### Changed diff --git a/conf/modules/post_variant_calling.config b/conf/modules/post_variant_calling.config new file mode 100644 index 0000000000..4f30f9b69b --- /dev/null +++ b/conf/modules/post_variant_calling.config @@ -0,0 +1,44 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. + ext.when = When to run the module. 
+---------------------------------------------------------------------------------------- +*/ + +// POSTPROCESSING VCFS +// Like, for instance, concatenating the unannotated, germline vcf-files + +process { + withName: 'GERMLINE_VCFS_CONCAT'{ + publishDir = [ + //specify to avoid publishing, overwritten otherwise + enabled: false + ] + } + + withName: 'GERMLINE_VCFS_CONCAT_SORT'{ + ext.prefix = { "${meta.id}.germline" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } + ] + } + + withName: 'TABIX_EXT_VCF_.*' { + ext.prefix = { "${input.baseName}" } + } + + withName: 'TABIX_GERMLINE_VCFS_CONCAT_SORT'{ + ext.prefix = { "${meta.id}.germline" } + publishDir = [ + mode: params.publish_dir_mode, + path: { "${params.outdir}/variant_calling/concat/${meta.id}/" } + ] + } +} diff --git a/modules.json b/modules.json index 05c489d2d3..4666c3dc06 100644 --- a/modules.json +++ b/modules.json @@ -15,6 +15,10 @@ "git_sha": "6301e29d77e7ec7ce98b55b8a361b316a9a91bfe", "installed_by": ["modules"] }, + "bcftools/concat": { + "branch": "master", + "git_sha": "5e34754d42cd2d5d248ca8673c0a53cdf5624905" + }, "bcftools/sort": { "branch": "master", "git_sha": "78cf39939fbe160a1410c44a6c5946f9a4c56e7e",
diff --git a/modules/local/add_info_to_vcf/main.nf b/modules/local/add_info_to_vcf/main.nf
new file mode 100644
index 0000000000..91c4e6b3b4
--- /dev/null
+++ b/modules/local/add_info_to_vcf/main.nf
@@ -0,0 +1,48 @@
+// Tag every variant record of a compressed VCF with the name of the file it came from, so
+// that records stay traceable to their origin once several VCFs are concatenated.
+//
+// input:  tuple val(meta), path(vcf_gz) -- compressed VCF that `zcat` can read
+// output: vcf      -- uncompressed "*.added_info.vcf" whose header declares INFO/SOURCE and
+//                     whose records carry SOURCE=<input file name> in the INFO column
+//         versions -- versions.yml reporting the awk version
+process ADD_INFO_TO_VCF {
+    tag "$meta.id"
+
+    conda (params.enable_conda ? "anaconda::gawk=5.1.0" : null)
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/gawk:5.1.0' :
+        'quay.io/biocontainers/gawk:5.1.0' }"
+
+    input:
+    tuple val(meta), path(vcf_gz)
+
+    output:
+    tuple val(meta), path("*.added_info.vcf"), emit: vcf
+    path "versions.yml"                      , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    """
+    input="input.vcf"
+    output="${vcf_gz.baseName.minus(".vcf")}.added_info.vcf"
+    zcat $vcf_gz > \$input
+    ## Add info header lines
+    grep -E "^##" \$input > \$output
+    ## Declare the new INFO key so that the header matches the records written below
+    echo '##INFO=<ID=SOURCE,Number=1,Type=String,Description="Name of the VCF file the record originates from">' >> \$output
+    ## Add column header
+    grep -E "^#CHROM" \$input >> \$output
+    ## Add SOURCE value to INFO column of variant calls
+    ## (-q: only test whether any record exists, do not print the records to the task's stdout)
+    if grep -qEv "^#" \$input; then
+        grep -Ev "^#" \$input | awk -v src="$vcf_gz" 'BEGIN{FS=OFS="\t"} { \$8 = (\$8 == "." ? "" : \$8 ";") "SOURCE=" src; print }' >> \$output
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        gawk: \$(awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bcftools/concat/main.nf b/modules/nf-core/bcftools/concat/main.nf new file mode 100644 index 0000000000..d2a58a557a --- /dev/null +++ b/modules/nf-core/bcftools/concat/main.nf @@ -0,0 +1,35 @@ +process BCFTOOLS_CONCAT { + tag "$meta.id" + label 'process_medium' + + conda (params.enable_conda ? "bioconda::bcftools=1.15.1" : null) + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/bcftools:1.15.1--h0ea216a_0': + 'quay.io/biocontainers/bcftools:1.15.1--h0ea216a_0' }" + + input: + tuple val(meta), path(vcfs), path(tbi) + + output: + tuple val(meta), path("*.gz"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + bcftools concat \\ + --output ${prefix}.vcf.gz \\ + $args \\ + --threads $task.cpus \\ + ${vcfs} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bcftools/concat/meta.yml b/modules/nf-core/bcftools/concat/meta.yml new file mode 100644 index 0000000000..167dbe5a05 --- /dev/null +++ b/modules/nf-core/bcftools/concat/meta.yml @@ -0,0 +1,48 @@ +name: bcftools_concat +description: Concatenate VCF files +keywords: + - variant calling + - concat + - bcftools + - VCF + +tools: + - concat: + description: | + Concatenate VCF files. + homepage: http://samtools.github.io/bcftools/bcftools.html + documentation: http://www.htslib.org/doc/bcftools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcfs: + type: files + description: | + List containing 2 or more vcf files + e.g. [ 'file1.vcf', 'file2.vcf' ] + - tbi: + type: files + description: | + List containing 2 or more index files (optional) + e.g. [ 'file1.tbi', 'file2.tbi' ] +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - vcf: + type: file + description: VCF concatenated output file + pattern: "*.{vcf.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" diff --git a/nextflow.config b/nextflow.config index c057d0c43d..cc59e939e7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -63,6 +63,7 @@ params { cf_minqual = 0 // ControlFreec default values cf_window = null // by default we are not using this in Control-FREEC cnvkit_reference = null // by default the reference is build from the fasta file + concatenate_vcfs = false // by default we don't concatenate the germline-vcf-files ignore_soft_clipped_bases = false // no --dont-use-soft-clipped-bases for GATK Mutect2 wes = false // Set to true, if data is exome/targeted sequencing data. Used to use correct models in various variant callers joint_germline = false // g.vcf & joint germline calling are not run by default if HaplotypeCaller is selected @@ -316,6 +317,8 @@ includeConfig 'conf/modules/mutect2.config' includeConfig 'conf/modules/strelka.config' includeConfig 'conf/modules/tiddit.config' +includeConfig 'conf/modules/post_variant_calling.config' + //annotate includeConfig 'conf/modules/annotate.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 1e75f7fa45..f84f1bed2f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -226,6 +226,12 @@ "default": "", "fa_icon": "fas fa-toolbox", "properties": { + "concatenate_vcfs": { + "type": "boolean", + "fa_icon": "fas fa-merge", + "description": "Option for concatenating germline vcf-files.", + "help_text": "Concatenating the germline vcf-files from each applied variant-caller into one vcf-file using bcftools concat."
+ }, "only_paired_variant_calling": { "type": "boolean", "fa_icon": "fas fa-forward", diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf index 72391de44c..4c82912adf 100644 --- a/subworkflows/local/bam_variant_calling_germline_all/main.nf +++ b/subworkflows/local/bam_variant_calling_germline_all/main.nf @@ -37,7 +37,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { //TODO: Temporary until the if's can be removed and printing to terminal is prevented with "when" in the modules.config deepvariant_vcf = Channel.empty() freebayes_vcf = Channel.empty() - genotype_gvcf = Channel.empty() haplotypecaller_vcf = Channel.empty() manta_vcf = Channel.empty() mpileup_vcf = Channel.empty() @@ -95,7 +94,6 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { dict ) - mpileup_germline = BAM_VARIANT_CALLING_MPILEUP.out.mpileup mpileup_vcf = BAM_VARIANT_CALLING_MPILEUP.out.vcf ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_MPILEUP.out.versions) } @@ -116,7 +114,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { [] ) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_CNVKIT.out.versions) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_CNVKIT.out.versions) } // DEEPVARIANT @@ -128,8 +126,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { fasta_fai ) - deepvariant_vcf = Channel.empty().mix(BAM_VARIANT_CALLING_DEEPVARIANT.out.deepvariant_vcf) - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_DEEPVARIANT.out.versions) + deepvariant_vcf = Channel.empty().mix(BAM_VARIANT_CALLING_DEEPVARIANT.out.deepvariant_vcf) + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_DEEPVARIANT.out.versions) } // FREEBAYES @@ -147,8 +145,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { fasta_fai ) - freebayes_vcf = BAM_VARIANT_CALLING_FREEBAYES.out.freebayes_vcf - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_FREEBAYES.out.versions) + freebayes_vcf = BAM_VARIANT_CALLING_FREEBAYES.out.freebayes_vcf + ch_versions = 
ch_versions.mix(BAM_VARIANT_CALLING_FREEBAYES.out.versions) } // HAPLOTYPECALLER @@ -184,8 +182,9 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { known_sites_snps_tbi, intervals_bed_combined_haplotypec) - haplotypecaller_vcf = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.filtered_vcf - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions) + haplotypecaller_vcf = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.filtered_vcf + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions) + } // MANTA @@ -197,8 +196,9 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { fasta_fai ) - manta_vcf = BAM_VARIANT_CALLING_GERMLINE_MANTA.out.manta_vcf - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_GERMLINE_MANTA.out.versions) + + manta_vcf = BAM_VARIANT_CALLING_GERMLINE_MANTA.out.manta_vcf + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_GERMLINE_MANTA.out.versions) } // STRELKA @@ -210,8 +210,8 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { fasta_fai ) - strelka_vcf = BAM_VARIANT_CALLING_SINGLE_STRELKA.out.strelka_vcf - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SINGLE_STRELKA.out.versions) + strelka_vcf = BAM_VARIANT_CALLING_SINGLE_STRELKA.out.strelka_vcf + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SINGLE_STRELKA.out.versions) } //TIDDIT @@ -222,14 +222,13 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL { bwa ) - tiddit_vcf = BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.tiddit_vcf - ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.versions) + tiddit_vcf = BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.tiddit_vcf + ch_versions = ch_versions.mix(BAM_VARIANT_CALLING_SINGLE_TIDDIT.out.versions) } emit: deepvariant_vcf freebayes_vcf - genotype_gvcf haplotypecaller_vcf manta_vcf mpileup_vcf diff --git a/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf b/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf index ebf13d2d47..62a8fc06b2 100644 --- a/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf 
+++ b/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf @@ -134,7 +134,20 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER { known_sites_indels.concat(known_sites_snps).flatten().unique().collect(), known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect()) - filtered_vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf.map{ meta, vcf-> [[patient:meta.patient, sample:meta.sample, status:meta.status, sex:meta.sex, id:meta.sample, num_intervals:meta.num_intervals, variantcaller:"haplotypecaller"], vcf]} + filtered_vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf.map{ meta, vcf-> [ + [ + patient:meta.patient, + sample:meta.sample, + status:meta.status, + sex:meta.sex, + id:meta.sample, + num_intervals:meta.num_intervals, + variantcaller:"haplotypecaller" + ], + vcf + ] + } + ch_versions = ch_versions.mix(GATK4_HAPLOTYPECALLER.out.versions) ch_versions = ch_versions.mix(MERGE_HAPLOTYPECALLER.out.versions) ch_versions = ch_versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions) diff --git a/subworkflows/local/bam_variant_calling_mpileup/main.nf b/subworkflows/local/bam_variant_calling_mpileup/main.nf index 211b206388..4416071ed8 100644 --- a/subworkflows/local/bam_variant_calling_mpileup/main.nf +++ b/subworkflows/local/bam_variant_calling_mpileup/main.nf @@ -72,5 +72,5 @@ workflow BAM_VARIANT_CALLING_MPILEUP { emit: versions = ch_versions mpileup = Channel.empty().mix(CAT_MPILEUP.out.file_out, mpileup.no_intervals) - vcf = Channel.empty().mix(GATK4_MERGEVCFS.out.vcf,vcfs.no_intervals) + vcf = Channel.empty().mix(GATK4_MERGEVCFS.out.vcf, vcfs.no_intervals) } diff --git a/subworkflows/local/bam_variant_calling_single_strelka/main.nf b/subworkflows/local/bam_variant_calling_single_strelka/main.nf index a7e0f641c3..4eefb3fa99 100644 --- a/subworkflows/local/bam_variant_calling_single_strelka/main.nf +++ b/subworkflows/local/bam_variant_calling_single_strelka/main.nf @@ -76,7 +76,7 @@ workflow BAM_VARIANT_CALLING_SINGLE_STRELKA { sex: 
meta.sex, status: meta.status, variantcaller: "strelka" - ],vcf] + ], vcf] } ch_versions = ch_versions.mix(MERGE_STRELKA.out.versions) diff --git a/subworkflows/local/vcf_variant_filtering_gatk/main.nf b/subworkflows/local/vcf_variant_filtering_gatk/main.nf index fc26a8d656..3808d3e533 100644 --- a/subworkflows/local/vcf_variant_filtering_gatk/main.nf +++ b/subworkflows/local/vcf_variant_filtering_gatk/main.nf @@ -66,4 +66,3 @@ workflow VCF_VARIANT_FILTERING_GATK { versions = ch_versions filtered_vcf } - diff --git a/tests/config/pytest_tags.yml b/tests/config/pytest_tags.yml index ed429079d8..64b988e65c 100644 --- a/tests/config/pytest_tags.yml +++ b/tests/config/pytest_tags.yml @@ -263,3 +263,20 @@ vep: - modules/nf-core/ensemblvep/main.nf - modules/nf-core/tabix/bgziptabix/main.nf - subworkflows/nf-core/vcf_annotate_ensemblvep/main.nf + +## concatenate germline vcfs +concatenate_vcfs: + - conf/modules/post_variant_calling.config + - modules/nf-core/deepvariant/main.nf # deepvariant + - modules/nf-core/tabix/tabix/main.nf + - modules/nf-core/freebayes/main.nf # freebayes + - modules/nf-core/gatk4/haplotypecaller/main.nf # haplotypecaller + - modules/nf-core/manta/germline/main.nf # manta + - modules/nf-core/bcftools/mpileup/main.nf # mpileup/bcftools + - modules/nf-core/bcftools/sort/main.nf + - modules/nf-core/tabix/bgziptabix/main.nf + - modules/nf-core/bcftools/concat/main.nf + - modules/nf-core/samtools/mpileup/main.nf + - modules/nf-core/gatk4/mergevcfs/main.nf # gatk4/mergevcfs + - modules/nf-core/strelka/germline/main.nf # strelka + - modules/nf-core/tiddit/sv/main.nf # tiddit diff --git a/tests/csv/3.0/mapped_joint_bam.fixed.csv b/tests/csv/3.0/mapped_joint_bam.fixed.csv new file mode 100644 index 0000000000..1dc3920b1e --- /dev/null +++ b/tests/csv/3.0/mapped_joint_bam.fixed.csv @@ -0,0 +1,3 @@ +patient,status,sample,bam,bai 
+testN,0,testN,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test.paired_end.sorted.bam.bai +testT,0,testT,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam,https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/illumina/bam/test2.paired_end.sorted.bam.bai diff --git a/tests/test_concat_germline_vcfs.yml b/tests/test_concat_germline_vcfs.yml new file mode 100644 index 0000000000..8b23948a70 --- /dev/null +++ b/tests/test_concat_germline_vcfs.yml @@ -0,0 +1,20 @@ +- name: Run all germline variant callers and check for existence of concatenated vcf-files + command: nextflow run main.nf -profile test --input ./tests/csv/3.0/mapped_joint_bam.fixed.csv --concatenate_vcfs --tools deepvariant,freebayes,haplotypecaller,manta,mpileup,strelka,tiddit --step variant_calling + tags: + - concatenate_vcfs + files: + - path: results/variant_calling/concat/testN/testN.germline.vcf.gz + # binary changes md5sums on reruns. + contains: + [ + "SOURCE=testN.deepvariant.vcf.gz", + "AB=0.167832;ABP=277.102;AC=1;AF=0.5;AN=2;AO=48;CIGAR=1X;DP=286;DPB=286;DPRA=0;EPP=3.0103;EPPR=3.0103;GTI=0;LEN=1;MEANALT=1;MQM=60;MQMR=60;NS=1;NUMALT=1;ODDS=105.855;PAIRED=1;PAIREDR=1;PAO=0;PQA=0;PQR=0;PRO=0;QA=2017;QR=9863;RO=238;RPL=0;RPP=107.241;RPPR=519.821;RPR=48;RUN=1;SAF=24;SAP=3.0103;SAR=24;SRF=119;SRP=3.0103;SRR=119;TYPE=snp;technology.illumina=1;SOURCE=testN.freebayes.vcf.gz", + "SNVHPOL=7;MQ=60;SOURCE=testN.strelka.variants.vcf.gz", + "DP=2;SGB=-0.379885;FS=0;MQ0F=0;AC=2;AN=2;DP4=0,0,0,1;MQ=60;SOURCE=testN.bcftools.vcf.gz", + ] + - path: results/variant_calling/concat/testT/testT.germline.vcf.gz + # binary changes md5sums on reruns. 
+ - path: results/variant_calling/concat/testN/testN.germline.vcf.gz.tbi + # binary changes md5sums on reruns. + - path: results/variant_calling/concat/testT/testT.germline.vcf.gz.tbi + # binary changes md5sums on reruns. diff --git a/workflows/sarek.nf b/workflows/sarek.nf index c67c5789c1..e37346f9bc 100644 --- a/workflows/sarek.nf +++ b/workflows/sarek.nf @@ -267,6 +267,25 @@ include { BAM_VARIANT_CALLING_TUMOR_ONLY_ALL } from '../subworkflows // Variant calling on tumor/normal pair include { BAM_VARIANT_CALLING_SOMATIC_ALL } from '../subworkflows/local/bam_variant_calling_somatic_all/main' +// Concatenation of germline vcf-files +include { ADD_INFO_TO_VCF as ADD_INFO_TO_DV_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_FB_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_HTC_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_MANTA_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_MPILEUP_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_STRELKA_VCF } from '../modules/local/add_info_to_vcf/main' +include { ADD_INFO_TO_VCF as ADD_INFO_TO_TIDDIT_VCF } from '../modules/local/add_info_to_vcf/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_DV } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_FB } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_HTC } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_MANTA } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_MPILEUP } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_STRELKA } from '../modules/nf-core/tabix/bgziptabix/main' +include { TABIX_BGZIPTABIX as TABIX_EXT_VCF_TIDDIT } from 
'../modules/nf-core/tabix/bgziptabix/main' +include { BCFTOOLS_CONCAT as GERMLINE_VCFS_CONCAT } from '../modules/nf-core/bcftools/concat/main' +include { BCFTOOLS_SORT as GERMLINE_VCFS_CONCAT_SORT } from '../modules/nf-core/bcftools/sort/main' +include { TABIX_TABIX as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../modules/nf-core/tabix/tabix/main' + // QC on VCF files include { VCF_QC_BCFTOOLS_VCFTOOLS } from '../subworkflows/local/vcf_qc_bcftools_vcftools/main' @@ -1001,14 +1020,66 @@ workflow SAREK { rt_file ) + if (params.concatenate_vcfs) { + // Concatenate vcf-files + + ADD_INFO_TO_DV_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.deepvariant_vcf) + TABIX_EXT_VCF_DV(ADD_INFO_TO_DV_VCF.out.vcf) + + ADD_INFO_TO_FB_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.freebayes_vcf) + TABIX_EXT_VCF_FB(ADD_INFO_TO_FB_VCF.out.vcf) + + ADD_INFO_TO_HTC_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.haplotypecaller_vcf) + TABIX_EXT_VCF_HTC(ADD_INFO_TO_HTC_VCF.out.vcf) + + ADD_INFO_TO_MANTA_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.manta_vcf) + TABIX_EXT_VCF_MANTA(ADD_INFO_TO_MANTA_VCF.out.vcf) + + ADD_INFO_TO_MPILEUP_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.mpileup_vcf) + TABIX_EXT_VCF_MPILEUP(ADD_INFO_TO_MPILEUP_VCF.out.vcf) + + ADD_INFO_TO_STRELKA_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.strelka_vcf) + TABIX_EXT_VCF_STRELKA(ADD_INFO_TO_STRELKA_VCF.out.vcf) + + ADD_INFO_TO_TIDDIT_VCF(BAM_VARIANT_CALLING_GERMLINE_ALL.out.tiddit_vcf) + TABIX_EXT_VCF_TIDDIT(ADD_INFO_TO_TIDDIT_VCF.out.vcf) + + // Gather vcfs and vcf-tbis for concatenating germline-vcfs + germline_vcfs_with_tbis = Channel.empty() + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_DV.out.gz_tbi) + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_FB.out.gz_tbi) + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_HTC.out.gz_tbi) + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_MANTA.out.gz_tbi) + germline_vcfs_with_tbis = 
germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_MPILEUP.out.gz_tbi) + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_STRELKA.out.gz_tbi) + germline_vcfs_with_tbis = germline_vcfs_with_tbis.mix(TABIX_EXT_VCF_TIDDIT.out.gz_tbi) + + germline_vcfs_with_tbis = germline_vcfs_with_tbis.map{ + meta, vcf, tbi -> + def new_meta = meta.clone() + new_meta.remove('variantcaller') + new_meta.remove('tumor_id') + new_meta.remove('normal_id') + new_meta.remove('sample') + new_meta.remove('status') + new_meta.remove('num_intervals') + new_meta.remove('data_type') + [new_meta, vcf, tbi] + }.groupTuple() + + GERMLINE_VCFS_CONCAT(germline_vcfs_with_tbis) + GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT.out.vcf) + TABIX_GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT_SORT.out.vcf) + } + // Gather vcf files for annotation and QC vcf_to_annotate = Channel.empty() vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.deepvariant_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.freebayes_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.haplotypecaller_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.manta_vcf) - vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.tiddit_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.strelka_vcf) + vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_GERMLINE_ALL.out.tiddit_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.freebayes_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.mutect2_vcf) vcf_to_annotate = vcf_to_annotate.mix(BAM_VARIANT_CALLING_TUMOR_ONLY_ALL.out.manta_vcf)