Merge pull request #1173 from FriederikeHanssen/refactor_hc

nf-core · Aug 16, 2023 · f63596a · f63596a
2 parents 38c57e6 + 89163ce
commit f63596a
Show file tree

Hide file tree

Showing 11 changed files with 250 additions and 147 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1153](https://github.com/nf-core/sarek/pull/1153) - Add input validation for Sentieon & FGBio UMI incompatibility
 - [#1158](https://github.com/nf-core/sarek/pull/1158) - Add preprint
 - [#1159](https://github.com/nf-core/sarek/pull/1159) - ISMB Poster
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - CI tests for VQSR track with stub runs
 
 ### Changed
 
@@ -23,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1059](https://github.com/nf-core/sarek/pull/1059) - Add `nf-validation` for samplesheet validation
 - [#1160](https://github.com/nf-core/sarek/pull/1160) - Updating tiddit to v3.6.1
 - [#1166](https://github.com/nf-core/sarek/pull/1166) - More info about `--tools`
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - Refactor single sample filtering of Haplotypecaller generated VCFs ([#1053](https://github.com/nf-core/sarek/pull/1053))
 - [#1174](https://github.com/nf-core/sarek/pull/1174) - Updating multiqc to v1.15
 - [#1179](https://github.com/nf-core/sarek/pull/1179) - Unhide params `trim_fastq`, `umi_read_structure`, and `aligner`
 
@@ -35,6 +37,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1163](https://github.com/nf-core/sarek/pull/1163) - Correcting location of output folder for joint variant calling with GATK's haplotypecaller
 - [#1169](https://github.com/nf-core/sarek/pull/1169) - Updating Sentieon-modules. (The conda-check in the Sentieon-modules was moved to the script-section. The version of Sentieon remain unchanged.)
 - [#1172](https://github.com/nf-core/sarek/pull/1172) - Publish gvcf files when all intervals are processed at once ([#764](https://github.com/nf-core/sarek/issues/764))
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - Fixed duplicated entries in joint germline recalibrated VCF ([#966](https://github.com/nf-core/sarek/pull/966), [#1102](https://github.com/nf-core/sarek/pull/1102)),
+  fixed grouping joint germline recalibrated VCF ([#1137](https://github.com/nf-core/sarek/pull/1137))
 - [#1177](https://github.com/nf-core/sarek/pull/1177) - Fix status inference when using nf-validation plugin
 
 ### Dependencies

diff --git a/conf/modules/joint_germline.config b/conf/modules/joint_germline.config
@@ -68,15 +68,14 @@ process {
     withName: 'GATK4_APPLYVQSR_SNP' {
         ext.prefix = { "${meta.id}_SNP" }
         ext.args = '--truth-sensitivity-filter-level 99.9 -mode SNP'
+        publishDir = [
+            enabled: false
+        ]
     }
 
     withName: 'GATK4_APPLYVQSR_INDEL' {
-        ext.prefix    = { "${meta.id}_INDEL" }
+        ext.prefix    = { "joint_germline_recalibrated" }
         ext.args      = '--truth-sensitivity-filter-level 99.9 -mode INDEL'
-    }
-
-    withName: 'MERGE_VQSR' {
-        ext.prefix       = "joint_germline_recalibrated"
         publishDir       = [
             mode: params.publish_dir_mode,
             path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/"},

diff --git a/conf/test/tools_germline.config b/conf/test/tools_germline.config
@@ -15,6 +15,9 @@ params {
     fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
     intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
     known_indels           = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    known_indels_vqsr      = "--resource:1000G,known=false,training=true,truth=true,prior=10.0 mills_and_1000G.indels.hg38.vcf.gz"
+    known_snps             = params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz']
+    known_snps_vqsr        = "--resource:hapmap,known=false,training=true,truth=true,prior=10.0 hapmap_3.3.hg38.vcf.gz"
     nucleotides_per_second = 20
     step                   = 'variant_calling'
     tools                  = null

diff --git a/modules.json b/modules.json
@@ -173,7 +173,7 @@
                     },
                     "gatk4/applyvqsr": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
                         "installed_by": ["modules"]
                     },
                     "gatk4/baserecalibrator": {
@@ -283,7 +283,7 @@
                     },
                     "gatk4/variantrecalibrator": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
                         "installed_by": ["modules"]
                     },
                     "manta/germline": {

diff --git a/modules/nf-core/gatk4/applyvqsr/main.nf b/modules/nf-core/gatk4/applyvqsr/main.nf
diff --git a/modules/nf-core/gatk4/variantrecalibrator/main.nf b/modules/nf-core/gatk4/variantrecalibrator/main.nf
diff --git a/subworkflows/local/bam_joint_calling_germline_gatk/main.nf b/subworkflows/local/bam_joint_calling_germline_gatk/main.nf
@@ -36,11 +36,13 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
     // Map input for GenomicsDBImport
     // Rename based on num_intervals, group all samples by their interval_name/interval_file and restructure for channel
     // Group by [0, 3] to avoid a list of metas and make sure that any intervals
-
     gendb_input = input
         .map{ meta, gvcf, tbi, intervals -> [ [ id:'joint_variant_calling', intervals_name:intervals.simpleName, num_intervals:meta.num_intervals ], gvcf, tbi, intervals ] }
-        .groupTuple(by:[0, 3])
-        .map{ meta, gvcf, tbi, intervals -> [ meta, gvcf, tbi, intervals, [], [] ] }
+        .groupTuple(by:3) //join on interval file
+        .map{ meta_list, gvcf, tbi, intervals ->
+            // meta is now a list of [meta1, meta2] but they are all the same. So take the first element.
+            [ meta_list[0], gvcf, tbi, intervals, [], [] ]
+        }
 
     // Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport
     GATK4_GENOMICSDBIMPORT(gendb_input, false, false, false)
@@ -82,42 +84,68 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
-    //Prepare INDELs and SNPs separately for ApplyVQSR
+    //Prepare SNPs and INDELs for ApplyVQSR
+    // Step 1. : ApplyVQSR to SNPs
+    // Step 2. : Use ApplyVQSR_SNP output and run ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html
 
     // Join results of variant recalibration into a single channel tuple
     // Rework meta for variantscalled.csv and annotation tools
     vqsr_input_snp = vqsr_input.join(VARIANTRECALIBRATOR_SNP.out.recal, failOnDuplicate: true)
         .join(VARIANTRECALIBRATOR_SNP.out.idx, failOnDuplicate: true)
         .join(VARIANTRECALIBRATOR_SNP.out.tranches, failOnDuplicate: true)
-        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
-
-    // Join results of variant recalibration into a single channel tuple
-    // Rework meta for variantscalled.csv and annotation tools
-    vqsr_input_indel = vqsr_input.join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
-        .join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
-        .join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
-        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
+        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
 
     GATK4_APPLYVQSR_SNP(
         vqsr_input_snp,
         fasta,
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
+    // Join results of ApplyVQSR_SNP and use as input for Indels to avoid duplicate entries in the result
+    // Rework meta for variantscalled.csv and annotation tools
+    vqsr_input_indel = GATK4_APPLYVQSR_SNP.out.vcf.join(GATK4_APPLYVQSR_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]}
+        .join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
+        .join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
+        .join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
+        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
+
     GATK4_APPLYVQSR_INDEL(
         vqsr_input_indel,
         fasta,
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
-    vqsr_snp_vcf = GATK4_APPLYVQSR_SNP.out.vcf
-    vqsr_indel_vcf = GATK4_APPLYVQSR_INDEL.out.vcf
 
-    //Merge VQSR outputs into final VCF
-    MERGE_VQSR(vqsr_snp_vcf.mix(vqsr_indel_vcf).groupTuple(), dict)
+    // The following is an ugly monster to achieve the following:
+    // When MERGE_GENOTYPEGVCFS and GATK4_APPLYVQSR are run, then use output from APPLYVQSR
+    // When MERGE_GENOTYPEGVCFS and NOT GATK4_APPLYVQSR , then use the output from MERGE_GENOTYPEGVCFS
+
+    merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]}
+    merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]}
+
+    // Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements
+    vqsr_vcf_for_join = GATK4_APPLYVQSR_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]}
+    vqsr_tbi_for_join = GATK4_APPLYVQSR_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]}
+
+    // Join on metamap
+    // If both --> meta, vcf_merged, vcf_bqsr
+    // If not VQSR --> meta, vcf_merged, []
+    // if the second is empty, use the first
+    genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{
+        meta, joint_vcf, recal_vcf ->
+
+        vcf_out = recal_vcf ?: joint_vcf
+
+        [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], vcf_out]
+    }
+
+    genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{
+        meta, joint_tbi, recal_tbi ->
+
+        tbi_out = recal_tbi ?: joint_tbi
 
-    genotype_vcf   = MERGE_GENOTYPEGVCFS.out.vcf.mix(MERGE_VQSR.out.vcf)
-    genotype_index = MERGE_GENOTYPEGVCFS.out.tbi.mix(MERGE_VQSR.out.tbi)
+        [[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], tbi_out]
+    }
 
     versions = versions.mix(GATK4_GENOMICSDBIMPORT.out.versions)
     versions = versions.mix(GATK4_GENOTYPEGVCFS.out.versions)

diff --git a/subworkflows/local/bam_variant_calling_germline_all/main.nf b/subworkflows/local/bam_variant_calling_germline_all/main.nf
@@ -13,6 +13,7 @@ include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling
 include { BAM_VARIANT_CALLING_MPILEUP             } from '../bam_variant_calling_mpileup/main'
 include { BAM_VARIANT_CALLING_SINGLE_STRELKA      } from '../bam_variant_calling_single_strelka/main'
 include { BAM_VARIANT_CALLING_SINGLE_TIDDIT       } from '../bam_variant_calling_single_tiddit/main'
+include { VCF_VARIANT_FILTERING_GATK              } from '../vcf_variant_filtering_gatk/main'
 
 workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
     take:
@@ -38,6 +39,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
     known_sites_snps_tbi
     known_snps_vqsr
     joint_germline                    // boolean: [mandatory] [default: false] joint calling of germline variants
+    skip_haplotypecaller_filter       // boolean: [mandatory] [default: false] whether to filter haplotypecaller single sample vcfs
     sentieon_haplotyper_emit_mode     // channel: [mandatory] value channel with string
 
     main:
@@ -119,22 +121,16 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
             dbsnp,
             dbsnp_tbi,
             dbsnp_vqsr,
-            known_sites_indels,
-            known_sites_indels_tbi,
-            known_indels_vqsr,
-            known_sites_snps,
-            known_sites_snps_tbi,
-            known_snps_vqsr,
-            intervals,
-            intervals_bed_combined_haplotypec,
-            ((skip_tools && skip_tools.split(',').contains('haplotypecaller_filter') || joint_germline)))
+            intervals)
 
         vcf_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf
+        tbi_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.tbi
+
         versions = versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions)
 
         if (joint_germline) {
             BAM_JOINT_CALLING_GERMLINE_GATK(
-                BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.genotype_intervals,
+                BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.gvcf_tbi_intervals,
                 fasta,
                 fasta_fai,
                 dict,
@@ -150,6 +146,24 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
 
             vcf_haplotypecaller = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf
             versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions)
+        } else {
+
+            // If single sample track, check if filtering should be done
+            if (!skip_haplotypecaller_filter) {
+
+                VCF_VARIANT_FILTERING_GATK(
+                    vcf_haplotypecaller.join(tbi_haplotypecaller, failOnDuplicate: true, failOnMismatch: true),
+                    fasta,
+                    fasta_fai,
+                    dict.map{ meta, dict -> [ dict ] },
+                    intervals_bed_combined_haplotypec,
+                    known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
+                    known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())
+
+                vcf_haplotypecaller = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf
+
+                versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)
+            }
         }
     }
 

diff --git a/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf b/subworkflows/local/bam_variant_calling_haplotypecaller/main.nf
@@ -5,7 +5,6 @@
 // A when clause condition is defined in the conf/modules.config to determine if the module should be run
 
 include { BAM_MERGE_INDEX_SAMTOOLS                            } from '../bam_merge_index_samtools/main'
-include { VCF_VARIANT_FILTERING_GATK                          } from '../vcf_variant_filtering_gatk/main'
 include { GATK4_HAPLOTYPECALLER                               } from '../../../modules/nf-core/gatk4/haplotypecaller/main'
 include { GATK4_MERGEVCFS            as MERGE_HAPLOTYPECALLER } from '../../../modules/nf-core/gatk4/mergevcfs/main'
 
@@ -18,51 +17,51 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
     dbsnp                        // channel: [optional]
     dbsnp_tbi                    // channel: [optional]
     dbsnp_vqsr                   // channel: [optional]
-    known_sites_indels           // channel: [optional]
-    known_sites_indels_tbi       // channel: [optional]
-    known_indels_vqsr            // channel: [optional]
-    known_sites_snps             // channel: [optional]
-    known_sites_snps_tbi         // channel: [optional]
-    known_snps_vqsr              // channel: [optional]
     intervals                    // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals
-    intervals_bed_combined       // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals
-    skip_haplotypecaller_filter  // boolean: [mandatory] [default: false] skip haplotypecaller filter
 
     main:
     versions = Channel.empty()
 
-    vcf = Channel.empty()
+    vcf           = Channel.empty()
     realigned_bam = Channel.empty()
 
     // Combine cram and intervals for spread and gather strategy
     cram_intervals = cram.combine(intervals)
         // Move num_intervals to meta map
-        .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals, variantcaller:'haplotypecaller' ], cram, crai, intervals, [] ] }
+        // Add interval_name to allow correct merging with interval files
+        .map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ interval_name:intervals.simpleName, num_intervals:num_intervals, variantcaller:'haplotypecaller' ], cram, crai, intervals, [] ] }
 
     GATK4_HAPLOTYPECALLER(cram_intervals, fasta, fasta_fai, dict.map{ meta, dict -> [ dict ] }, dbsnp, dbsnp_tbi)
 
     // For joint genotyping
-    genotype_intervals = GATK4_HAPLOTYPECALLER.out.vcf
+    gvcf_tbi_intervals = GATK4_HAPLOTYPECALLER.out.vcf
         .join(GATK4_HAPLOTYPECALLER.out.tbi, failOnMismatch: true)
         .join(cram_intervals, failOnMismatch: true)
         .map{ meta, gvcf, tbi, cram, crai, intervals, dragstr_model -> [ meta, gvcf, tbi, intervals ] }
 
     // Figuring out if there is one or more vcf(s) from the same sample
-    haplotypecaller_vcf = GATK4_HAPLOTYPECALLER.out.vcf.branch{
+    haplotypecaller_vcf = GATK4_HAPLOTYPECALLER.out.vcf.map{
+            meta, vcf -> [ meta - meta.subMap('interval_name'), vcf]
+        }
+        .branch{
         // Use meta.num_intervals to asses number of intervals
             intervals:    it[0].num_intervals > 1
             no_intervals: it[0].num_intervals <= 1
         }
 
     // Figuring out if there is one or more tbi(s) from the same sample
-    haplotypecaller_tbi = GATK4_HAPLOTYPECALLER.out.tbi.branch{
+    haplotypecaller_tbi = GATK4_HAPLOTYPECALLER.out.tbi.map{
+            meta, tbi -> [ meta - meta.subMap('interval_name'), tbi]
+        }.branch{
         // Use meta.num_intervals to asses number of intervals
             intervals:    it[0].num_intervals > 1
             no_intervals: it[0].num_intervals <= 1
         }
 
     // Figuring out if there is one or more bam(s) from the same sample
-    haplotypecaller_bam = GATK4_HAPLOTYPECALLER.out.bam.branch{
+    haplotypecaller_bam = GATK4_HAPLOTYPECALLER.out.bam.map{
+            meta, bam -> [ meta - meta.subMap('interval_name'), bam]
+        }.branch{
         // Use meta.num_intervals to asses number of intervals
             intervals:    it[0].num_intervals > 1
             no_intervals: it[0].num_intervals <= 1
@@ -87,33 +86,18 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
 
     realigned_bam = BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai
 
-    if (!skip_haplotypecaller_filter) {
-
-        VCF_VARIANT_FILTERING_GATK(
-            haplotypecaller_vcf.join(haplotypecaller_tbi, failOnDuplicate: true, failOnMismatch: true),
-            fasta,
-            fasta_fai,
-            dict.map{ meta, dict -> [ dict ] },
-            intervals_bed_combined,
-            known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
-            known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())
-
-        vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf
-
-        versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)
-
-    } else vcf = haplotypecaller_vcf
-
     versions = versions.mix(GATK4_HAPLOTYPECALLER.out.versions)
     versions = versions.mix(MERGE_HAPLOTYPECALLER.out.versions)
 
     // Remove no longer necessary field: num_intervals
-    vcf = vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
+    vcf = haplotypecaller_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
+    tbi = haplotypecaller_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] }
 
     emit:
-    genotype_intervals // For joint genotyping
+    gvcf_tbi_intervals // For joint genotyping
     realigned_bam      // Optional
-    vcf                // vcf filtered or not
+    vcf                // vcf
+    tbi                // tbi
 
     versions
 }