nf-core · FriederikeHanssen · Aug 16, 2023 · Aug 5, 2023 · Aug 5, 2023 · Aug 5, 2023
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1153](https://github.com/nf-core/sarek/pull/1153) - Add input validation for Sentieon & FGBio UMI incompatibility
 - [#1158](https://github.com/nf-core/sarek/pull/1158) - Add preprint
 - [#1159](https://github.com/nf-core/sarek/pull/1159) - ISMB Poster
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - CI tests for VQSR track with stub runs
 
 ### Changed
 
@@ -23,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1059](https://github.com/nf-core/sarek/pull/1059) - Add `nf-validation` for samplesheet validation
 - [#1160](https://github.com/nf-core/sarek/pull/1160) - Updating tiddit to v3.6.1
 - [#1166](https://github.com/nf-core/sarek/pull/1166) - More info about `--tools`
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - Refactor single sample filtering of Haplotypecaller generated VCFs ([#1053](https://github.com/nf-core/sarek/pull/1053))
 - [#1174](https://github.com/nf-core/sarek/pull/1174) - Updating multiqc to v1.15
 - [#1179](https://github.com/nf-core/sarek/pull/1179) - Unhide params `trim_fastq`, `umi_read_structure`, and `aligner`
 
@@ -35,6 +37,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#1163](https://github.com/nf-core/sarek/pull/1163) - Correcting location of output folder for joint variant calling with GATK's haplotypecaller
 - [#1169](https://github.com/nf-core/sarek/pull/1169) - Updating Sentieon-modules. (The conda-check in the Sentieon-modules was moved to the script-section. The version of Sentieon remain unchanged.)
 - [#1172](https://github.com/nf-core/sarek/pull/1172) - Publish gvcf files when all intervals are processed at once ([#764](https://github.com/nf-core/sarek/issues/764))
+- [#1173](https://github.com/nf-core/sarek/pull/1173) - Fixed duplicated entries in joint germline recalibrated VCF ([#966](https://github.com/nf-core/sarek/pull/966), [#1102](https://github.com/nf-core/sarek/pull/1102)),
+  fixed grouping joint germline recalibrated VCF ([#1137](https://github.com/nf-core/sarek/pull/1137))
 - [#1177](https://github.com/nf-core/sarek/pull/1177) - Fix status inference when using nf-validation plugin
 
 ### Dependencies

@@ -68,15 +68,14 @@ process {
     withName: 'GATK4_APPLYVQSR_SNP' {
         ext.prefix = { "${meta.id}_SNP" }
         ext.args = '--truth-sensitivity-filter-level 99.9 -mode SNP'
+        publishDir = [
+            enabled: false
+        ]
     }
 
     withName: 'GATK4_APPLYVQSR_INDEL' {
-        ext.prefix    = { "${meta.id}_INDEL" }
+        ext.prefix    = { "joint_germline_recalibrated" }
         ext.args      = '--truth-sensitivity-filter-level 99.9 -mode INDEL'
-    }
-
-    withName: 'MERGE_VQSR' {
-        ext.prefix       = "joint_germline_recalibrated"
         publishDir       = [
             mode: params.publish_dir_mode,
             path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/"},

@@ -15,6 +15,9 @@ params {
     fasta                  = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
     intervals              = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
     known_indels           = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
+    known_indels_vqsr      = "--resource:1000G,known=false,training=true,truth=true,prior=10.0 mills_and_1000G.indels.hg38.vcf.gz"
+    known_snps             = params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz']
+    known_snps_vqsr        = "--resource:hapmap,known=false,training=true,truth=true,prior=10.0 hapmap_3.3.hg38.vcf.gz"
     nucleotides_per_second = 20
     step                   = 'variant_calling'
     tools                  = null

@@ -173,7 +173,7 @@
                     },
                     "gatk4/applyvqsr": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
                         "installed_by": ["modules"]
                     },
                     "gatk4/baserecalibrator": {
@@ -283,7 +283,7 @@
                     },
                     "gatk4/variantrecalibrator": {
                         "branch": "master",
-                        "git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
+                        "git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
                         "installed_by": ["modules"]
                     },
                     "manta/germline": {

@@ -36,11 +36,13 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
     // Map input for GenomicsDBImport
     // Rename based on num_intervals, group all samples by their interval_name/interval_file and restructure for channel
     // Group by [0, 3] to avoid a list of metas and make sure that any intervals
-
     gendb_input = input
         .map{ meta, gvcf, tbi, intervals -> [ [ id:'joint_variant_calling', intervals_name:intervals.simpleName, num_intervals:meta.num_intervals ], gvcf, tbi, intervals ] }
-        .groupTuple(by:[0, 3])
-        .map{ meta, gvcf, tbi, intervals -> [ meta, gvcf, tbi, intervals, [], [] ] }
+        .groupTuple(by:3) //join on interval file
+        .map{ meta_list, gvcf, tbi, intervals ->
+            // meta is now a list of [meta1, meta2] but they are all the same. So take the first element.
+            [ meta_list[0], gvcf, tbi, intervals, [], [] ]
+        }
 
     // Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport
     GATK4_GENOMICSDBIMPORT(gendb_input, false, false, false)
@@ -82,7 +84,9 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
-    //Prepare INDELs and SNPs separately for ApplyVQSR
+    //Prepare SNPs and INDELs for ApplyVQSR
+    // Step 1. : ApplyVQSR to SNPs
+    // Step 2. : Use ApplyVQSR_SNP output and run ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html
 
     // Join results of variant recalibration into a single channel tuple
     // Rework meta for variantscalled.csv and annotation tools
@@ -91,33 +95,67 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
         .join(VARIANTRECALIBRATOR_SNP.out.tranches, failOnDuplicate: true)
         .map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
 
-    // Join results of variant recalibration into a single channel tuple
-    // Rework meta for variantscalled.csv and annotation tools
-    vqsr_input_indel = vqsr_input.join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
-        .join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
-        .join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
-        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
-
     GATK4_APPLYVQSR_SNP(
         vqsr_input_snp,
         fasta,
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
+    // Join results of ApplyVQSR_SNP and use as input for Indels to avoid duplicate entries in the result
+    // Rework meta for variantscalled.csv and annotation tools
+    vqsr_input_indel = GATK4_APPLYVQSR_SNP.out.vcf.join(GATK4_APPLYVQSR_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta - meta.subMap('id') + [ id:'joint_variant_calling' ], vcf, tbi ]}
+        .join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
+        .join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
+        .join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
+        .map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
+
     GATK4_APPLYVQSR_INDEL(
         vqsr_input_indel,
         fasta,
         fai,
         dict.map{ meta, dict -> [ dict ] })
 
-    vqsr_snp_vcf = GATK4_APPLYVQSR_SNP.out.vcf
-    vqsr_indel_vcf = GATK4_APPLYVQSR_INDEL.out.vcf
 
-    //Merge VQSR outputs into final VCF
-    MERGE_VQSR(vqsr_snp_vcf.mix(vqsr_indel_vcf).groupTuple(), dict)
+    // The following is an ugly monster to achieve the following:
+    // When MERGE_GENOTYPEGVCFS and GATK4_APPLYVQSR are run, then use output from APPLYVQSR
+    // When MERGE_GENOTYPEGVCFS and NOT GATK4_APPLYVQSR , then use the output from MERGE_GENOTYPEGVCFS
+
+    // Remap both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements
+    merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'recalibrated_joint_variant_calling'] , vcf]}
+    merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'recalibrated_joint_variant_calling'] , tbi]}
+
+    vqsr_vcf_for_join = GATK4_APPLYVQSR_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'recalibrated_joint_variant_calling'] , vcf]}
+    vqsr_tbi_for_join = GATK4_APPLYVQSR_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'recalibrated_joint_variant_calling'] , tbi]}
+
+    // Join on metamap
+    // If both --> meta, vcf_merged, vcf_bqsr
+    // If not VQSR --> meta, vcf_merged, []
+    // if the second is empty, use the first
+    genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{
+        meta, joint_vcf, recal_vcf ->
+
+        new_id = "joint_variant_calling"
+        vcf_out = joint_vcf
+        if(recal_vcf){
+            new_id = "recalibrated_joint_variant_calling"
+            vcf_out = recal_vcf
+        }
+
+        [[id:new_id, patient:"all_samples", variantcaller:"haplotypecaller"], vcf_out]
+    }
+
+    genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{
+        meta, joint_tbi, recal_tbi ->
+
+        new_id = "joint_variant_calling"
+        tbi_out = joint_tbi
+        if(recal_tbi){
+            new_id = "recalibrated_joint_variant_calling"
+            tbi_out = recal_tbi
+        }
 
-    genotype_vcf   = MERGE_GENOTYPEGVCFS.out.vcf.mix(MERGE_VQSR.out.vcf)
-    genotype_index = MERGE_GENOTYPEGVCFS.out.tbi.mix(MERGE_VQSR.out.tbi)
+        [[id:new_id, patient:"all_samples", variantcaller:"haplotypecaller"], tbi_out]
+    }
 
     versions = versions.mix(GATK4_GENOMICSDBIMPORT.out.versions)
     versions = versions.mix(GATK4_GENOTYPEGVCFS.out.versions)

@@ -13,6 +13,7 @@ include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling
 include { BAM_VARIANT_CALLING_MPILEUP             } from '../bam_variant_calling_mpileup/main'
 include { BAM_VARIANT_CALLING_SINGLE_STRELKA      } from '../bam_variant_calling_single_strelka/main'
 include { BAM_VARIANT_CALLING_SINGLE_TIDDIT       } from '../bam_variant_calling_single_tiddit/main'
+include { VCF_VARIANT_FILTERING_GATK              } from '../vcf_variant_filtering_gatk/main'
 
 workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
     take:
@@ -38,6 +39,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
     known_sites_snps_tbi
     known_snps_vqsr
     joint_germline                    // boolean: [mandatory] [default: false] joint calling of germline variants
+    skip_haplotypecaller_filter       // boolean: [mandatory] [default: false] whether to filter haplotypecaller single sample vcfs
     sentieon_haplotyper_emit_mode     // channel: [mandatory] value channel with string
 
     main:
@@ -119,22 +121,16 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
             dbsnp,
             dbsnp_tbi,
             dbsnp_vqsr,
-            known_sites_indels,
-            known_sites_indels_tbi,
-            known_indels_vqsr,
-            known_sites_snps,
-            known_sites_snps_tbi,
-            known_snps_vqsr,
-            intervals,
-            intervals_bed_combined_haplotypec,
-            ((skip_tools && skip_tools.split(',').contains('haplotypecaller_filter') || joint_germline)))
+            intervals)
 
         vcf_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf
+        tbi_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.tbi
+
         versions = versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions)
 
         if (joint_germline) {
             BAM_JOINT_CALLING_GERMLINE_GATK(
-                BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.genotype_intervals,
+                BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.gvcf_tbi_intervals,
                 fasta,
                 fasta_fai,
                 dict,
@@ -150,6 +146,24 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
 
             vcf_haplotypecaller = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf
             versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions)
+        } else {
+
+            // If single sample track, check if filtering should be done
+            if (!skip_haplotypecaller_filter) {
+
+                VCF_VARIANT_FILTERING_GATK(
+                    vcf_haplotypecaller.join(tbi_haplotypecaller, failOnDuplicate: true, failOnMismatch: true),
+                    fasta,
+                    fasta_fai,
+                    dict.map{ meta, dict -> [ dict ] },
+                    intervals_bed_combined_haplotypec,
+                    known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
+                    known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())
+
+                vcf_haplotypecaller = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf
+
+                versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)
+            }
         }
     }