Skip to content

Commit

Permalink
Merge pull request #1173 from FriederikeHanssen/refactor_hc
Browse files Browse the repository at this point in the history
  • Loading branch information
FriederikeHanssen committed Aug 16, 2023
2 parents 38c57e6 + 89163ce commit f63596a
Show file tree
Hide file tree
Showing 11 changed files with 250 additions and 147 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1153](https://github.com/nf-core/sarek/pull/1153) - Add input validation for Sentieon & FGBio UMI incompatibility
- [#1158](https://github.com/nf-core/sarek/pull/1158) - Add preprint
- [#1159](https://github.com/nf-core/sarek/pull/1159) - ISMB Poster
- [#1173](https://github.com/nf-core/sarek/pull/1173) - CI tests for VQSR track with stub runs

### Changed

Expand All @@ -23,6 +24,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1059](https://github.com/nf-core/sarek/pull/1059) - Add `nf-validation` for samplesheet validation
- [#1160](https://github.com/nf-core/sarek/pull/1160) - Updating tiddit to v3.6.1
- [#1166](https://github.com/nf-core/sarek/pull/1166) - More info about `--tools`
- [#1173](https://github.com/nf-core/sarek/pull/1173) - Refactor single sample filtering of Haplotypecaller generated VCFs ([#1053](https://github.com/nf-core/sarek/pull/1053))
- [#1174](https://github.com/nf-core/sarek/pull/1174) - Updating multiqc to v1.15
- [#1179](https://github.com/nf-core/sarek/pull/1179) - Unhide params `trim_fastq`, `umi_read_structure`, and `aligner`

Expand All @@ -35,6 +37,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#1163](https://github.com/nf-core/sarek/pull/1163) - Correcting location of output folder for joint variant calling with GATK's haplotypecaller
- [#1169](https://github.com/nf-core/sarek/pull/1169) - Updating Sentieon-modules. (The conda-check in the Sentieon-modules was moved to the script-section. The version of Sentieon remain unchanged.)
- [#1172](https://github.com/nf-core/sarek/pull/1172) - Publish gvcf files when all intervals are processed at once ([#764](https://github.com/nf-core/sarek/issues/764))
- [#1173](https://github.com/nf-core/sarek/pull/1173) - Fixed duplicated entries in joint germline recalibrated VCF ([#966](https://github.com/nf-core/sarek/pull/966), [#1102](https://github.com/nf-core/sarek/pull/1102)),
fixed grouping joint germline recalibrated VCF ([#1137](https://github.com/nf-core/sarek/pull/1137))
- [#1177](https://github.com/nf-core/sarek/pull/1177) - Fix status inference when using nf-validation plugin

### Dependencies
Expand Down
9 changes: 4 additions & 5 deletions conf/modules/joint_germline.config
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,14 @@ process {
withName: 'GATK4_APPLYVQSR_SNP' {
ext.prefix = { "${meta.id}_SNP" }
ext.args = '--truth-sensitivity-filter-level 99.9 -mode SNP'
publishDir = [
enabled: false
]
}

withName: 'GATK4_APPLYVQSR_INDEL' {
ext.prefix = { "${meta.id}_INDEL" }
ext.prefix = { "joint_germline_recalibrated" }
ext.args = '--truth-sensitivity-filter-level 99.9 -mode INDEL'
}

withName: 'MERGE_VQSR' {
ext.prefix = "joint_germline_recalibrated"
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/variant_calling/haplotypecaller/joint_variant_calling/"},
Expand Down
3 changes: 3 additions & 0 deletions conf/test/tools_germline.config
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ params {
fasta = params.test_data['homo_sapiens']['genome']['genome_21_fasta']
intervals = params.test_data['homo_sapiens']['genome']['genome_21_multi_interval_bed']
known_indels = params.test_data['homo_sapiens']['genome']['mills_and_1000g_indels_21_vcf_gz']
known_indels_vqsr = "--resource:1000G,known=false,training=true,truth=true,prior=10.0 mills_and_1000G.indels.hg38.vcf.gz"
known_snps = params.test_data['homo_sapiens']['genome']['hapmap_3_3_hg38_21_vcf_gz']
known_snps_vqsr = "--resource:hapmap,known=false,training=true,truth=true,prior=10.0 hapmap_3.3.hg38.vcf.gz"
nucleotides_per_second = 20
step = 'variant_calling'
tools = null
Expand Down
4 changes: 2 additions & 2 deletions modules.json
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@
},
"gatk4/applyvqsr": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
"installed_by": ["modules"]
},
"gatk4/baserecalibrator": {
Expand Down Expand Up @@ -283,7 +283,7 @@
},
"gatk4/variantrecalibrator": {
"branch": "master",
"git_sha": "911696ea0b62df80e900ef244d7867d177971f73",
"git_sha": "359dcb06bda60c43955752e356e25c91cfd38ae0",
"installed_by": ["modules"]
},
"manta/germline": {
Expand Down
12 changes: 12 additions & 0 deletions modules/nf-core/gatk4/applyvqsr/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions modules/nf-core/gatk4/variantrecalibrator/main.nf

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 46 additions & 18 deletions subworkflows/local/bam_joint_calling_germline_gatk/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
// Map input for GenomicsDBImport
// Rename based on num_intervals, group all samples by their interval_name/interval_file and restructure for channel
// Group by [0, 3] to avoid a list of metas and make sure that any intervals

gendb_input = input
.map{ meta, gvcf, tbi, intervals -> [ [ id:'joint_variant_calling', intervals_name:intervals.simpleName, num_intervals:meta.num_intervals ], gvcf, tbi, intervals ] }
.groupTuple(by:[0, 3])
.map{ meta, gvcf, tbi, intervals -> [ meta, gvcf, tbi, intervals, [], [] ] }
.groupTuple(by:3) //join on interval file
.map{ meta_list, gvcf, tbi, intervals ->
// meta is now a list of [meta1, meta2] but they are all the same. So take the first element.
[ meta_list[0], gvcf, tbi, intervals, [], [] ]
}

// Convert all sample vcfs into a genomicsdb workspace using genomicsdbimport
GATK4_GENOMICSDBIMPORT(gendb_input, false, false, false)
Expand Down Expand Up @@ -82,42 +84,68 @@ workflow BAM_JOINT_CALLING_GERMLINE_GATK {
fai,
dict.map{ meta, dict -> [ dict ] })

//Prepare INDELs and SNPs separately for ApplyVQSR
//Prepare SNPs and INDELs for ApplyVQSR
// Step 1. : ApplyVQSR to SNPs
// Step 2. : Use ApplyVQSR_SNP output and run ApplyVQSR_INDEL. This avoids duplicate entries in the vcf as described here: https://hpc.nih.gov/training/gatk_tutorial/vqsr.html

// Join results of variant recalibration into a single channel tuple
// Rework meta for variantscalled.csv and annotation tools
vqsr_input_snp = vqsr_input.join(VARIANTRECALIBRATOR_SNP.out.recal, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_SNP.out.idx, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_SNP.out.tranches, failOnDuplicate: true)
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }

// Join results of variant recalibration into a single channel tuple
// Rework meta for variantscalled.csv and annotation tools
vqsr_input_indel = vqsr_input.join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta - meta.subMap('id') + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }

GATK4_APPLYVQSR_SNP(
vqsr_input_snp,
fasta,
fai,
dict.map{ meta, dict -> [ dict ] })

// Join results of ApplyVQSR_SNP and use as input for Indels to avoid duplicate entries in the result
// Rework meta for variantscalled.csv and annotation tools
vqsr_input_indel = GATK4_APPLYVQSR_SNP.out.vcf.join(GATK4_APPLYVQSR_SNP.out.tbi).map{ meta, vcf, tbi -> [ meta + [ id:'joint_variant_calling' ], vcf, tbi ]}
.join(VARIANTRECALIBRATOR_INDEL.out.recal, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_INDEL.out.idx, failOnDuplicate: true)
.join(VARIANTRECALIBRATOR_INDEL.out.tranches, failOnDuplicate: true)
.map{ meta, vcf, tbi, recal, index, tranche -> [ meta + [ id:'recalibrated_joint_variant_calling' ], vcf, tbi, recal, index, tranche ] }

GATK4_APPLYVQSR_INDEL(
vqsr_input_indel,
fasta,
fai,
dict.map{ meta, dict -> [ dict ] })

vqsr_snp_vcf = GATK4_APPLYVQSR_SNP.out.vcf
vqsr_indel_vcf = GATK4_APPLYVQSR_INDEL.out.vcf

//Merge VQSR outputs into final VCF
MERGE_VQSR(vqsr_snp_vcf.mix(vqsr_indel_vcf).groupTuple(), dict)
// The following is an ugly monster to achieve the following:
// When MERGE_GENOTYPEGVCFS and GATK4_APPLYVQSR are run, then use output from APPLYVQSR
// When MERGE_GENOTYPEGVCFS and NOT GATK4_APPLYVQSR , then use the output from MERGE_GENOTYPEGVCFS

merge_vcf_for_join = MERGE_GENOTYPEGVCFS.out.vcf.map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]}
merge_tbi_for_join = MERGE_GENOTYPEGVCFS.out.tbi.map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]}

// Remap for both to have the same key, if ApplyBQSR is not run, the channel is empty --> populate with empty elements
vqsr_vcf_for_join = GATK4_APPLYVQSR_INDEL.out.vcf.ifEmpty([[:], []]).map{meta, vcf -> [[id: 'joint_variant_calling'] , vcf]}
vqsr_tbi_for_join = GATK4_APPLYVQSR_INDEL.out.tbi.ifEmpty([[:], []]).map{meta, tbi -> [[id: 'joint_variant_calling'] , tbi]}

// Join on metamap
// If both --> meta, vcf_merged, vcf_bqsr
// If not VQSR --> meta, vcf_merged, []
// if the second is empty, use the first
genotype_vcf = merge_vcf_for_join.join(vqsr_vcf_for_join, remainder: true).map{
meta, joint_vcf, recal_vcf ->

vcf_out = recal_vcf ?: joint_vcf

[[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], vcf_out]
}

genotype_index = merge_tbi_for_join.join(vqsr_tbi_for_join, remainder: true).map{
meta, joint_tbi, recal_tbi ->

tbi_out = recal_tbi ?: joint_tbi

genotype_vcf = MERGE_GENOTYPEGVCFS.out.vcf.mix(MERGE_VQSR.out.vcf)
genotype_index = MERGE_GENOTYPEGVCFS.out.tbi.mix(MERGE_VQSR.out.tbi)
[[id:"joint_variant_calling", patient:"all_samples", variantcaller:"haplotypecaller"], tbi_out]
}

versions = versions.mix(GATK4_GENOMICSDBIMPORT.out.versions)
versions = versions.mix(GATK4_GENOTYPEGVCFS.out.versions)
Expand Down
34 changes: 24 additions & 10 deletions subworkflows/local/bam_variant_calling_germline_all/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ include { BAM_VARIANT_CALLING_SENTIEON_HAPLOTYPER } from '../bam_variant_calling
include { BAM_VARIANT_CALLING_MPILEUP } from '../bam_variant_calling_mpileup/main'
include { BAM_VARIANT_CALLING_SINGLE_STRELKA } from '../bam_variant_calling_single_strelka/main'
include { BAM_VARIANT_CALLING_SINGLE_TIDDIT } from '../bam_variant_calling_single_tiddit/main'
include { VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main'

workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
take:
Expand All @@ -38,6 +39,7 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
known_sites_snps_tbi
known_snps_vqsr
joint_germline // boolean: [mandatory] [default: false] joint calling of germline variants
skip_haplotypecaller_filter // boolean: [mandatory] [default: false] whether to filter haplotypecaller single sample vcfs
sentieon_haplotyper_emit_mode // channel: [mandatory] value channel with string

main:
Expand Down Expand Up @@ -119,22 +121,16 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {
dbsnp,
dbsnp_tbi,
dbsnp_vqsr,
known_sites_indels,
known_sites_indels_tbi,
known_indels_vqsr,
known_sites_snps,
known_sites_snps_tbi,
known_snps_vqsr,
intervals,
intervals_bed_combined_haplotypec,
((skip_tools && skip_tools.split(',').contains('haplotypecaller_filter') || joint_germline)))
intervals)

vcf_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.vcf
tbi_haplotypecaller = BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.tbi

versions = versions.mix(BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.versions)

if (joint_germline) {
BAM_JOINT_CALLING_GERMLINE_GATK(
BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.genotype_intervals,
BAM_VARIANT_CALLING_HAPLOTYPECALLER.out.gvcf_tbi_intervals,
fasta,
fasta_fai,
dict,
Expand All @@ -150,6 +146,24 @@ workflow BAM_VARIANT_CALLING_GERMLINE_ALL {

vcf_haplotypecaller = BAM_JOINT_CALLING_GERMLINE_GATK.out.genotype_vcf
versions = versions.mix(BAM_JOINT_CALLING_GERMLINE_GATK.out.versions)
} else {

// If single sample track, check if filtering should be done
if (!skip_haplotypecaller_filter) {

VCF_VARIANT_FILTERING_GATK(
vcf_haplotypecaller.join(tbi_haplotypecaller, failOnDuplicate: true, failOnMismatch: true),
fasta,
fasta_fai,
dict.map{ meta, dict -> [ dict ] },
intervals_bed_combined_haplotypec,
known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())

vcf_haplotypecaller = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf

versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)
}
}
}

Expand Down
54 changes: 19 additions & 35 deletions subworkflows/local/bam_variant_calling_haplotypecaller/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
// A when clause condition is defined in the conf/modules.config to determine if the module should be run

include { BAM_MERGE_INDEX_SAMTOOLS } from '../bam_merge_index_samtools/main'
include { VCF_VARIANT_FILTERING_GATK } from '../vcf_variant_filtering_gatk/main'
include { GATK4_HAPLOTYPECALLER } from '../../../modules/nf-core/gatk4/haplotypecaller/main'
include { GATK4_MERGEVCFS as MERGE_HAPLOTYPECALLER } from '../../../modules/nf-core/gatk4/mergevcfs/main'

Expand All @@ -18,51 +17,51 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {
dbsnp // channel: [optional]
dbsnp_tbi // channel: [optional]
dbsnp_vqsr // channel: [optional]
known_sites_indels // channel: [optional]
known_sites_indels_tbi // channel: [optional]
known_indels_vqsr // channel: [optional]
known_sites_snps // channel: [optional]
known_sites_snps_tbi // channel: [optional]
known_snps_vqsr // channel: [optional]
intervals // channel: [mandatory] [ intervals, num_intervals ] or [ [], 0 ] if no intervals
intervals_bed_combined // channel: [mandatory] intervals/target regions in one file unzipped, no_intervals.bed if no_intervals
skip_haplotypecaller_filter // boolean: [mandatory] [default: false] skip haplotypecaller filter

main:
versions = Channel.empty()

vcf = Channel.empty()
vcf = Channel.empty()
realigned_bam = Channel.empty()

// Combine cram and intervals for spread and gather strategy
cram_intervals = cram.combine(intervals)
// Move num_intervals to meta map
.map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ num_intervals:num_intervals, variantcaller:'haplotypecaller' ], cram, crai, intervals, [] ] }
// Add interval_name to allow correct merging with interval files
.map{ meta, cram, crai, intervals, num_intervals -> [ meta + [ interval_name:intervals.simpleName, num_intervals:num_intervals, variantcaller:'haplotypecaller' ], cram, crai, intervals, [] ] }

GATK4_HAPLOTYPECALLER(cram_intervals, fasta, fasta_fai, dict.map{ meta, dict -> [ dict ] }, dbsnp, dbsnp_tbi)

// For joint genotyping
genotype_intervals = GATK4_HAPLOTYPECALLER.out.vcf
gvcf_tbi_intervals = GATK4_HAPLOTYPECALLER.out.vcf
.join(GATK4_HAPLOTYPECALLER.out.tbi, failOnMismatch: true)
.join(cram_intervals, failOnMismatch: true)
.map{ meta, gvcf, tbi, cram, crai, intervals, dragstr_model -> [ meta, gvcf, tbi, intervals ] }

// Figuring out if there is one or more vcf(s) from the same sample
haplotypecaller_vcf = GATK4_HAPLOTYPECALLER.out.vcf.branch{
haplotypecaller_vcf = GATK4_HAPLOTYPECALLER.out.vcf.map{
meta, vcf -> [ meta - meta.subMap('interval_name'), vcf]
}
.branch{
// Use meta.num_intervals to asses number of intervals
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
}

// Figuring out if there is one or more tbi(s) from the same sample
haplotypecaller_tbi = GATK4_HAPLOTYPECALLER.out.tbi.branch{
haplotypecaller_tbi = GATK4_HAPLOTYPECALLER.out.tbi.map{
meta, tbi -> [ meta - meta.subMap('interval_name'), tbi]
}.branch{
// Use meta.num_intervals to asses number of intervals
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
}

// Figuring out if there is one or more bam(s) from the same sample
haplotypecaller_bam = GATK4_HAPLOTYPECALLER.out.bam.branch{
haplotypecaller_bam = GATK4_HAPLOTYPECALLER.out.bam.map{
meta, bam -> [ meta - meta.subMap('interval_name'), bam]
}.branch{
// Use meta.num_intervals to asses number of intervals
intervals: it[0].num_intervals > 1
no_intervals: it[0].num_intervals <= 1
Expand All @@ -87,33 +86,18 @@ workflow BAM_VARIANT_CALLING_HAPLOTYPECALLER {

realigned_bam = BAM_MERGE_INDEX_SAMTOOLS.out.bam_bai

if (!skip_haplotypecaller_filter) {

VCF_VARIANT_FILTERING_GATK(
haplotypecaller_vcf.join(haplotypecaller_tbi, failOnDuplicate: true, failOnMismatch: true),
fasta,
fasta_fai,
dict.map{ meta, dict -> [ dict ] },
intervals_bed_combined,
known_sites_indels.concat(known_sites_snps).flatten().unique().collect(),
known_sites_indels_tbi.concat(known_sites_snps_tbi).flatten().unique().collect())

vcf = VCF_VARIANT_FILTERING_GATK.out.filtered_vcf

versions = versions.mix(VCF_VARIANT_FILTERING_GATK.out.versions)

} else vcf = haplotypecaller_vcf

versions = versions.mix(GATK4_HAPLOTYPECALLER.out.versions)
versions = versions.mix(MERGE_HAPLOTYPECALLER.out.versions)

// Remove no longer necessary field: num_intervals
vcf = vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
vcf = haplotypecaller_vcf.map{ meta, vcf -> [ meta - meta.subMap('num_intervals'), vcf ] }
tbi = haplotypecaller_tbi.map{ meta, tbi -> [ meta - meta.subMap('num_intervals'), tbi ] }

emit:
genotype_intervals // For joint genotyping
gvcf_tbi_intervals // For joint genotyping
realigned_bam // Optional
vcf // vcf filtered or not
vcf // vcf
tbi // tbi

versions
}

0 comments on commit f63596a

Please sign in to comment.