Skip to content

Commit

Permalink
force co-localization of indexes with their targets
Browse files Browse the repository at this point in the history
WDL engines may place files wherever they want
as the rules at https://github.com/openwdl/wdl/blob/main/versions/1.0/SPEC.md#task-input-localization
only apply if the input files come from the same storage directory
(which may not be the case due to the use of object stores like S3 or
inputs from disparate steps)

This also helps with conversion from WDL to CWL as CWL requires explicit
co-localization (in the form of 'secondaryFiles' or via
'InitialWorkDirRequirement', for which there are no WDL analogues) if the CWL
`command` equivalent doesn't include its own co-localizations.
  • Loading branch information
mr-c committed Oct 13, 2021
1 parent 68c56c9 commit 0dd4704
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 22 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Expand Up @@ -7,6 +7,11 @@ Newest changes should be on top.
This document is user facing. Please word the changes in such a way
that users understand how the changes affect the new version.
-->

In Development
--------------
+ gatk.wdl: force co-localization (placement) of indexes next to their targets, as there is no guarantee that WDL engines will do so.

version 5.0.1
---------------------------
+ Smoove: enable genotyping
Expand Down
120 changes: 98 additions & 22 deletions gatk.wdl
Expand Up @@ -42,9 +42,13 @@ task AnnotateIntervals {
command {
set -e
mkdir -p "$(dirname ~{annotatedIntervalsPath})"
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
AnnotateIntervals \
-R ~{referenceFasta} \
-R reference_dir/~{basename (referenceFasta)} \
-L ~{intervals} \
~{"--mappability-track " + mappabilityTrack} \
~{"--segmental-duplication-track " + segmentalDuplicationTrack} \
Expand Down Expand Up @@ -107,12 +111,19 @@ task ApplyBQSR {
command {
set -e
mkdir -p "$(dirname ~{outputBamPath})"
mkdir bam_dir
ln -s ~{inputBam} bam_dir/~{basename(inputBam)}
ln -s ~{inputBamIndex} bam_dir/~{basename(inputBamIndex)}
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \
ApplyBQSR \
--create-output-bam-md5 \
--add-output-sam-program-record \
-R ~{referenceFasta} \
-I ~{inputBam} \
-R reference_dir/~{basename(referenceFasta)} \
-I bam_dir/~{basename(inputBam)} \
--use-original-qualities \
-O ~{outputBamPath} \
-bqsr ~{recalibrationReport} \
Expand Down Expand Up @@ -181,10 +192,17 @@ task BaseRecalibrator {
command {
set -e
mkdir -p "$(dirname ~{recalibrationReportPath})"
mkdir bam_dir
ln -s ~{inputBam} bam_dir/~{basename(inputBam)}
ln -s ~{inputBamIndex} bam_dir/~{basename(inputBamIndex)}
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \
BaseRecalibrator \
-R ~{referenceFasta} \
-I ~{inputBam} \
-R reference_dir/~{basename(referenceFasta)} \
-I bam_dir/~{basename(inputBam)} \
--use-original-qualities \
-O ~{recalibrationReportPath} \
~{true="--known-sites" false="" length(knownIndelsSitesVCFs) > 0} ~{sep=" --known-sites " knownIndelsSitesVCFs} \
Expand Down Expand Up @@ -340,11 +358,18 @@ task CollectAllelicCounts {
command {
set -e
mkdir -p "$(dirname ~{allelicCountsPath})"
# Symlink the input BAM and its index into one directory (bam_dir) so the
# index is located next to the BAM it belongs to.
mkdir bam_dir
ln -s ~{inputBam} bam_dir/~{basename(inputBam)}
ln -s ~{inputBamIndex} bam_dir/~{basename(inputBamIndex)}
# Symlink the reference FASTA together with its dict and fai companion files
# into reference_dir for the same reason.
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
CollectAllelicCounts \
-L ~{commonVariantSites} \
-I ~{inputBam} \
-R ~{referenceFasta} \
-I bam_dir/~{basename(inputBam)} \
-R reference_dir/~{basename(referenceFasta)} \
-O ~{allelicCountsPath}
}

Expand Down Expand Up @@ -398,11 +423,18 @@ task CollectReadCounts {
command {
set -e
mkdir -p "$(dirname ~{countsPath})"
mkdir bam_dir
ln -s ~{inputBam} bam_dir/~{basename(inputBam)}
ln -s ~{inputBamIndex} bam_dir/~{basename(inputBamIndex)}
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
CollectReadCounts \
-L ~{intervals} \
-I ~{inputBam} \
-R ~{referenceFasta} \
-I bam_dir/~{basename(inputBam)} \
-R reference_dir/~{basename(referenceFasta)} \
--format HDF5 \
--interval-merging-rule ~{intervalMergingRule} \
-O ~{countsPath}
Expand Down Expand Up @@ -457,11 +489,18 @@ task CombineGVCFs {
command {
set -e
mkdir -p "$(dirname ~{outputPath})"
# Symlink every gVCF and every gVCF index into one directory (wd) so each
# index is located next to its target. The file lists are expanded with the
# sep placeholder option, matching the other placeholders in this command.
mkdir wd
for FILE in ~{sep=" " gvcfFiles}; do ln -s $FILE wd/$(basename $FILE) ; done
for FILE in ~{sep=" " gvcfFilesIndex}; do ln -s $FILE wd/$(basename $FILE) ; done
# Symlink the reference FASTA together with its dict and fai companion files
# into reference_dir for the same reason.
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
# The staged -V arguments come from a command substitution. printf is used
# instead of echo because echo does not treat a leading -- as an option
# terminator and would emit it as an extra argument.
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
CombineGVCFs \
-R ~{referenceFasta} \
-R reference_dir/~{basename(referenceFasta)} \
-O ~{outputPath} \
-V ~{sep=' -V ' gvcfFiles} \
$(for FILE in ~{sep=" " gvcfFiles}; do printf -- '-V wd/%s ' "$(basename "$FILE")"; done) \
~{true='-L' false='' length(intervals) > 0} ~{sep=' -L ' intervals}
}

Expand Down Expand Up @@ -516,22 +555,29 @@ task CombineVariants {

command <<<
set -e
# Symlink the VCFs and their indexes into one directory (wd), and the
# reference FASTA with its dict and fai companions into reference_dir, so
# every index is located next to its target.
# Only the tilde-brace placeholder form is interpolated in this command
# section; dollar-brace is left to the shell (see the array expansions below),
# so the file lists must be expanded with the tilde form.
mkdir wd
for FILE in ~{sep=" " variantVcfs}; do ln -s $FILE wd/$(basename $FILE) ; done
for FILE in ~{sep=" " variantIndexes}; do ln -s $FILE wd/$(basename $FILE) ; done
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
mkdir -p "$(dirname ~{outputPath})"
# Build "-V:<ID> <file.vcf>" arguments according to IDs
# and VCFs to merge.
# Make sure commands are run in bash.
V_args=$(bash -c '
set -eu
ids=(~{sep=" " identifiers})
vars=(~{sep=" " variantVcfs})
vars=($(for file in ~{sep=" " variantVcfs}; do echo wd/$(basename $file) ; done))
for (( i = 0; i < ${#ids[@]}; ++i ))
do
printf -- "-V:%s %s " "${ids[i]}" "${vars[i]}"
done
')
java -Xmx~{javaXmx} -XX:ParallelGCThreads=1 -jar /usr/GenomeAnalysisTK.jar \
-T CombineVariants \
-R ~{referenceFasta} \
-R reference_dir/~{basename(referenceFasta)} \
--genotypemergeoption ~{genotypeMergeOption} \
--filteredrecordsmergetype ~{filteredRecordsMergeType} \
--out ~{outputPath} \
Expand Down Expand Up @@ -698,10 +744,17 @@ task FilterMutectCalls {
command {
set -e
mkdir -p "$(dirname ~{outputVcf})"
# Symlink the unfiltered VCF and its index into one directory so the index is
# located next to the VCF it belongs to.
mkdir unfiltered_vcf_dir
ln -s ~{unfilteredVcf} unfiltered_vcf_dir/~{basename(unfilteredVcf)}
ln -s ~{unfilteredVcfIndex} unfiltered_vcf_dir/~{basename(unfilteredVcfIndex)}
# Symlink the reference FASTA together with its dict and fai companion files
# into reference_dir for the same reason.
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
FilterMutectCalls \
-R ~{referenceFasta} \
-V ~{unfilteredVcf} \
-R reference_dir/~{basename(referenceFasta)} \
-V unfiltered_vcf_dir/~{basename(unfilteredVcf)} \
-O ~{outputVcf} \
~{"--contamination-table " + contaminationTable} \
~{"--tumor-segmentation " + mafTumorSegments} \
Expand Down Expand Up @@ -874,14 +927,21 @@ task GenotypeGVCFs {
command {
set -e
mkdir -p "$(dirname ~{outputPath})"
mkdir wd
ln -s ~{gvcfFile} wd/~{basename(gvcfFile)}
ln -s ~{gvcfFileIndex} wd/~{basename(gvcfFileIndex)}
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
GenotypeGVCFs \
-R ~{referenceFasta} \
-R reference_dir/~{basename(referenceFasta)} \
-O ~{outputPath} \
~{"-D " + dbsnpVCF} \
~{"--pedigree " + pedigree} \
~{true="-G" false="" length(annotationGroups) > 0} ~{sep=" -G " annotationGroups} \
-V ~{gvcfFile} \
-V wd/~{basename(gvcfFile)} \
~{true="--only-output-calls-starting-in-intervals" false="" defined(intervals)} \
~{true="-L" false="" defined(intervals)} ~{sep=' -L ' intervals}
}
Expand Down Expand Up @@ -939,11 +999,20 @@ task GetPileupSummaries {

command {
set -e
# Symlink each input next to its own index (BAM, variants VCF, sites VCF),
# using one directory per pair so the index is located beside its target.
mkdir bam_dir
ln -s ~{sampleBam} bam_dir/~{basename(sampleBam)}
ln -s ~{sampleBamIndex} bam_dir/~{basename(sampleBamIndex)}
mkdir variants_dir
ln -s ~{variantsForContamination} variants_dir/~{basename(variantsForContamination)}
# The link named after the index must point at the index file itself.
ln -s ~{variantsForContaminationIndex} variants_dir/~{basename(variantsForContaminationIndex)}
mkdir sites_dir
ln -s ~{sitesForContamination} sites_dir/~{basename(sitesForContamination)}
ln -s ~{sitesForContaminationIndex} sites_dir/~{basename(sitesForContaminationIndex)}
gatk --java-options '-Xmx~{javaXmx} -XX:ParallelGCThreads=1' \
GetPileupSummaries \
-I ~{sampleBam} \
-V ~{variantsForContamination} \
-L ~{sitesForContamination} \
-I bam_dir/~{basename(sampleBam)} \
-V variants_dir/~{basename(variantsForContamination)} \
-L sites_dir/~{basename(sitesForContamination)} \
-O ~{outputPrefix + "-pileups.table"}
}

Expand Down Expand Up @@ -1009,11 +1078,18 @@ task HaplotypeCaller {
command {
set -e
mkdir -p "$(dirname ~{outputPath})"
# Symlink every input BAM and every BAM index into one directory (wd) so each
# index is located next to its target. Link names are the basenames of the
# original paths; the file lists are expanded with the sep placeholder option,
# matching the other placeholders in this command.
mkdir wd
for FILE in ~{sep=" " inputBams}; do ln -s $FILE wd/$(basename $FILE) ; done
for FILE in ~{sep=" " inputBamsIndex}; do ln -s $FILE wd/$(basename $FILE) ; done
# Symlink the reference FASTA together with its dict and fai companion files
# into reference_dir for the same reason.
mkdir reference_dir
ln -s ~{referenceFasta} reference_dir/~{basename(referenceFasta)}
ln -s ~{referenceFastaDict} reference_dir/~{basename(referenceFastaDict)}
ln -s ~{referenceFastaFai} reference_dir/~{basename(referenceFastaFai)}
# The staged -I arguments come from a command substitution. printf is used
# instead of echo because echo does not treat a leading -- as an option
# terminator and would emit it as an extra argument.
gatk --java-options '-Xmx~{javaXmxMb}M -XX:ParallelGCThreads=1' \
HaplotypeCaller \
-R ~{referenceFasta} \
-R reference_dir/~{basename(referenceFasta)} \
-O ~{outputPath} \
-I ~{sep=" -I " inputBams} \
$(for FILE in ~{sep=" " inputBams}; do printf -- '-I wd/%s ' "$(basename "$FILE")"; done) \
~{"--sample-ploidy " + ploidy} \
~{true="-L" false="" defined(intervalList)} ~{sep=' -L ' intervalList} \
~{true="-XL" false="" defined(excludeIntervalList)} ~{sep=' -XL ' excludeIntervalList} \
Expand Down

0 comments on commit 0dd4704

Please sign in to comment.