From 289bff95d3fda05a73ed1c869790953d114bd2f1 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 05:41:17 +0100 Subject: [PATCH 01/12] Major: generalisation of process resources --- conf/base.config | 98 ++++++++++++++++----------------- docs/usage.md | 4 +- environment.yml | 2 +- main.nf | 140 +++++++++++++++++++++++++++++------------------ 4 files changed, 139 insertions(+), 105 deletions(-) diff --git a/conf/base.config b/conf/base.config index 04606b41f..52fb829a8 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,79 +11,77 @@ process { cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 8.GB * task.attempt, 'memory' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } time = { check_max( 2.h * task.attempt, 'time' ) } errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } - maxRetries = 1 + maxRetries = 3 maxErrors = '-1' - // Process-specific resource requirements (others leave at default, e.g. Fastqc) - withName:get_software_versions { - memory = { check_max( 2.GB, 'memory' ) } - cache = false + // Generic resource requirements - s(ingle)c(ore)/m(ulti)c(ore) + + withLabel:'sc_tiny'{ + cpus = { check_max( 1, 'cpus' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:convertBam { - cpus = { check_max(8 * task.attempt, 'cpus') } + + withLabel:'sc_small'{ + cpus = { check_max( 1, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:makeSeqDict { - memory = { check_max( 16.GB * task.attempt, 'memory' ) } + + withLabel:'sc_medium'{ + cpus = { check_max( 1, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withname:makeBWAIndex { - time = params.large_ref ? 
'12.h' : { check_max(8.h * task.attempt, 'time') } + + withLabel:'mc_small'{ + cpus = { check_max( 2, 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:bwa { - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - cpus = { check_max(8 * task.attempt, 'cpus') } - time = { check_max(8.h * task.attempt, 'time') } + + withLabel:'mc_medium' { + cpus = { check_max( 4, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:bwamem{ - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - cpus = { check_max(8 * task.attempt, 'cpus') } - time = { check_max(8.h * task.attempt, 'time') } + + withLabel:'mc_large'{ + cpus = { check_max( 8, 'cpus' ) } + memory = { check_max( 16.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:qualimap{ - cpus = { check_max(8 * task.attempt, 'cpus') } - errorStrategy = 'ignore' + + withLabel:'mc_huge'{ + cpus = { check_max( 32, 'cpus' ) } + memory = { check_max( 256.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } } - withName:bam_trim{ - cpus = { check_max(4 * task.attempt, 'cpus') } + + // Process-specific resource requirements (others leave at default, e.g. Fastqc) + withName:get_software_versions { + memory = { check_max( 2.GB, 'memory' ) } + cache = false } - withName:markDup{ + + withName:qualimap{ cpus = { check_max(8 * task.attempt, 'cpus') } + errorStrategy = 'ignore' } + withName:preseq { errorStrategy = 'ignore' } - withName: fastqc { - errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' } - } + withName: multiqc { errorStrategy = { task.exitStatus in [143,137] ? 'retry' : 'ignore' } } - withName: damageprofiler { - time = params.large_ref ? 
{ check_max(8.h * task.attempt, 'time') } : { check_max(2.h * task.attempt, 'time')} - } - withName: strip_input_fastq { - cpus = { check_max(8 * task.attempt, 'cpus') } - memory = { check_max( 8.GB * task.attempt, 'memory' ) } - } - withName: malt { - memory = { check_max( 128.GB * task.attempt, 'memory' ) } - cpus = { check_max(16 * task.attempt, 'cpus') } - time = { check_max(2.h * task.attempt, 'time') } - } - withName: maltextract { - memory = { check_max( 8.GB * task.attempt, 'memory' ) } - cpus = { check_max(4 * task.attempt, 'cpus') } - } - withName: vcf2genome { - memory = { check_max( 4.GB * task.attempt, 'memory' ) } - cpus = 1 - } } - params { // Defaults only, expecting to be overwritten max_memory = 128.GB diff --git a/docs/usage.md b/docs/usage.md index feec1a536..a624da2cb 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -114,7 +114,7 @@ Use this parameter to choose a configuration profile. Profiles can give configur For more details on how to set up your own private profile, please see [installation](../configuration/adding_your_own.md). **Basic profiles** -These are basic profiles which primarily define where you derive the pipeline's software packages from. These are typically the profiles you would use if you are running the pipeline on your **own PC*- (vs. a HPC cluster - see below). +These are basic profiles which primarily define where you derive the pipeline's software packages from. These are typically the profiles you would use if you are running the pipeline on your **own PC** (vs. a HPC cluster - see below). - `awsbatch` - A generic configuration profile to be used with AWS Batch. @@ -274,7 +274,7 @@ The output directory where the results will be saved. #### `-w / -work-dir` -The output directory where _intermediate_ files will be saved. It is **highly recommended*- that this is the same path as `--outdir`, otherwise you may 'lose' your intermediate files if you need to re-run a pipeline. 
By default, if this flag is not given, the intermediate files will be saved in a `work/` and `.nextflow/` directory from wherever you have run EAGER from. +The output directory where _intermediate_ files will be saved. It is **highly recommended** that this is the same path as `--outdir`, otherwise you may 'lose' your intermediate files if you need to re-run a pipeline. By default, if this flag is not given, the intermediate files will be saved in a `work/` and `.nextflow/` directory from wherever you have run EAGER from. ### Optional Reference Options diff --git a/environment.yml b/environment.yml index a1eb6c206..4b18f99d6 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - bioconda::picard=2.21.4 - bioconda::samtools=1.9 - bioconda::dedup=0.12.5 - - bioconda::angsd=0.931 + - bioconda::angsd=0.921 - bioconda::circularmapper=1.93.4 - bioconda::gatk4=4.1.4.1 - bioconda::qualimap=2.2.2d diff --git a/main.nf b/main.nf index a1b8324b7..96c241549 100644 --- a/main.nf +++ b/main.nf @@ -648,6 +648,7 @@ ${summary.collect { k,v -> "
$k
${v ?: ' if (params.saveReference) filename @@ -687,6 +688,7 @@ if (params.fasta_index != '') { } process makeFastaIndex { + label 'sc_small' tag {fasta} publishDir path: "${params.outdir}/reference_genome/fasta_index", mode: 'copy', saveAs: { filename -> if (params.saveReference) filename @@ -732,6 +734,7 @@ if (params.seq_dict != '') { process makeSeqDict { + label 'sc_medium' tag {fasta} publishDir path: "${params.outdir}/reference_genome/seq_dict", mode: 'copy', saveAs: { filename -> if (params.saveReference) filename @@ -763,6 +766,7 @@ ch_dict_for_skipdict.mix(ch_seq_dict) */ process convertBam { + label 'mc_small' tag "$bam" when: @@ -782,10 +786,13 @@ process convertBam { } /* -* PREPROCESSING - Index a input BAM if not being converted to FASTAPQ +* PREPROCESSING - Index a input BAM if not being converted to FASTQ */ process indexinputbam { + label 'sc_small' + tag "$prefix" + when: params.bam && !params.run_convertbam @@ -796,12 +803,11 @@ process indexinputbam { file "*.{bai,csi}" into ch_mappingindex_for_skipmapping,ch_filteringindex_for_skiprmdup script: - size = "${params.large_ref}" ? '-c' : '' - prefix = "${bam.baseName}" + size = "${params.large_ref}" ? '-c' : '' + prefix = "${bam.baseName}" """ - samtools index "${size}" ${bam} + samtools index "${size}" ${bam} """ - } // convertbam bypass @@ -814,12 +820,11 @@ if (params.run_convertbam) { .into { ch_convertbam_for_fastp; ch_convertbam_for_skipfastp; ch_convertbam_for_fastqc; ch_convertbam_for_stripfastq } } - - /* * STEP 1a - FastQC */ process fastqc { + label 'sc_tiny' tag "$name" publishDir "${params.outdir}/FastQC/input_fastq", mode: 'copy', saveAs: {filename -> filename.indexOf(".zip") > 0 ? 
"zips/$filename" : "$filename"} @@ -848,6 +853,7 @@ process fastqc { */ process fastp { + label 'mc_small' tag "$name" publishDir "${params.outdir}/FastP", mode: 'copy' @@ -890,6 +896,7 @@ if (params.complexity_filter_poly_g) { */ process adapter_removal { + label 'mc_small' tag "$name" publishDir "${params.outdir}/read_merging", mode: 'copy' @@ -979,6 +986,7 @@ if (!params.skip_adapterremoval) { * STEP 2b - FastQC after clipping/merging (if applied!) */ process fastqc_after_clipping { + label 'sc_tiny' tag "${name}" publishDir "${params.outdir}/FastQC/after_clipping", mode: 'copy', saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} @@ -1003,6 +1011,7 @@ Step 3a - Mapping with BWA, SAM to BAM, Sort BAM */ process bwa { + label 'mc_medium' tag "${name}" publishDir "${params.outdir}/mapping/bwa", mode: 'copy' @@ -1043,6 +1052,7 @@ process bwa { } process circulargenerator{ + label 'sc_tiny' tag "$prefix" publishDir "${params.outdir}/reference_genome/circularmapper_index", mode: 'copy', saveAs: { filename -> if (params.saveReference) filename @@ -1069,6 +1079,7 @@ process circulargenerator{ process circularmapper{ + label 'mc_medium' tag "$prefix" publishDir "${params.outdir}/mapping/circularmapper", mode: 'copy' @@ -1114,6 +1125,7 @@ process circularmapper{ } process bwamem { + label 'mc_medium' tag "$prefix" publishDir "${params.outdir}/mapping/bwamem", mode: 'copy' @@ -1171,6 +1183,7 @@ if (!params.skip_mapping) { */ process samtools_flagstat { + label 'sc_tiny' tag "$prefix" publishDir "${params.outdir}/samtools/stats", mode: 'copy' @@ -1196,6 +1209,7 @@ process samtools_flagstat { */ process samtools_filter { + label 'mc_medium' tag "$prefix" publishDir "${params.outdir}/samtools/filter", mode: 'copy', saveAs: {filename -> @@ -1276,6 +1290,7 @@ if (params.run_bam_filtering) { process strip_input_fastq { + label 'mc_medium' tag "${bam.baseName}" publishDir "${params.outdir}/samtools/stripped_fastq", mode: 'copy' @@ -1314,6 +1329,7 
@@ process strip_input_fastq { process samtools_flagstat_after_filter { + label 'sc_tiny' tag "$prefix" publishDir "${params.outdir}/samtools/stats", mode: 'copy' @@ -1339,6 +1355,7 @@ Step 5a: DeDup */ process dedup{ + label 'mc_small' tag "${bam.baseName}" publishDir "${params.outdir}/deduplication/", mode: 'copy', saveAs: {filename -> "${prefix}/$filename"} @@ -1382,6 +1399,7 @@ process dedup{ */ process markDup{ + label 'mc_small' tag "${bam.baseName}" publishDir "${params.outdir}/deduplication/" @@ -1430,6 +1448,7 @@ Step 6: Preseq */ process preseq { + label 'sc_tiny' tag "${input.baseName}" publishDir "${params.outdir}/preseq", mode: 'copy' @@ -1460,6 +1479,7 @@ Step 7a: DMG Assessment */ process damageprofiler { + label 'sc_tiny' tag "${bam.baseName}" publishDir "${params.outdir}/damageprofiler", mode: 'copy' @@ -1490,6 +1510,7 @@ Step 8: Qualimap */ process qualimap { + label 'mc_small' tag "${bam.baseName}" publishDir "${params.outdir}/qualimap", mode: 'copy' @@ -1525,6 +1546,7 @@ if (!params.run_bedtools_coverage){ } process bedtools { + label 'mc_small' tag "${bam.baseName}" publishDir "${params.outdir}/bedtools", mode: 'copy' @@ -1540,8 +1562,8 @@ process bedtools { script: """ - bedtools coverage -a ${anno_file} -b $bam | pigz -p 4 > "${bam.baseName}".breadth.gz - bedtools coverage -a ${anno_file} -b $bam -mean | pigz -p 4 > "${bam.baseName}".depth.gz + bedtools coverage -a ${anno_file} -b $bam | pigz -p ${task.cpus} > "${bam.baseName}".breadth.gz + bedtools coverage -a ${anno_file} -b $bam -mean | pigz -p ${task.cpus} > "${bam.baseName}".depth.gz """ } @@ -1550,6 +1572,7 @@ process bedtools { */ process pmdtools { + label 'mc_small' tag "${bam.baseName}" publishDir "${params.outdir}/pmdtools", mode: 'copy' @@ -1590,6 +1613,7 @@ process pmdtools { */ process bam_trim { + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/trimmed_bam", mode: 'copy' @@ -1663,6 +1687,7 @@ if ( params.run_genotyping && params.genotyping_source == 'raw' ) { 
ch_gatk_download = Channel.value("download") process download_gatk_v3_5 { + label 'sc_tiny' when: params.run_genotyping && params.genotyping_tool == 'ug' input: @@ -1683,6 +1708,7 @@ ch_gatk_download = Channel.value("download") */ process genotyping_ug { + 'mc_small' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1722,6 +1748,7 @@ ch_gatk_download = Channel.value("download") } process genotyping_hc { + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1789,6 +1816,7 @@ ch_gatk_download = Channel.value("download") process vcf2genome { + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/consensus_sequence", mode: 'copy' @@ -1828,6 +1856,7 @@ if (params.additional_vcf_files == '') { } process multivcfanalyzer { + label 'mc_small' publishDir "${params.outdir}/MultiVCFAnalyzer", mode: 'copy' when: @@ -1897,7 +1926,8 @@ if (params.sexdeterrmine_bedfile == '') { process sex_deterrmine { - publishDir "${params.outdir}/sex_determination", mode:"copy" + label 'sc_small' + publishDir "${params.outdir}/sex_determination", mode:"copy" when: params.run_sexdeterrmine @@ -1934,47 +1964,48 @@ if (params.sexdeterrmine_bedfile == '') { * Step 16 Nuclear contamination for Human DNA based on chromosome X heterozygosity. */ process nuclear_contamination{ - publishDir "${params.outdir}/nuclear_contamination", mode:"copy" - validExitStatus 0,134 - /* - * ANGSD Xcontamination will exit with status 134 when the number of SNPs - * is not large enough for estimation. 
- */ - - when: - params.run_nuclear_contamination - - input: - file input from ch_for_nuclear_contamination - - - output: - file '*.X.contamination.out' into ch_from_nuclear_contamination - - script: - """ - samtools index ${input} - angsd -i ${input} -r ${params.contamination_chrom_name}:5000000-154900000 -doCounts 1 -iCounts 1 -minMapQ 30 -minQ 30 -out ${input.baseName}.doCounts - contamination -a ${input.baseName}.doCounts.icnts.gz -h ${baseDir}/assets/angsd_resources/HapMapChrX.gz 2> ${input.baseName}.X.contamination.out - """ + label 'sc_small' + publishDir "${params.outdir}/nuclear_contamination", mode:"copy" + validExitStatus 0,134 + /* + * ANGSD Xcontamination will exit with status 134 when the number of SNPs + * is not large enough for estimation. + */ + + when: + params.run_nuclear_contamination + + input: + file input from ch_for_nuclear_contamination + + output: + file '*.X.contamination.out' into ch_from_nuclear_contamination + + script: + """ + samtools index ${input} + angsd -i ${input} -r ${params.contamination_chrom_name}:5000000-154900000 -doCounts 1 -iCounts 1 -minMapQ 30 -minQ 30 -out ${input.baseName}.doCounts + contamination -a ${input.baseName}.doCounts.icnts.gz -h ${baseDir}/assets/angsd_resources/HapMapChrX.gz 2> ${input.baseName}.X.contamination.out + """ } - process print_nuclear_contamination{ - publishDir "${params.outdir}/nuclear_contamination", mode:"copy" - - when: - params.run_nuclear_contamination - - input: - val 'Contam' from ch_from_nuclear_contamination.collect() - - output: - file 'nuclear_contamination.txt' - - script: - """ - print_x_contamination.py ${Contam.join(' ')} - """ +process print_nuclear_contamination{ + label 'sc_tiny' + publishDir "${params.outdir}/nuclear_contamination", mode:"copy" + + when: + params.run_nuclear_contamination + + input: + val 'Contam' from ch_from_nuclear_contamination.collect() + + output: + file 'nuclear_contamination.txt' + + script: + """ + print_x_contamination.py ${Contam.join(' ')} + 
""" } /* @@ -1982,6 +2013,7 @@ if (params.sexdeterrmine_bedfile == '') { */ process malt { + label 'mc_huge' publishDir "${params.outdir}/metagenomic_classification", mode:"copy" when: @@ -2041,6 +2073,7 @@ if (params.maltextract_taxon_list== '') { } process maltextract { + label 'mc_large' publishDir "${params.outdir}/MaltExtract/", mode:"copy" when: @@ -2090,9 +2123,8 @@ Genotyping tools: - sequenceTools Downstream VCF tools: -- vcf2genome -- gencons -- READ/mcMLKin +- gencons? +- READ/mcMLKin? - popGen output? PLINK? */ @@ -2100,6 +2132,7 @@ Downstream VCF tools: * Step 18a - Output Description HTML */ process output_documentation { + label 'sc_tiny' publishDir "${params.outdir}/Documentation", mode: 'copy' input: @@ -2118,6 +2151,7 @@ process output_documentation { * Step 18b - Parse software version numbers */ process get_software_versions { + label 'sc_tiny' publishDir "${params.outdir}/SoftwareVersions", mode: 'copy' output: @@ -2163,6 +2197,8 @@ process get_software_versions { * Step 18c - MultiQC */ process multiqc { + label 'sc_tiny' + publishDir "${params.outdir}/MultiQC", mode: 'copy' input: From 93dc0368892d33ccd4b3e8f4fbd80108b5c8b2ea Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 09:42:16 +0100 Subject: [PATCH 02/12] Try removing tee system for samtools bam filtering due to instability --- main.nf | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 96c241549..e94b27e11 100644 --- a/main.nf +++ b/main.nf @@ -1242,19 +1242,22 @@ process samtools_filter { """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "bam"){ """ - samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} 
-o ${prefix}.unmapped.bam + samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam samtools index "${size}" ${prefix}.filtered.bam """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "fastq"){ """ - samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam + samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam samtools index "${size}" ${prefix}.filtered.bam samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz rm ${prefix}.unmapped.bam """ } else if("${params.bam_discard_unmapped}" && "${params.bam_unmapped_type}" == "both"){ """ - samtools view -h $bam | tee >(samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) >(samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) + samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.unmapped.bam) + samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${prefix}.filtered.bam) samtools index "${size}" ${prefix}.filtered.bam samtools fastq -tn ${prefix}.unmapped.bam | pigz -p ${task.cpus} > ${prefix}.unmapped.fastq.gz """ From f08d345f7b27152e1063f288fa94c0a4963dc263 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 18 Dec 2019 11:48:25 +0100 Subject: [PATCH 03/12] Added defaultbasequalities to earlier steps in GATK UG --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index e94b27e11..e694969b1 100644 --- a/main.nf +++ b/main.nf @@ -1734,16 +1734,16 @@ ch_gatk_download = Channel.value("download") if (params.gatk_dbsnp == '') """ samtools index -b ${bam} - java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals - java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam + java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals ${defaultbasequalities} + java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam ${defaultbasequalities} java -jar ${jar} -T UnifiedGenotyper -R ${fasta} -I ${bam}.realign.bam -o ${bam}.unifiedgenotyper.vcf -nt ${task.cpus} --genotype_likelihoods_model ${params.gatk_ug_genotype_model} -stand_call_conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} -dcov ${params.gatk_downsample} --output_mode ${params.gatk_ug_out_mode} ${defaultbasequalities} pigz -p ${task.cpus} ${bam}.unifiedgenotyper.vcf """ else if (params.gatk_dbsnp != '') """ samtools index ${bam} - java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals - java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam + java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals ${defaultbasequalities} + java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam ${defaultbasequalities} java -jar ${jar} -T UnifiedGenotyper -R ${fasta} -I ${bam}.realign.bam -o ${bam}.unifiedgenotyper.vcf -nt ${task.cpus} --dbsnp 
${params.gatk_dbsnp} --genotype_likelihoods_model ${params.gatk_ug_genotype_model} -stand_call_conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} -dcov ${params.gatk_downsample} --output_mode ${params.gatk_ug_out_mode} ${defaultbasequalities} pigz -p ${task.cpus} ${bam}.unifiedgenotyper.vcf From 0f0a8c441e3e30252199e0fcded7812d939d3d7d Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 14:57:32 +0100 Subject: [PATCH 04/12] Added error message to account for pmdtools error --- conf/base.config | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/conf/base.config b/conf/base.config index 52fb829a8..bd1e3bc64 100644 --- a/conf/base.config +++ b/conf/base.config @@ -77,6 +77,11 @@ process { errorStrategy = 'ignore' } + // Add 141 ignore due to unclean pipe closing by pmdtools https://github.com/pontussk/PMDtools/issues/7 + withName: pmdtools { + errorStrategy = { task.exitStatus in [141] ? 'ignore' : 'retry' } + } + withName: multiqc { errorStrategy = { task.exitStatus in [143,137] ? 
'retry' : 'ignore' } } From 469dd88615d6798c828f295a1ddab4f6bd367bcc Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 15:36:15 +0100 Subject: [PATCH 05/12] Bump genotyping resource requirements --- main.nf | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index e694969b1..df47daa54 100644 --- a/main.nf +++ b/main.nf @@ -1711,7 +1711,7 @@ ch_gatk_download = Channel.value("download") */ process genotyping_ug { - 'mc_small' + 'mc_medium' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1751,7 +1751,7 @@ ch_gatk_download = Channel.value("download") } process genotyping_hc { - label 'mc_small' + label 'mc_medium' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1787,6 +1787,7 @@ ch_gatk_download = Channel.value("download") * Step 12c: FreeBayes genotyping, should probably add in some options for users to set */ process genotyping_freebayes { + label 'mc_medium' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' From 3f0e8743a89a23dfda0df55a76be022a1368a487 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 20:53:54 +0100 Subject: [PATCH 06/12] Tweaks to improve UG calling - but still only running on one file --- conf/base.config | 1 - main.nf | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/conf/base.config b/conf/base.config index bd1e3bc64..86de03c13 100644 --- a/conf/base.config +++ b/conf/base.config @@ -69,7 +69,6 @@ process { } withName:qualimap{ - cpus = { check_max(8 * task.attempt, 'cpus') } errorStrategy = 'ignore' } diff --git a/main.nf b/main.nf index df47daa54..0d706187a 100644 --- a/main.nf +++ b/main.nf @@ -1643,6 +1643,7 @@ process bam_trim { if ( params.run_genotyping && params.genotyping_source == 'raw' ) { ch_rmdup_for_skipdamagemanipulation.mix(ch_output_from_pmdtools,ch_output_from_bamutils) + .view { it -> "BAM is: $it" } .into { ch_damagemanipulation_for_skipgenotyping; 
ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } ch_rmdupindex_for_skipdamagemanipulation.mix(ch_outputindex_from_pmdtools,ch_outputindex_from_bamutils) @@ -1711,7 +1712,7 @@ ch_gatk_download = Channel.value("download") */ process genotyping_ug { - 'mc_medium' + 'mc_large' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1734,9 +1735,9 @@ ch_gatk_download = Channel.value("download") if (params.gatk_dbsnp == '') """ samtools index -b ${bam} - java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals ${defaultbasequalities} - java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam ${defaultbasequalities} - java -jar ${jar} -T UnifiedGenotyper -R ${fasta} -I ${bam}.realign.bam -o ${bam}.unifiedgenotyper.vcf -nt ${task.cpus} --genotype_likelihoods_model ${params.gatk_ug_genotype_model} -stand_call_conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} -dcov ${params.gatk_downsample} --output_mode ${params.gatk_ug_out_mode} ${defaultbasequalities} + java -Xmx${task.memory.toGiga()}g -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals ${defaultbasequalities} + java -Xmx${task.memory.toGiga()}g -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam ${defaultbasequalities} + java -Xmx${task.memory.toGiga()}g -jar ${jar} -T UnifiedGenotyper -R ${fasta} -I ${bam}.realign.bam -o ${bam}.unifiedgenotyper.vcf -nt ${task.cpus} --genotype_likelihoods_model ${params.gatk_ug_genotype_model} -stand_call_conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} -dcov ${params.gatk_downsample} --output_mode ${params.gatk_ug_out_mode} ${defaultbasequalities} pigz -p ${task.cpus} ${bam}.unifiedgenotyper.vcf """ else if (params.gatk_dbsnp != '') @@ -1745,13 
+1746,12 @@ ch_gatk_download = Channel.value("download") java -jar ${jar} -T RealignerTargetCreator -R ${fasta} -I ${bam} -nt ${task.cpus} -o ${bam}.intervals ${defaultbasequalities} java -jar ${jar} -T IndelRealigner -R ${fasta} -I ${bam} -targetIntervals ${bam}.intervals -o ${bam}.realign.bam ${defaultbasequalities} java -jar ${jar} -T UnifiedGenotyper -R ${fasta} -I ${bam}.realign.bam -o ${bam}.unifiedgenotyper.vcf -nt ${task.cpus} --dbsnp ${params.gatk_dbsnp} --genotype_likelihoods_model ${params.gatk_ug_genotype_model} -stand_call_conf ${params.gatk_call_conf} --sample_ploidy ${params.gatk_ploidy} -dcov ${params.gatk_downsample} --output_mode ${params.gatk_ug_out_mode} ${defaultbasequalities} - pigz -p ${task.cpus} ${bam}.unifiedgenotyper.vcf """ } process genotyping_hc { - label 'mc_medium' + label 'mc_large' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' From 3665b04fa072edfbca1635e0e8cf0a13e2d037d1 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Wed, 18 Dec 2019 21:28:02 +0100 Subject: [PATCH 07/12] Try replacing file with val so single-file channels are not consumed --- main.nf | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/main.nf b/main.nf index 0d706187a..a6313fe23 100644 --- a/main.nf +++ b/main.nf @@ -1720,11 +1720,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'ug' input: - file fasta from fasta_for_indexing - file jar from ch_unifiedgenotyper_jar + val fasta from fasta_for_indexing + val jar from ch_unifiedgenotyper_jar file bam from ch_damagemanipulation_for_genotyping_ug - file fai from ch_fai_for_ug - file dict from ch_dict_for_ug + val fai from ch_fai_for_ug + val dict from ch_dict_for_ug output: file "*vcf.gz" into ch_ug_for_multivcfanalyzer,ch_ug_for_vcf2genome @@ -1759,11 +1759,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'hc' input: - file fasta from 
fasta_for_indexing + val fasta from fasta_for_indexing file bam from ch_damagemanipulation_for_genotyping_hc - file fai from ch_fai_for_hc - file dict from ch_dict_for_hc - file bai from ch_damagemanipulationindex_for_genotyping_hc + val fai from ch_fai_for_hc + val dict from ch_dict_for_hc + val bai from ch_damagemanipulationindex_for_genotyping_hc output: file "*vcf.gz" into ch_vcf_hc @@ -1795,11 +1795,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'freebayes' input: - file fasta from fasta_for_indexing + val fasta from fasta_for_indexing file bam from ch_damagemanipulation_for_genotyping_freebayes - file fai from ch_fai_for_freebayes - file dict from ch_dict_for_freebayes - file bai from ch_damagemanipulationindex_for_genotyping_freebayes + val fai from ch_fai_for_freebayes + val dict from ch_dict_for_freebayes + val bai from ch_damagemanipulationindex_for_genotyping_freebayes output: file "*vcf.gz" into ch_vcf_freebayes @@ -1829,7 +1829,7 @@ process vcf2genome { input: file vcf from ch_ug_for_vcf2genome - file fasta from fasta_for_indexing + val fasta from fasta_for_indexing output: file "*.fasta.gz" From a54ae09c42eecc650383a6e71e801065e8a456ce Mon Sep 17 00:00:00 2001 From: jfy133 Date: Thu, 19 Dec 2019 09:23:22 +0100 Subject: [PATCH 08/12] Try setting references to single value with .collect() --- main.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index a6313fe23..6aa934514 100644 --- a/main.nf +++ b/main.nf @@ -1720,11 +1720,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'ug' input: - val fasta from fasta_for_indexing - val jar from ch_unifiedgenotyper_jar + val fasta from fasta_for_indexing.collect() + val jar from ch_unifiedgenotyper_jar.collect() file bam from ch_damagemanipulation_for_genotyping_ug - val fai from ch_fai_for_ug - val dict from ch_dict_for_ug + val fai from ch_fai_for_ug.collect() + 
val dict from ch_dict_for_ug.collect() output: file "*vcf.gz" into ch_ug_for_multivcfanalyzer,ch_ug_for_vcf2genome From 34dee8cfce5f2bdea8575410f5025fe88be568ba Mon Sep 17 00:00:00 2001 From: jfy133 Date: Thu, 19 Dec 2019 10:57:53 +0100 Subject: [PATCH 09/12] Fixed re-using of FASTA across pipeline with independent channels, and within processes with .collect() --- main.nf | 64 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/main.nf b/main.nf index 6aa934514..7c107f5bf 100644 --- a/main.nf +++ b/main.nf @@ -252,7 +252,7 @@ if ( params.fasta.isEmpty () ){ file zipped_fasta output: - file "*.{fa,fn,fna,fasta}" into fasta_for_indexing + file "*.{fa,fn,fna,fasta}" into ch_fasta_for_bwaindex,ch_fasta_for_faidx,ch_fasta_for_seqdict,ch_fasta_for_circulargenerator,ch_fasta_for_circularmapper,ch_fasta_for_damageprofiler,ch_fasta_for_qualimap,ch_fasta_for_pmdtools,ch_fasta_for_genotyping_ug,ch_fasta_for_genotyping_hc,ch_fasta_for_genotyping_freebayes,ch_fasta_for_vcf2genome,ch_fasta_for_multivcfanalyzer script: rm_zip = zipped_fasta - '.gz' @@ -262,7 +262,10 @@ if ( params.fasta.isEmpty () ){ } } else { - fasta_for_indexing = file("${params.fasta}") + fasta_for_indexing = Channel + .fromPath("${params.fasta}", checkIfExists: true) + .into{ ch_fasta_for_bwaindex; ch_fasta_for_faidx; ch_fasta_for_seqdict; ch_fasta_for_circulargenerator; ch_fasta_for_circularmapper; ch_fasta_for_damageprofiler; ch_fasta_for_qualimap; ch_fasta_for_pmdtools; ch_fasta_for_genotyping_ug; ch_fasta__for_genotyping_hc; ch_fasta_for_genotyping_hc; ch_fasta_for_genotyping_freebayes; ch_fasta_for_vcf2genome; ch_fasta_for_multivcfanalyzer } + lastPath = params.fasta.lastIndexOf(File.separator) bwa_base = params.fasta.substring(lastPath+1) } @@ -657,7 +660,7 @@ process makeBWAIndex { } input: - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_bwaindex file where_are_my_files output: @@ -699,7 +702,7 @@ process 
makeFastaIndex { when: params.fasta_index == '' && !params.fasta.isEmpty() && ( params.mapper == 'bwaaln' || params.mapper == 'bwamem' || params.mapper == 'circularmapper') input: - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_faidx file where_are_my_files output: @@ -745,7 +748,7 @@ process makeSeqDict { when: params.seq_dict == '' && !params.fasta.isEmpty() input: - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_seqdict file where_are_my_files output: @@ -1063,7 +1066,7 @@ process circulargenerator{ when: params.mapper == 'circularmapper' && !params.skip_mapping input: - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_circulargenerator output: file "${prefix}.{amb,ann,bwt,sa,pac}" into ch_circularmapper_indices @@ -1088,7 +1091,7 @@ process circularmapper{ input: set val(name), file(reads) from ch_adapteremoval_for_cm file index from ch_circularmapper_indices.collect() - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_circularmapper.collect() output: file "*.mapped.bam" into ch_output_from_cm @@ -1491,7 +1494,7 @@ process damageprofiler { input: file bam from ch_rmdup_for_damageprofiler - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_damageprofiler.collect() file bai from ch_rmdupindex_for_damageprofiler @@ -1522,7 +1525,7 @@ process qualimap { input: file bam from ch_rmdup_for_qualimap - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_qualimap.collect() output: file "*" into ch_qualimap_results @@ -1558,7 +1561,7 @@ process bedtools { input: file bam from ch_rmdup_for_bedtools - file anno_file from ch_anno_for_bedtools + file anno_file from ch_anno_for_bedtools.collect() output: file "*" @@ -1583,7 +1586,7 @@ process pmdtools { input: file bam from ch_rmdup_for_pmdtools - file fasta from fasta_for_indexing + file fasta from ch_fasta_for_pmdtools.collect() output: file "*.bam" into ch_output_from_pmdtools @@ -1643,7 +1646,6 @@ process bam_trim { if ( 
params.run_genotyping && params.genotyping_source == 'raw' ) { ch_rmdup_for_skipdamagemanipulation.mix(ch_output_from_pmdtools,ch_output_from_bamutils) - .view { it -> "BAM is: $it" } .into { ch_damagemanipulation_for_skipgenotyping; ch_damagemanipulation_for_genotyping_ug; ch_damagemanipulation_for_genotyping_hc; ch_damagemanipulation_for_genotyping_freebayes } ch_rmdupindex_for_skipdamagemanipulation.mix(ch_outputindex_from_pmdtools,ch_outputindex_from_bamutils) @@ -1712,7 +1714,7 @@ ch_gatk_download = Channel.value("download") */ process genotyping_ug { - 'mc_large' + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1720,11 +1722,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'ug' input: - val fasta from fasta_for_indexing.collect() - val jar from ch_unifiedgenotyper_jar.collect() + file fasta from ch_fasta_for_genotyping_ug.collect() + file jar from ch_unifiedgenotyper_jar.collect() file bam from ch_damagemanipulation_for_genotyping_ug - val fai from ch_fai_for_ug.collect() - val dict from ch_dict_for_ug.collect() + file fai from ch_fai_for_ug.collect() + file dict from ch_dict_for_ug.collect() output: file "*vcf.gz" into ch_ug_for_multivcfanalyzer,ch_ug_for_vcf2genome @@ -1751,7 +1753,7 @@ ch_gatk_download = Channel.value("download") } process genotyping_hc { - label 'mc_large' + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1759,11 +1761,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'hc' input: - val fasta from fasta_for_indexing + file fasta from ch_fasta_for_genotyping_hc.collect() file bam from ch_damagemanipulation_for_genotyping_hc - val fai from ch_fai_for_hc - val dict from ch_dict_for_hc - val bai from ch_damagemanipulationindex_for_genotyping_hc + file fai from ch_fai_for_hc.collect() + file dict from ch_dict_for_hc.collect() + file bai from 
ch_damagemanipulationindex_for_genotyping_hc.collect() output: file "*vcf.gz" into ch_vcf_hc @@ -1787,7 +1789,7 @@ ch_gatk_download = Channel.value("download") * Step 12c: FreeBayes genotyping, should probably add in some options for users to set */ process genotyping_freebayes { - label 'mc_medium' + label 'mc_small' tag "${prefix}" publishDir "${params.outdir}/genotyping", mode: 'copy' @@ -1795,11 +1797,11 @@ ch_gatk_download = Channel.value("download") params.run_genotyping && params.genotyping_tool == 'freebayes' input: - val fasta from fasta_for_indexing + file fasta from ch_fasta_for_genotyping_freebayes.collect() file bam from ch_damagemanipulation_for_genotyping_freebayes - val fai from ch_fai_for_freebayes - val dict from ch_dict_for_freebayes - val bai from ch_damagemanipulationindex_for_genotyping_freebayes + file fai from ch_fai_for_freebayes.collect() + file dict from ch_dict_for_freebayes.collect() + file bai from ch_damagemanipulationindex_for_genotyping_freebayes.collect() output: file "*vcf.gz" into ch_vcf_freebayes @@ -1813,12 +1815,10 @@ ch_gatk_download = Channel.value("download") """ } - /* * Step 13: VCF2Genome */ - process vcf2genome { label 'mc_small' tag "${prefix}" @@ -1829,7 +1829,7 @@ process vcf2genome { input: file vcf from ch_ug_for_vcf2genome - val fasta from fasta_for_indexing + file fasta from ch_fasta_for_vcf2genome.collect() output: file "*.fasta.gz" @@ -1867,8 +1867,8 @@ if (params.additional_vcf_files == '') { params.genotyping_tool == 'ug' && params.run_multivcfanalyzer && params.gatk_ploidy == '2' input: - file fasta from fasta_for_indexing - file vcf from ch_vcfs_for_multivcfanalyzer + file fasta from ch_fasta_for_multivcfanalyzer.collect() + file vcf from ch_vcfs_for_multivcfanalyzer.collect() output: file 'fullAlignment.fasta.gz' into ch_output_multivcfanalyzer_fullalignment From 65395bb450323d3f62f857f83bda283242cabe3e Mon Sep 17 00:00:00 2001 From: jfy133 Date: Thu, 19 Dec 2019 11:05:42 +0100 Subject: [PATCH 10/12] 
Update CHANGELOG.md --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 12e1fedd0..db8350977 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#302](https://github.com/nf-core/eager/issues/302) - Added mitochondrial to nuclear ratio calculation * [#302](https://github.com/nf-core/eager/issues/302) - Added VCF2Genome for concensus sequence generation * Fancy new logo from [ZandraFagernas](https://github.com/ZandraFagernas) -* [#286](https://github.com/nf-core/eager/issues/286) Adds pipeline-specific profiles (loaded from nf-core configs) +* [#286](https://github.com/nf-core/eager/issues/286) - Adds pipeline-specific profiles (loaded from nf-core configs) +* [#310](https://github.com/nf-core/eager/issues/310) - Generalises base.config ### `Fixed` From d4dc0c6fb1e16ef26428b8c575cf7eb515f1785c Mon Sep 17 00:00:00 2001 From: jfy133 Date: Thu, 19 Dec 2019 11:56:24 +0100 Subject: [PATCH 11/12] Increased minimum walltime for strip_fastq --- conf/base.config | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/conf/base.config b/conf/base.config index 86de03c13..cde1f9071 100644 --- a/conf/base.config +++ b/conf/base.config @@ -68,6 +68,10 @@ process { cache = false } + withName:strip_fastq { + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withName:qualimap{ errorStrategy = 'ignore' } From ce90aee89bdf9c3af2d3f10e016eab4d46d57d6a Mon Sep 17 00:00:00 2001 From: jfy133 Date: Thu, 19 Dec 2019 12:00:39 +0100 Subject: [PATCH 12/12] Process typo fix for strip_input_fastq fix --- conf/base.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/base.config b/conf/base.config index cde1f9071..e8d193a08 100644 --- a/conf/base.config +++ b/conf/base.config @@ -68,7 +68,7 @@ process { cache = false } - withName:strip_fastq { + withName:strip_input_fastq { time = { check_max( 4.h * 
task.attempt, 'time' ) } }