From 36276e836d1fe86385d6885e19f29417877228b6 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Mon, 20 Jul 2020 15:39:05 +0200 Subject: [PATCH 01/13] Fix booleans being interpeted as strings --- main.nf | 111 +++++++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 57 deletions(-) diff --git a/main.nf b/main.nf index 3a9b11734..e68644d02 100644 --- a/main.nf +++ b/main.nf @@ -41,6 +41,7 @@ def helpMessage() { Reference --fasta Path or URL to a FASTA reference file (required if not iGenome reference). File suffixes can be: '.fa', '.fn', '.fna', '.fasta' --genome Name of iGenomes reference (required if not FASTA reference). + --large_ref Specify to generate more recent '.csi' BAM indices. If your reference genome is larger than 3.5GB, this is recommended due to more efficent data handling with the '.csi' format over the older '.bai'. Output options: --outdir The output directory where the results will be saved. Default: ${params.outdir} @@ -926,9 +927,9 @@ process indexinputbam { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file("*.{bai,csi}") into ch_indexbam_for_filtering script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ - samtools index "${size}" ${bam} + samtools index ${bam} ${size} """ } @@ -1075,10 +1076,10 @@ process adapter_removal { script: base = "${r1.baseName}_L${lane}" //This checks whether we skip trimming and defines a variable respectively - trim_me = params.skip_trim ? '' : "--trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}" - collapse_me = params.skip_collapse ? '' : '--collapse' - preserve5p = params.preserve5p ? '--preserve5p' : '' - mergedonly = params.mergedonly ? "Y" : "N" + def trim_me = params.skip_trim ? '' : "--trimns --trimqualities --adapter1 ${params.clip_forward_adaptor} --adapter2 ${params.clip_reverse_adaptor} --minlength ${params.clip_readlength} --minquality ${params.clip_min_read_quality} --minadapteroverlap ${params.min_adap_overlap}" + def collapse_me = params.skip_collapse ? '' : '--collapse' + def preserve5p = params.preserve5p ? '--preserve5p' : '' + def mergedonly = params.mergedonly ? "Y" : "N" //PE mode, dependent on trim_me and collapse_me the respective procedure is run or not :-) if ( seqtype == 'PE' && !params.skip_collapse && !params.skip_trim ){ @@ -1334,7 +1335,7 @@ process bwa { params.mapper == 'bwaaln' script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' fasta = "${index}/${bwa_base}" //PE data without merging, PE data without any AR applied @@ -1343,14 +1344,14 @@ process bwa { bwa aln -t ${task.cpus} $fasta ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r1.sai bwa aln -t ${task.cpus} $fasta ${r2} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.r2.sai bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" $fasta ${libraryid}.r1.sai ${libraryid}.r2.sai ${r1} ${r2} | samtools sort -@ ${task.cpus} -O bam - > ${libraryid}_"${seqtype}".mapped.bam - samtools index "${size}" "${libraryid}"_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } else { //PE collapsed, or SE data """ bwa aln -t ${task.cpus} ${fasta} ${r1} -n ${params.bwaalnn} -l ${params.bwaalnl} -k ${params.bwaalnk} -f ${libraryid}.sai bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" $fasta ${libraryid}.sai $r1 | samtools sort -@ ${task.cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" "${libraryid}"_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1375,17 +1376,17 @@ process bwamem { script: fasta = "${index}/${bwa_base}" - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' if (!params.single_end && params.skip_collapse){ """ bwa mem -t ${task.cpus} $fasta $r1 $r2 -R "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" -@ ${task.cpus} "${libraryid}".mapped.bam + samtools index ${size} -@ ${task.cpus} "${libraryid}".mapped.bam """ } else { """ bwa mem -t ${task.cpus} $fasta $r1 -R "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" | samtools sort -@ ${task.cpus} -O bam - > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam + samtools index -@ ${task.cpus} "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1437,10 +1438,9 @@ process circularmapper{ params.mapper == 'circularmapper' script: - filter = "${params.circularfilter}" ? '' : '-f true -x false' + def filter = params.circularfilter ? '' : '-f true -x false' elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" - - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' if (!params.single_end && params.skip_collapse ){ """ @@ -1449,7 +1449,7 @@ process circularmapper{ bwa sampe -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" $elongated_root ${libraryid}.r1.sai ${libraryid}.r2.sai $r1 $r2 > tmp.out realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > ${libraryid}_"${seqtype}".mapped.bam - samtools index "${size}" ${libraryid}_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } else { """ @@ -1457,7 +1457,7 @@ process circularmapper{ bwa samse -r "@RG\\tID:ILLUMINA-${libraryid}\\tSM:${libraryid}\\tPL:illumina" $elongated_root ${libraryid}.sai $r1 > tmp.out realignsamfile -e ${params.circularextension} -i tmp.out -r $fasta $filter samtools sort -@ ${task.cpus} -O bam tmp_realigned.bam > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" "${libraryid}"_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1480,7 +1480,7 @@ process bowtie2 { params.mapper == 'bowtie2' script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' fasta = "${index}/${bt2_base}" trim5 = "${params.bt2_trim5}" != 0 ? "--trim5 ${params.bt2_trim5}" : "" trim3 = "${params.bt2_trim3}" != 0 ? "--trim3 ${params.bt2_trim3}" : "" @@ -1524,13 +1524,13 @@ process bowtie2 { if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){ """ bowtie2 -x ${fasta} -1 ${r1} -2 ${r2} -p ${task.cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} 2> "${libraryid}"_bt2.log | samtools sort -@ ${task.cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" "${libraryid}"_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } else { //PE collapsed, or SE data """ bowtie2 -x ${fasta} -U ${r1} -p ${task.cpus} ${sensitivity} ${bt2n} ${bt2l} ${trim5} ${trim3} 2> "${libraryid}"_bt2.log | samtools sort -@ ${task.cpus} -O bam > "${libraryid}"_"${seqtype}".mapped.bam - samtools index "${size}" "${libraryid}"_"${seqtype}".mapped.bam + samtools index "${libraryid}"_"${seqtype}".mapped.bam ${size} """ } @@ -1657,12 +1657,12 @@ ch_branched_for_seqtypemerge = ch_mapping_for_seqtype_merging tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*_seqtypemerged_rg.bam"), file("*_seqtypemerged_rg*.{bai,csi}") into ch_seqtypemerge_for_filtering script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ samtools merge ${libraryid}_seqtypemerged.bam ${bam} ## Have to set validation as lenient because of BWA issue: "I see a read stands out the end of a chromosome and is flagged as unmapped (flag 0x4). [...]" http://bio-bwa.sourceforge.net/ picard AddOrReplaceReadGroups I=${libraryid}_seqtypemerged.bam O=${libraryid}_seqtypemerged_rg.bam RGID=1 RGLB="${libraryid}_seqtypemerged" RGPL=illumina RGPU=4410 RGSM="${libraryid}_seqtypemerged" VALIDATION_STRINGENCY=LENIENT - samtools index "${size}" ${libraryid}_seqtypemerged_rg.bam + samtools index ${libraryid}_seqtypemerged_rg.bam ${size} """ } @@ -1716,29 +1716,29 @@ process samtools_filter { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' if ( "${params.bam_unmapped_type}" == "keep" ) { """ samtools view -h -b $bam -@ ${task.cpus} -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam - samtools index "${size}" ${libraryid}.filtered.bam + samtools index ${libraryid}.filtered.bam ${size} """ } else if("${params.bam_unmapped_type}" == "discard"){ """ samtools view -h -b $bam -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam - samtools index "${size}" ${libraryid}.filtered.bam + samtools index ${libraryid}.filtered.bam ${size} """ } else if("${params.bam_unmapped_type}" == "bam"){ """ samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam - samtools index "${size}" ${libraryid}.filtered.bam + samtools index ${libraryid}.filtered.bam ${size} """ } else if("${params.bam_unmapped_type}" == "fastq"){ """ samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam - samtools index "${size}" ${libraryid}.filtered.bam + samtools index ${libraryid}.filtered.bam ${size} samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz rm ${libraryid}.unmapped.bam """ @@ -1746,7 +1746,7 @@ process samtools_filter { """ samtools view -h $bam | samtools view - -@ ${task.cpus} -f4 -o ${libraryid}.unmapped.bam samtools view -h $bam | samtools view - -@ ${task.cpus} -F4 -q ${params.bam_mapping_quality_threshold} -o ${libraryid}.filtered.bam - samtools index "${size}" ${libraryid}.filtered.bam + samtools index ${libraryid}.filtered.bam ${size} samtools fastq -tn ${libraryid}.unmapped.bam | pigz -p ${task.cpus} > ${libraryid}.unmapped.fastq.gz """ } @@ -1856,8 +1856,8 @@ process dedup{ script: outname = "${bam.baseName}" - treat_merged="${params.dedup_all_merged}" ? '-m' : '' - size = "${params.large_ref}" ? '-c' : '' + def treat_merged = params.dedup_all_merged ? '-m' : '' + def size = params.large_ref ? '-c' : '' """ ## To make sure direct BAMs have a clean name @@ -1868,11 +1868,11 @@ process dedup{ dedup -Xmx${task.memory.toGiga()}g -i ${libraryid}.bam $treat_merged -o . -u mv *.log dedup.log samtools sort -@ ${task.cpus} "${libraryid}"_rmdup.bam -o "${libraryid}"_rmdup.bam - samtools index "${size}" "${libraryid}"_rmdup.bam + samtools index "${libraryid}"_rmdup.bam ${size} """ } -process markDup{ +process markduplicates{ label 'mc_small' tag "${outname}" publishDir "${params.outdir}/deduplication/", mode: 'copy', @@ -1890,10 +1890,10 @@ process markDup{ script: outname = "${bam.baseName}" - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M MarkDuplicates INPUT=$bam OUTPUT=${libraryid}_rmdup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${libraryid}_rmdup.metrics" VALIDATION_STRINGENCY=SILENT - samtools index "${size}" ${libraryid}_rmdup.bam + samtools index ${libraryid}_rmdup.bam ${size} """ } @@ -1946,12 +1946,12 @@ process library_merge { tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, file("*_libmerged_rg_rmdup.bam"), file("*_libmerged_rg_rmdup.bam.{bai,csi}") into ch_output_from_librarymerging script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ samtools merge ${samplename}_libmerged_rmdup.bam ${bam} ## Have to set validation as lenient because of BWA issue: "I see a read stands out the end of a chromosome and is flagged as unmapped (flag 0x4). [...]" http://bio-bwa.sourceforge.net/ picard AddOrReplaceReadGroups I=${samplename}_libmerged_rmdup.bam O=${samplename}_libmerged_rg_rmdup.bam RGID=1 RGLB="${samplename}_merged" RGPL=illumina RGPU=4410 RGSM="${samplename}_merged" VALIDATION_STRINGENCY=LENIENT - samtools index "${size}" ${samplename}_libmerged_rg_rmdup.bam + samtools index ${samplename}_libmerged_rg_rmdup.bam ${size} """ } @@ -2089,13 +2089,13 @@ process pmdtools { } else { snpcap = '' } - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ #Run Filtering step samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --threshold ${params.pmdtools_threshold} $treatment $snpcap --header | samtools view -@ ${task.cpus} -Sb - > "${libraryid}".pmd.bam #Run Calc Range step samtools calmd -b $bam $fasta | samtools view -h - | pmdtools --deamination --range ${params.pmdtools_range} $treatment $snpcap -n ${params.pmdtools_max_reads} > "${libraryid}".cpg.range."${params.pmdtools_range}".txt - samtools index "${size}" ${libraryid}.pmd.bam + samtools index ${libraryid}.pmd.bam ${size} """ } @@ -2134,12 +2134,12 @@ process bam_trim { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.trimmed.bam"), file("*.{bai,csi}") into ch_trimmed_from_bamutils script: - softclip = "${params.bamutils_softclip}" ? '-c' : '' - size = "${params.large_ref}" ? '-c' : '' + def softclip = params.bamutils_softclip ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ bam trimBam $bam tmp.bam -L ${params.bamutils_clip_left} -R ${params.bamutils_clip_right} ${softclip} samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam - samtools index "${size}" ${libraryid}.trimmed.bam + samtools index ${libraryid}.trimmed.bam ${size} """ } @@ -2182,11 +2182,11 @@ process additional_library_merge { tuple samplename, val("${samplename}_libmerged"), lane, seqtype, organism, strandedness, udg, file("*_libmerged_rg_add.bam"), file("*_libmerged_rg_add.bam.{bai,csi}") into ch_output_from_trimmerge script: - size = "${params.large_ref}" ? '-c' : '' + def size = params.large_ref ? '-c' : '' """ samtools merge ${samplename}_libmerged_add.bam ${bam} picard AddOrReplaceReadGroups I=${samplename}_libmerged_add.bam O=${samplename}_libmerged_rg_add.bam RGID=1 RGLB="${samplename}_additionalmerged" RGPL=illumina RGPU=4410 RGSM="${samplename}_additionalmerged" VALIDATION_STRINGENCY=LENIENT - samtools index "${size}" ${samplename}_libmerged_rg_add.bam + samtools index ${samplename}_libmerged_rg_add.bam ${size} """ } @@ -2212,8 +2212,7 @@ process qualimap { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*") into ch_qualimap_results script: - snpcap = '' - if(params.snpcapture) snpcap = "-gff ${params.bedfile}" + def snpcap = params.snpcapture ? "-gff ${params.bedfile}" : '' """ qualimap bamqc -bam $bam -nt ${task.cpus} -outdir . -outformat "HTML" ${snpcap} """ @@ -2478,7 +2477,7 @@ if (params.pileupcaller_snpfile.isEmpty ()) { angsd_glformat = "3"; break } - angsd_fasta = !params.angsd_createfasta ? '' : params.angsd_fastamethod == 'random' ? '-doFasta 1 -doCounts 1' : '-doFasta 2 -doCounts 1' + def angsd_fasta = !params.angsd_createfasta ? '' : params.angsd_fastamethod == 'random' ? '-doFasta 1 -doCounts 1' : '-doFasta 2 -doCounts 1' """ echo ${bam} > bam.filelist mkdir angsd @@ -2553,7 +2552,7 @@ if (params.additional_vcf_files == '') { file('MultiVCFAnalyzer.json') optional true into ch_multivcfanalyzer_for_multiqc script: - write_freqs = "$params.write_allele_frequencies" ? "T" : "F" + def write_freqs = params.write_allele_frequencies ? "T" : "F" """ gunzip -f *.vcf.gz multivcfanalyzer ${params.snp_eff_results} ${fasta} ${params.reference_gff_annotations} . ${write_freqs} ${params.min_genotype_quality} ${params.min_base_coverage} ${params.min_allele_freq_hom} ${params.min_allele_freq_het} ${params.reference_gff_exclude} *.vcf @@ -2612,7 +2611,7 @@ process sex_deterrmine { params.run_sexdeterrmine script: - def filter = bed.name != 'NO_FILE' ? "-b $bed" : '' + filter = bed.name != 'NO_FILE' ? "-b $bed" : '' """ for i in *.bam; do @@ -2660,7 +2659,6 @@ process print_nuclear_contamination{ output: file 'nuclear_contamination.txt' - file 'nuclear_contamination_mqc.json' into ch_nuclear_contamination_for_multiqc script: """ @@ -2770,13 +2768,13 @@ process maltextract { file "results/*_Wevid.json" optional true into ch_hops_for_multiqc script: - destack = params.maltextract_destackingoff ? "--destackingOff" : "" - downsam = params.maltextract_downsamplingoff ? "--downSampOff" : "" - dupremo = params.maltextract_duplicateremovaloff ? "--dupRemOff" : "" - matches = params.maltextract_matches ? "--matches" : "" - megsum = params.maltextract_megansummary ? "--meganSummary" : "" - topaln = params.maltextract_topalignment ? "--useTopAlignment" : "" - ss = params.single_stranded ? "--singleStranded" : "" + def destack = params.maltextract_destackingoff ? "--destackingOff" : "" + def downsam = params.maltextract_downsamplingoff ? "--downSampOff" : "" + def dupremo = params.maltextract_duplicateremovaloff ? "--dupRemOff" : "" + def matches = params.maltextract_matches ? "--matches" : "" + def megsum = params.maltextract_megansummary ? "--meganSummary" : "" + def topaln = params.maltextract_topalignment ? "--useTopAlignment" : "" + def ss = params.single_stranded ? "--singleStranded" : "" """ MaltExtract \ -Xmx${task.memory.toGiga()}g \ @@ -2984,7 +2982,6 @@ process multiqc { file ('malt/*') from ch_malt_for_multiqc.collect().ifEmpty([]) file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([]) file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) - file ('nuclear_contamination/*') from ch_nuclear_contamination_for_multiqc.collect().ifEmpty([]) file logo from ch_eager_logo file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml") From 3c6c02098f97a8fd637dd3b6059543ed77a9351b Mon Sep 17 00:00:00 2001 From: TCLamnidis Date: Mon, 20 Jul 2020 16:34:22 +0200 Subject: [PATCH 02/13] Split trimming parameter of half and non UDG libraries --- main.nf | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index 769a07f37..e59b114e5 100644 --- a/main.nf +++ b/main.nf @@ -128,8 +128,10 @@ def helpMessage() { BAM Trimming --run_trim_bam Turn on BAM trimming, for example for for full-UDG or half-UDG protocols. - --bamutils_clip_left Specify the number of bases to clip off reads from 'left' end of read. Default: ${params.bamutils_clip_left} - --bamutils_clip_right Specify the number of bases to clip off reads from 'right' end of read. Default: ${params.bamutils_clip_right} + --bamutils_clip_half_udg_left Specify the number of bases to clip off reads from 'left' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_left} + --bamutils_clip_half_udg_right Specify the number of bases to clip off reads from 'right' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_right} + --bamutils_clip_non_udg_left Specify the number of bases to clip off reads from 'left' end of read for non-UDG libaries. Default: ${params.bamutils_clip_non_udg_left} + --bamutils_clip_non_udg_right Specify the number of bases to clip off reads from 'right' end of read for non-UDG libaries. Default: ${params.bamutils_clip_non_udg_right} --bamutils_softclip Turn on using softclip instead of hard masking. Genotyping @@ -2131,13 +2133,15 @@ process bam_trim { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file(bam), file(bai) from ch_bamutils_decision.totrim output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.trimmed.bam"), file("*.{bai,csi}") into ch_trimmed_from_bamutils + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.trimmed.bam"), file("*.trimmed.bam.{bai,csi}") into ch_trimmed_from_bamutils script: softclip = "${params.bamutils_softclip}" ? '-c' : '' size = "${params.large_ref}" ? '-c' : '' + left_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_left}" : "${params.bamutils_clip_non_udg_left}" + right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_non_udg_right}" """ - bam trimBam $bam tmp.bam -L ${params.bamutils_clip_left} -R ${params.bamutils_clip_right} ${softclip} + bam trimBam $bam tmp.bam -L ${left_clipping} -R ${right_clipping} ${softclip} samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam samtools index "${size}" ${libraryid}.trimmed.bam """ From af6ba7233e3fcf84cdb1e81442fd1f3035c26e8c Mon Sep 17 00:00:00 2001 From: TCLamnidis Date: Mon, 20 Jul 2020 16:35:04 +0200 Subject: [PATCH 03/13] Added default parameters for trimming of half and non udg libraries --- nextflow.config | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nextflow.config b/nextflow.config index 1c12a24be..506494f2c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -108,8 +108,10 @@ params { //bamUtils trimbam settings run_trim_bam = false - bamutils_clip_left = 1 - bamutils_clip_right = 1 + bamutils_clip_half_udg_left = 1 + bamutils_clip_half_udg_right = 1 + bamutils_clip_non_udg_left = 1 + bamutils_clip_non_udg_right = 1 bamutils_softclip = false //Mapped read stripping from input FASTQ From 442b6bd4e545da2d3c6329f3a709f5ad29d79965 Mon Sep 17 00:00:00 2001 From: jfy133 Date: Tue, 21 Jul 2020 11:17:13 +0200 Subject: [PATCH 04/13] Correct test for now working dedup --- .github/workflows/ci.yml | 2 +- docs/usage.md | 17 ++++++++++++++--- main.nf | 4 ++++ 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d007d0742..8dde64a58 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -109,7 +109,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_discard_unmapped --bam_unmapped_type 'fastq' - name: DEDUPLICATION Test with dedup run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mergedonly --dedupper 'dedup' - name: GENOTYPING_HC Test running GATK HaplotypeCaller run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker --run_genotyping --genotyping_tool 'hc' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_hc_emitrefconf 'BP_RESOLUTION' diff --git a/docs/usage.md b/docs/usage.md index 9570d0118..7a8340cac 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -588,7 +588,11 @@ Turns off quality based trimming at the 5p end of reads when any of the --trimns #### `--mergedonly` -This flag means that only merged reads are sent downstream for analysis. Singletons (i.e. reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded. You may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). +Specify that only merged reads are sent downstream for analysis. + +Singletons (i.e. reads missing a pair), or un-merged reads (where there wasn't sufficient overlap) are discarded. + +You may want to use this if you want ensure only the best quality reads for your analysis, but with the penalty of potentially losing still valid data (even if some reads have slightly lower quality). It is highly recommended when using `--dedupper 'dedup'` (see below). ### Read Mapping Parameters @@ -707,11 +711,18 @@ If using TSV input, deduplication is performed library, i.e. after lane merging. #### `--dedupper` -Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool 'dedup' ([Pelter et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered. This utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data. +Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool 'dedup' ([Pelter et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered. + +This utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data. + +Note that if you run without the `--mergedonly` flag for AdapterRemoval, DeDup will +likely fail. If you absolutely want to use both PE and SE data, you can supply the +`--dedup_all_merged` flag to consider singletons to also be merged paired-end reads. This +may result in over-zealous deduplication. #### `--dedup_all_merged` -Sets DeDup to treat all reads as merged reads. This is useful if reads are for example not prefixed with `M_` in all cases. +Sets DeDup to treat all reads as merged reads. This is useful if reads are for example not prefixed with `M_` in all cases. Therefore, this can be used as a workaround when also using a mixture of paired-end and single-end data, however this is not recommended (see above). ### Library Complexity Estimation Parameters diff --git a/main.nf b/main.nf index e68644d02..cd9ebfaeb 100644 --- a/main.nf +++ b/main.nf @@ -383,6 +383,10 @@ if (params.dedupper != 'dedup' && params.dedupper != 'markduplicates') { exit 1, "[nf-core/eager] error: Selected deduplication tool is not recognised. Options: 'dedup' or 'markduplicates'. You gave: --dedupper '${params.dedupper}'." } +if (params.dedupper == 'dedup' && !params.mergedonly) { + log.warn "[nf-core/eager] Warning: you are using DeDup but without specifying --mergedonly for AdapterRemoval, dedup will likely fail! See documentation for more information." +} + // Genotyping validation if (params.run_genotyping){ if (params.genotyping_tool != 'ug' && params.genotyping_tool != 'hc' && params.genotyping_tool != 'freebayes' && params.genotyping_tool != 'pileupcaller' && params.genotyping_tool != 'angsd' ) { From 8d51340cec3396404ece35920bcf633f08d8043b Mon Sep 17 00:00:00 2001 From: jfy133 Date: Tue, 21 Jul 2020 11:20:39 +0200 Subject: [PATCH 05/13] Linting --- docs/usage.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 7a8340cac..5a767c58a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -711,9 +711,9 @@ If using TSV input, deduplication is performed library, i.e. after lane merging. #### `--dedupper` -Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool 'dedup' ([Pelter et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered. +Sets the duplicate read removal tool. By default uses `markduplicates` from Picard. Alternatively an ancient DNA specific read deduplication tool 'dedup' ([Pelter et al. 2016](http://dx.doi.org/10.1186/s13059-016-0918-z)) is offered. -This utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data. +This utilises both ends of paired-end data to remove duplicates (i.e. true exact duplicates, as markduplicates will over-zealously deduplicate anything with the same starting position even if the ends are different). DeDup should only be used solely on paired-end data otherwise suboptimal deduplication can occur if applied to either single-end or a mix of single-end/paired-end data. Note that if you run without the `--mergedonly` flag for AdapterRemoval, DeDup will likely fail. If you absolutely want to use both PE and SE data, you can supply the From 762c8e580c2c4948d7be33558d0475750d825ddb Mon Sep 17 00:00:00 2001 From: jfy133 Date: Tue, 21 Jul 2020 12:59:29 +0200 Subject: [PATCH 06/13] As test_tsv mixes SE/PE switch to dedup-based workaround --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8dde64a58..a8487da41 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -109,7 +109,7 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_mapping_quality_threshold 37 --bam_discard_unmapped --bam_unmapped_type 'fastq' - name: DEDUPLICATION Test with dedup run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --mergedonly --dedupper 'dedup' + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --dedupper 'dedup' --dedup_all_merged - name: GENOTYPING_HC Test running GATK HaplotypeCaller run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_fna,docker --run_genotyping --genotyping_tool 'hc' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_hc_emitrefconf 'BP_RESOLUTION' From 153464d8de296b0042c4dee960ea2265d461f73a Mon Sep 17 00:00:00 2001 From: TCLamnidis Date: Tue, 21 Jul 2020 15:23:03 +0200 Subject: [PATCH 07/13] added def before if else parameter definitions --- main.nf | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/main.nf b/main.nf index 36fcd3679..e2f8fa6d5 100644 --- a/main.nf +++ b/main.nf @@ -1338,7 +1338,7 @@ process bwa { script: def size = params.large_ref ? '-c' : '' - fasta = "${index}/${bwa_base}" + def fasta = "${index}/${bwa_base}" //PE data without merging, PE data without any AR applied if ( seqtype == 'PE' && ( params.skip_collapse || params.skip_adapterremoval ) ){ @@ -1377,7 +1377,7 @@ process bwamem { params.mapper == 'bwamem' script: - fasta = "${index}/${bwa_base}" + def fasta = "${index}/${bwa_base}" def size = params.large_ref ? '-c' : '' if (!params.single_end && params.skip_collapse){ @@ -1441,7 +1441,7 @@ process circularmapper{ script: def filter = params.circularfilter ? '' : '-f true -x false' - elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" + def elongated_root = "${fasta.baseName}_${params.circularextension}.fasta" def size = params.large_ref ? '-c' : '' if (!params.single_end && params.skip_collapse ){ @@ -1483,11 +1483,11 @@ process bowtie2 { script: def size = params.large_ref ? '-c' : '' - fasta = "${index}/${bt2_base}" - trim5 = "${params.bt2_trim5}" != 0 ? "--trim5 ${params.bt2_trim5}" : "" - trim3 = "${params.bt2_trim3}" != 0 ? "--trim3 ${params.bt2_trim3}" : "" - bt2n = "${params.bt2n}" != 0 ? "-N ${params.bt2n}" : "" - bt2l = "${params.bt2l}" != 0 ? "-L ${params.bt2l}" : "" + def fasta = "${index}/${bt2_base}" + def trim5 = params.bt2_trim5 != 0 ? "--trim5 ${params.bt2_trim5}" : "" + def trim3 = params.bt2_trim3 != 0 ? "--trim3 ${params.bt2_trim3}" : "" + def bt2n = params.bt2n != 0 ? "-N ${params.bt2n}" : "" + def bt2l = params.bt2l != 0 ? "-L ${params.bt2l}" : "" if ( "${params.bt2_alignmode}" == "end-to-end" ) { switch ( "${params.bt2_sensitivity}" ) { @@ -1857,7 +1857,7 @@ process dedup{ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("${libraryid}_rmdup.bam"), file("*.{bai,csi}") into ch_output_from_dedup, ch_dedup_for_libeval script: - outname = "${bam.baseName}" + def outname = "${bam.baseName}" def treat_merged = params.dedup_all_merged ? '-m' : '' def size = params.large_ref ? '-c' : '' @@ -1891,7 +1891,7 @@ process markduplicates{ tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("${libraryid}_rmdup.bam"), file("*.{bai,csi}") into ch_output_from_markdup, ch_markdup_for_libeval script: - outname = "${bam.baseName}" + def outname = "${bam.baseName}" def size = params.large_ref ? '-c' : '' """ picard -Xmx${task.memory.toMega()}M -Xms${task.memory.toMega()}M MarkDuplicates INPUT=$bam OUTPUT=${libraryid}_rmdup.bam REMOVE_DUPLICATES=TRUE AS=TRUE METRICS_FILE="${libraryid}_rmdup.metrics" VALIDATION_STRINGENCY=SILENT @@ -2138,8 +2138,8 @@ process bam_trim { script: def softclip = params.bamutils_softclip ? '-c' : '' def size = params.large_ref ? '-c' : '' - left_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_left}" : "${params.bamutils_clip_non_udg_left}" - right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_non_udg_right}" + def left_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_left}" : "${params.bamutils_clip_non_udg_left}" + def right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_non_udg_right}" """ bam trimBam $bam tmp.bam -L ${left_clipping} -R ${right_clipping} ${softclip} samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam @@ -2291,7 +2291,7 @@ if ( params.gatk_ug_jar != '' ) { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*realign.{bam,bai}") optional true script: - defaultbasequalities = params.gatk_ug_defaultbasequalities == '' ? '' : " --defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" + def defaultbasequalities = params.gatk_ug_defaultbasequalities == '' ? '' : " --defaultBaseQualities ${params.gatk_ug_defaultbasequalities}" def keep_realign = params.gatk_ug_keep_realign_bam ? "" : "rm ${samplename}.realign.bam" def index_realign = params.gatk_ug_keep_realign_bam ? "samtools index ${samplename}.realign.bam" : "" if (params.gatk_dbsnp == '') @@ -2373,7 +2373,7 @@ if ( params.gatk_ug_jar != '' ) { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*vcf.gz") script: - skip_coverage = "${params.freebayes_g}" == 0 ? "" : "-g ${params.freebayes_g}" + def skip_coverage = "${params.freebayes_g}" == 0 ? "" : "-g ${params.freebayes_g}" """ freebayes -f ${fasta} -p ${params.freebayes_p} -C ${params.freebayes_C} ${skip_coverage} ${bam} > ${samplename}.freebayes.vcf pigz -p ${task.cpus} ${samplename}.freebayes.vcf @@ -2431,11 +2431,11 @@ if (params.pileupcaller_snpfile.isEmpty ()) { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("pileupcaller.${strandedness}.*") script: - transitions_mode = strandedness == "single" ? "" : "${params.pileupcaller_transitions_mode}" == 'SkipTransitions' ? "--skipTransitions" : "${params.pileupcaller_transitions_mode}" == 'TransitionsMissing' ? "--transitionsMissing" : "" - caller = "--${params.pileupcaller_method}" - ssmode = strandedness == "single" ? "--singleStrandMode" : "" - bam_list = bam.flatten().join(" ") - sample_names = samplename.flatten().join(",") + def transitions_mode = strandedness == "single" ? "" : "${params.pileupcaller_transitions_mode}" == 'SkipTransitions' ? "--skipTransitions" : "${params.pileupcaller_transitions_mode}" == 'TransitionsMissing' ? "--transitionsMissing" : "" + def caller = "--${params.pileupcaller_method}" + def ssmode = strandedness == "single" ? "--singleStrandMode" : "" + def bam_list = bam.flatten().join(" ") + def sample_names = samplename.flatten().join(",") """ samtools mpileup -B -q 30 -Q 30 -l ${bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} -f ${snp} -e pileupcaller.${strandedness} """ @@ -2511,8 +2511,8 @@ process vcf2genome { tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.fasta.gz") script: - out = "${params.vcf2genome_outfile}" == '' ? "${samplename}.fasta" : "${params.vcf2genome_outfile}" - fasta_head = "${params.vcf2genome_header}" == '' ? "${samplename}" : "${params.vcf2genome_header}" + def out = "${params.vcf2genome_outfile}" == '' ? "${samplename}.fasta" : "${params.vcf2genome_outfile}" + def fasta_head = "${params.vcf2genome_header}" == '' ? "${samplename}" : "${params.vcf2genome_header}" """ pigz -f -d -p ${task.cpus} *.vcf.gz vcf2genome -draft ${out}.fasta -draftname "${fasta_head}" -in ${vcf.baseName} -minc ${params.vcf2genome_minc} -minfreq ${params.vcf2genome_minfreq} -minq ${params.vcf2genome_minq} -ref ${fasta} -refMod ${out}_refmod.fasta -uncertain ${out}_uncertainy.fasta @@ -2615,7 +2615,7 @@ process sex_deterrmine { params.run_sexdeterrmine script: - filter = bed.name != 'NO_FILE' ? "-b $bed" : '' + def filter = bed.name != 'NO_FILE' ? "-b $bed" : '' """ for i in *.bam; do @@ -2995,9 +2995,9 @@ process multiqc { file "*_data" script: - rtitle = custom_runName ? "--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' - custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' + def rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + def rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + def custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : '' """ multiqc -f $rtitle $rfilename $multiqc_config $custom_config_file . """ From 99ed47315feadac25c93c8b34581f9eaed467c3d Mon Sep 17 00:00:00 2001 From: TCLamnidis Date: Tue, 21 Jul 2020 15:55:25 +0200 Subject: [PATCH 08/13] Updated CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b2f41843..f39e9f8c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. * [#504] Removed sexdeterrmine-snps plot from MultiQC report. * Nuclear contamination results are now shown in the MultiQC report. * Tutorial on how to use profiles for reproducible science (i.e. parameter sharing between different groups) +* Added flexible trimming of bams by library type. 'half' and 'none' UDG libraries can now be trimmed differentially within a single eager run. ### `Fixed` From 1dcda18212bc71616338aaba7ba896f02ae2bc7d Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Tue, 21 Jul 2020 16:08:23 +0200 Subject: [PATCH 09/13] Update usage.md --- docs/usage.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 9570d0118..86f055c72 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -776,13 +776,17 @@ More documentation can be seen in the [bamUtil documentation](https://genome.sph Turns on the BAM trimming method. Trims off `[n]` bases from reads in the deduplicated BAM file Damage assessment in PMDTools or DamageProfiler remains untouched, as data is routed through this independently. BAM trimming os typically performed to reduce errors during genotyping that can be caused by aDNA damage. -BAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should already have all damage removed. The amount of bases that will be trimmed off (see `--bamutils_clip_left` / `--bamutils_clip_right`) will be the same regardless whether `'none'` of `'half'`. +BAM trimming will only be performed on libraries indicated as `--udg_type 'none'` or `--udg_type 'half'`. Complete UDG treatment ('full') should have removed all damage. The amount of bases that will be trimmed off can be set separately for libraries with `--udg_type` `'none'` and `'half'` (see `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` / `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right`). > Note: additional artefacts such as bar-codes or adapters that could potentially also be trimmed should be removed prior mapping. -#### `--bamutils_clip_left` / `--bamutils_clip_right` +#### `--bamutils_clip_half_udg_left` / `--bamutils_clip_half_udg_right` -Default set to `1` and clips off one base of the left or right side of reads. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read). +Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `half`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read). + +#### `--bamutils_clip_none_udg_left` / `--bamutils_clip_none_udg_right` + +Default set to `1` and clips off one base of the left or right side of reads from libraries whose UDG treatment is set to `none`. Note that reverse reads will automatically be clipped off at the reverse side with this (automatically reverses left and right for the reverse read). #### `--bamutils_softclip` From 956b656a4bdd2df37cbd5aacde00c0cf7a4482e8 Mon Sep 17 00:00:00 2001 From: TCLamnidis Date: Tue, 21 Jul 2020 16:13:31 +0200 Subject: [PATCH 10/13] renamed bamutils_clip_non_udg_* -> bamutils_clip_none_udg_* for cinsistency with udg_type specification. --- main.nf | 16 ++++++++-------- nextflow.config | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index e2f8fa6d5..736b0e9ca 100644 --- a/main.nf +++ b/main.nf @@ -128,12 +128,12 @@ def helpMessage() { --anno_file Path to GFF or BED file containing positions of features in reference file (--fasta). Path should be enclosed in quotes. BAM Trimming - --run_trim_bam Turn on BAM trimming, for example for for full-UDG or half-UDG protocols. - --bamutils_clip_half_udg_left Specify the number of bases to clip off reads from 'left' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_left} - --bamutils_clip_half_udg_right Specify the number of bases to clip off reads from 'right' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_right} - --bamutils_clip_non_udg_left Specify the number of bases to clip off reads from 'left' end of read for non-UDG libaries. Default: ${params.bamutils_clip_non_udg_left} - --bamutils_clip_non_udg_right Specify the number of bases to clip off reads from 'right' end of read for non-UDG libaries. Default: ${params.bamutils_clip_non_udg_right} - --bamutils_softclip Turn on using softclip instead of hard masking. + --run_trim_bam Turn on BAM trimming, for example for for full-UDG or half-UDG protocols. + --bamutils_clip_half_udg_left Specify the number of bases to clip off reads from 'left' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_left} + --bamutils_clip_half_udg_right Specify the number of bases to clip off reads from 'right' end of read for UDG half libaries. Default: ${params.bamutils_clip_half_udg_right} + --bamutils_clip_none_udg_left Specify the number of bases to clip off reads from 'left' end of read for non-UDG libaries. Default: ${params.bamutils_clip_none_udg_left} + --bamutils_clip_none_udg_right Specify the number of bases to clip off reads from 'right' end of read for non-UDG libaries. Default: ${params.bamutils_clip_none_udg_right} + --bamutils_softclip Turn on using softclip instead of hard masking. Genotyping --run_genotyping Turn on genotyping of BAM files. @@ -2138,8 +2138,8 @@ process bam_trim { script: def softclip = params.bamutils_softclip ? '-c' : '' def size = params.large_ref ? '-c' : '' - def left_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_left}" : "${params.bamutils_clip_non_udg_left}" - def right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_non_udg_right}" + def left_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_left}" : "${params.bamutils_clip_none_udg_left}" + def right_clipping = udg == "half" ? "${params.bamutils_clip_half_udg_right}" : "${params.bamutils_clip_none_udg_right}" """ bam trimBam $bam tmp.bam -L ${left_clipping} -R ${right_clipping} ${softclip} samtools sort -@ ${task.cpus} tmp.bam -o ${libraryid}.trimmed.bam diff --git a/nextflow.config b/nextflow.config index 506494f2c..be19ebcd7 100644 --- a/nextflow.config +++ b/nextflow.config @@ -110,8 +110,8 @@ params { run_trim_bam = false bamutils_clip_half_udg_left = 1 bamutils_clip_half_udg_right = 1 - bamutils_clip_non_udg_left = 1 - bamutils_clip_non_udg_right = 1 + bamutils_clip_none_udg_left = 1 + bamutils_clip_none_udg_right = 1 bamutils_softclip = false //Mapped read stripping from input FASTQ From ba52f2d6bd555da3d43ec7c0165c04f0e9533595 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Tue, 21 Jul 2020 16:16:08 +0200 Subject: [PATCH 11/13] Update output.md --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index b5fc93709..d51c2e301 100644 --- a/docs/output.md +++ b/docs/output.md @@ -672,7 +672,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `qualimap/` - this contains a sub-directory for every sample, which includes a qualimap report and associated raw statistic files. You can open the `.html` file in your internet browser to see the in-depth report (this will be more detailed than in MultiQC). This includes stuff like percent coverage, depth coverage, GC content and so on of your mapped reads. - `damageprofiler/` - this contains sample specific directories containing raw statistics and damage plots from DamageProfiler. The `.pdf` files can be used to visualise C to T miscoding lesions or read length distributions of your mapped reads. All raw statistics used for the PDF plots are contained in the `.txt` files. - `pmdtools/` this contains raw output statistics of pmdtools (estimates of frequencies of substitutions), and BAM files which have been filtered to remove reads that do not have a Post-mortem damage (PMD) score of `--pmdtools_threshold`. The BAM files do not have corresponding index files. -- `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_left` and `--bamutils_clip_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). +- `trimmed_bam/` this contains the BAM files with X number of bases trimmed off as defined with the `--bamutils_clip_half_udg_left`, `--bamutils_clip_half_udg_right`, `--bamutils_clip_none_udg_left`, and `--bamutils_clip_none_udg_right` flags and corresponding index files. You can use these BAM files for downstream analysis such as re-mapping data with more stringent parameters (if you set trimming to remove the most likely places containing damage in the read). - `genotyping/` this contains all the (gzipped) genotyping files produced by your genotyping module. The file suffix will have the genotyping tool name. You will have files corresponding to each of your deduplicated BAM files (except pileupcaller), or any turned-on downstream processes that create BAMs (e.g. trimmed bams or pmd tools). If `--gatk_ug_keep_realign_bam` supplied, this may also contain BAM files from InDel realignment when using GATK 3 and UnifiedGenotyping for variant calling. - `MultiVCFAnalyzer/` this contains all output from MultiVCFAnalyzer, including SNP calling statistics, various SNP table(s) and FASTA alignment files. - `sex_determination/` this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the Sample Name, the Nr of Autosomal SNPs, Nr of SNPs on the X/Y chromosome, the Nr of reads mapping to the Autosomes, the Nr of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per bam. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. From de6e686c9ee13209cf8ffc232f94adc816c71aa3 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Tue, 21 Jul 2020 16:21:47 +0200 Subject: [PATCH 12/13] Correct name of PMDTOOLS process --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d007d0742..f9beabd31 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,7 +128,7 @@ jobs: - name: TRIMBAM Test bamutils works alone run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_trim_bam - - name: TRIMBAM Test PMDtools works alone + - name: PMDTOOLS Test PMDtools works alone run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools - name: GATK 3.5 Download resource files From efdc28fe36fda0dd699de0fb9b0ecf613395a611 Mon Sep 17 00:00:00 2001 From: "Thiseas C. Lamnidis" Date: Thu, 23 Jul 2020 09:11:05 +0200 Subject: [PATCH 13/13] Added nuc_cont multiqc file to output channels --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 1b0c40b3a..a02de0e04 100644 --- a/main.nf +++ b/main.nf @@ -2667,6 +2667,7 @@ process print_nuclear_contamination{ output: file 'nuclear_contamination.txt' + file 'nuclear_contamination_mqc.json' into ch_nuclear_contamination_for_multiqc script: """