From 39b24d8bcd835cce29a7ba97ca1d98ef8c614b37 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:27:49 +0100 Subject: [PATCH 1/6] Add metagenomic filtering --- .github/workflows/ci.yml | 11 ++- assets/multiqc_config.yaml | 3 +- bin/scrape_software_versions.py | 2 + docs/output.md | 1 + environment.yml | 1 + main.nf | 150 ++++++++++++++++++++------------ nextflow.config | 6 +- nextflow_schema.json | 15 ++++ 8 files changed, 127 insertions(+), 62 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a004f3d3..1d09323b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,16 +146,16 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools - name: GENOTYPING_UG AND MULTIVCFANALYZER Test running GATK UnifiedGenotyper and MultiVCFAnalyzer, additional VCFS run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies - name: COMPLEX LANE/LIBRARY MERGING Test running lane and library merging prior to GATK UnifiedGenotyper and running MultiVCFAnalyzer run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer - name: GENOTYPING_UG ON TRIMMED BAM Test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' - name: BAM_INPUT Run the basic pipeline with the bam input profile, skip AdapterRemoval as no convertBam run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval - name: BAM_INPUT Run the basic pipeline with the bam input profile, convert to FASTQ for adapterremoval test and downstream run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --run_convertinputbam @@ -167,6 +167,9 @@ jobs: - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database 
"/home/runner/work/eager/eager/databases/malt/" --malt_sam_output + - name: METAGENOMIC Run the basic pipeline but low-complexity filtered reads going into MALT + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | mkdir -p databases/maltextract diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 7fc6cabd5..c105fcb4e 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -6,7 +6,6 @@ report_comment: > This report has been generated by the nf-core/eager analysis pipeline. For information about how to interpret these results, please see the documentation. - run_modules: - adapterRemoval - bowtie2 @@ -270,4 +269,4 @@ report_section_order: nf-core-eager-summary: order: -1001 -export_plots: true +export_plots: true \ No newline at end of file diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 201df4a58..2c63320d3 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -36,6 +36,7 @@ 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] + 'bbduk':['v_bbduk.txt',r"(\S+)"] } results = OrderedDict() @@ -71,6 +72,7 @@ results['kraken'] = 'N/A' results['maltextract'] = 'N/A' results['eigenstrat_snp_coverage'] = 'N/A' +results['bbduk'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/docs/output.md b/docs/output.md index 7c316e7db..fe8bc019b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -663,6 +663,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. - `nuclear_contamination/` - this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. - `bedtools/` - this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. 
average number of reads covering each position). +- `metagenomic_complexity_filter/` - this contains the output from the removal of low-sequence-complexity reads from the input to metagenomic classification, as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and the run-time log (`*_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, so you must check the `*_bbduk.stats` files for summary statistics of the filtering. - `metagenomic_classification/` - this contains the output for a given metagenomic classifier. - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. diff --git a/environment.yml b/environment.yml index 7204f8de7..248f02c74 100644 --- a/environment.yml +++ b/environment.yml @@ -47,3 +47,4 @@ dependencies: - conda-forge::xopen=0.9.0 - bioconda::bowtie2=2.4.1 - bioconda::eigenstratdatabasetools=1.0.2 + - bioconda::bbmap=38.87 diff --git a/main.nf b/main.nf index daad89f87..25023eae9 100644 --- a/main.nf +++ b/main.nf @@ -193,19 +193,21 @@ def helpMessage() { --contamination_chrom_name [str] The name of the X chromosome in your bam or FASTA header. 'X' for hs37d5, 'chrX' for HG19. Default: '${params.contamination_chrom_name}' Metagenomic Screening - --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads - --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}' - --database [dir] Specify path to classifer database directory. For Kraken2 this can also be a `.tar.gz` of the directory. - --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} - --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} - --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' - --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' - --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} - --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' - --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: Default: ${params.malt_min_support_percent} - --malt_max_queries [num] Specify the maximium number of queries a read can have for MALT. Default: ${params.malt_max_queries} - --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. 
Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' - --malt_sam_output [bool] Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes. + --metagenomic_complexity_filter [bool] Turn on removal of low-sequence-complexity reads with bbduk prior to metagenomic screening. + --metagenomic_complexity_entropy [num] Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1. Default: '${params.metagenomic_complexity_entropy}' + --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads + --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.metagenomic_tool}' + --database [dir] Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory. + --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} + --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} + --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' + --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' + --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} + --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' + --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: ${params.malt_min_support_percent} + --malt_max_queries [num] Specify the maximum number of queries a read can have for MALT. Default: ${params.malt_max_queries} + --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' + --malt_sam_output [bool] Specify to also produce SAM alignment files. Note these include both aligned and unaligned reads and are gzipped. Note this will result in very large file sizes. Metagenomic Authentication --run_maltextract [bool] Turn on MaltExtract for MALT aDNA characteristics authentication @@ -506,6 +508,7 @@ if (params.run_multivcfanalyzer) { } // Metagenomic validation + if (params.run_metagenomic_screening) { if ( params.bam_unmapped_type == "discard" ) { exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'." 
@@ -1079,7 +1082,7 @@ process fastp { """ } else { """ - fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_fastp.json + fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_polyg_fastp.json """ } } @@ -1853,7 +1856,7 @@ process samtools_filter { output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*filtered.bam"), file("*.{bai,csi}") into ch_output_from_filtering - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic,ch_metagenomic_for_skipentropyfilter tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true // Using shell block rather than script because we are playing with awk @@ -2640,36 +2643,36 @@ process genotyping_pileupcaller { """ samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness} """ - } - +} + process eigenstrat_snp_coverage { - label 'mc_tiny' - tag "${strandedness}" - publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode - - when: - params.run_genotyping && params.genotyping_tool == 'pileupcaller' - - input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump() - - output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc - path("*_eigenstrat_coverage.txt") - - script: - /* - The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. - """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json - """ - */ - """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt - parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt - """ - } - + label 'mc_tiny' + tag "${strandedness}" + publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode + + when: + params.run_genotyping && params.genotyping_tool == 'pileupcaller' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input') + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc + path("*_eigenstrat_coverage.txt") + + script: + /* + The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. 
+ """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json + """ + */ + """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt + parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt + """ +} + process genotyping_angsd { label 'mc_small' tag "${samplename}" @@ -2897,22 +2900,57 @@ process print_nuclear_contamination{ /* -- METAGENOMICS-SPECIFIC ADDITIONAL STEPS -- */ ///////////////////////////////////////////////////////// +// Low entropy read filter to reduce input sequences of reads that are highly uninformative, and thus reduce runtime/false positives + +process metagenomic_complexity_filter { + label 'mc_small' + tag "${samplename}" + publishDir "${params.outdir}/metagenomic_complexity_filter/", mode: params.publish_dir_mode + + when: + params.metagenomic_complexity_filter + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(fastq) from ch_bam_filtering_for_metagenomic + + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_lowcomplexityremoved.fq.gz") into ch_lowcomplexityfiltered_for_metagenomic + path("*_bbduk.stats") into ch_metagenomic_complexity_filter_for_multiqc + + script: + """ + bbduk.sh -Xmx${task.memory.toGiga()}g in=${fastq} threads=${task.cpus} entropymask=f entropy=${params.metagenomic_complexity_entropy} out=${fastq}_lowcomplexityremoved.fq.gz 2> ${fastq}_bbduk.stats + """ + +} + +// metagenomic complexity filter bypass + +if ( params.metagenomic_complexity_filter ) { + ch_lowcomplexityfiltered_for_metagenomic + .set{ ch_filtered_for_metagenomic } +} else { + ch_metagenomic_for_skipentropyfilter + .set{ ch_filtered_for_metagenomic } +} + // MALT is a super-fast BLAST replacement typically used for pathogen detection or microbiome profiling against large databases, here using off-target reads from mapping // As we collect all files for a all metagenomic runs, we DO NOT use the normal input/output tuple! 
if (params.metagenomic_tool == 'malt') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_malt} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_malt} - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } else if (params.metagenomic_tool == 'kraken') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_kraken} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_kraken} - ch_bam_filtering_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() } else if ( params.metagenomic_tool == '' ) { - ch_bam_filtering_for_metagenomic_malt = Channel.empty() - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } @@ -2925,7 +2963,7 @@ process malt { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'malt' input: - file fastqs from ch_bam_filtering_for_metagenomic_malt.map { it[7] }.collect() + file fastqs from ch_input_for_metagenomic_malt.map { it[7] }.collect() file db from ch_db_for_malt output: @@ -3043,7 +3081,7 @@ process kraken { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'kraken' input: - path(fastq) from ch_bam_filtering_for_metagenomic_kraken.map { it[7] } + path(fastq) from ch_input_for_metagenomic_kraken.map { it[7] } path(krakendb) from ch_krakendb output: @@ -3165,6 +3203,7 @@ process get_software_versions { pileupCaller --version &> v_sequencetools.txt 2>&1 || true bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true + bbduk.sh | grep 'Last modified' | cut -d' ' -f 3-99 > v_bbduk.txt || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -3198,6 +3237,7 @@ process multiqc { file ('mutnucratio/*') from ch_mtnucratio_for_multiqc.collect().ifEmpty([]) file ('endorspy/*') from ch_endorspy_for_multiqc.collect().ifEmpty([]) file ('multivcfanalyzer/*') from ch_multivcfanalyzer_for_multiqc.collect().ifEmpty([]) + file ('fastp_lowcomplexityfilter/*') from ch_metagenomic_complexity_filter_for_multiqc.collect().ifEmpty([]) file ('malt/*') from ch_malt_for_multiqc.collect().ifEmpty([]) file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([]) file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) @@ -3391,7 +3431,7 @@ def checkHostname() { def extract_data(tsvFile) { Channel.fromPath(tsvFile) .splitCsv(header: true, sep: '\t') - .dump() + .dump(tag:'tsv_extract') .map { row -> def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM'] diff --git a/nextflow.config b/nextflow.config index 684356b7e..90fd2d124 100644 --- a/nextflow.config +++ b/nextflow.config @@ -185,8 +185,12 @@ params { run_nuclear_contamination = false contamination_chrom_name = 'X' // Default to using hs37d5 name - // taxonomic classifer + // taxonomic classifier run_metagenomic_screening = false + + metagenomic_complexity_filter = false + metagenomic_complexity_entropy = 0.3 + metagenomic_tool = '' database = '' metagenomic_min_support_reads = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index 6c14dc9c9..09116597f 100644 --- 
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1291,6 +1291,21 @@ "description": "Options for metagenomic screening of off-target reads.", "default": "", "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence-complexity reads with bbduk prior to metagenomic screening", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" + }, + "metagenomic_complexity_filter_threshold": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify the minimum entropy threshold; a read with entropy _below_ this value will be _removed_ from the FASTQ file that goes into metagenomic screening.\n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, while a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies `bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" + }, "run_metagenomic_screening": { "type": "boolean", "description": "Turn on metagenomic screening module for reference-unmapped reads.", From e20452e09530887f356d75e47ab6f048ec999278 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:34:43 +0100 Subject: [PATCH 2/6] Linting fixes and add tool citation to README --- README.md | 77 ++++++++++++++++++++------------------ nextflow_schema.json | 2 +- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index a85b30644..c071bc98d 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,39 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool nf-core/eager schematic workflow -## Pipeline steps +## Quick Start + +1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) + +2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ + +3. Download the pipeline and test it on a minimal dataset with a single command: + + ```bash + nextflow run nf-core/eager -profile test, + ``` + + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + +4. Start running your own analysis! 
+ + ```bash + nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' + ``` + +5. Once your run has completed successfully, clean up the intermediate files. + + ```bash + nextflow clean -f -k + ``` + +See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. + +**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` + +Modifications to the default pipeline are easily made using various options as described in the documentation. + +## Pipeline Summary ### Default Steps @@ -77,6 +109,7 @@ Additional functionality contained by the pipeline currently includes: #### Metagenomic Screening +* Low-sequence complexity filtering (`BBduk`) * Taxonomic binner with alignment (`MALT`) * Taxonomic binner without alignment (`Kraken2`) * aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) @@ -89,47 +122,6 @@ A graphical overview of suggested routes through the pipeline depending on conte nf-core/eager metro map -## Quick Start - -1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) - -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ - -3. Download the pipeline and test it on a minimal dataset with a single command: - - ```bash - nextflow run nf-core/eager -profile test, - ``` - - > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - -4. Start running your own analysis! - - ```bash - nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' - ``` - -5. Once your run has completed successfully, clean up the intermediate files. - - ```bash - nextflow clean -f -k - ``` - -See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. - -**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` - -Modifications to the default pipeline are easily made using various options -as described in the documentation. - -## Pipeline Summary - -By default, the pipeline currently performs the following: - - - -* Sequencing quality control (`FastQC`) -* Overall pipeline run summaries (`MultiQC`) ## Documentation The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output). @@ -236,6 +228,7 @@ In addition, references of tools and data used in this pipeline are as follows: * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) * **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) +* **BBduk** Brian Bushnell (Unpublished). 
Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/) ## Data References diff --git a/nextflow_schema.json b/nextflow_schema.json index 09116597f..73d3e60ce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1297,7 +1297,7 @@ "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", "fa_icon": "fas fa-filter" }, - "metagenomic_complexity_filter_threshold": { + "metagenomic_complexity_entropy": { "type": "number", "default": 0.3, "description": "Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1.", "minimum": 0, "maximum": 1, From 82a90d89c4c4c08bb9165bdfd3b11201dc95d284 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:36:34 +0100 Subject: [PATCH 3/6] Markdown lint --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index c071bc98d..43935e764 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,6 @@ A graphical overview of suggested routes through the pipeline depending on conte nf-core/eager metro map - ## Documentation The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output). From 2faa5cee13ff990406e4e8b57d5962b4f26f5235 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 12:07:51 +0100 Subject: [PATCH 4/6] Fix scrap_software_versions.py --- bin/scrape_software_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 2c63320d3..047b00dc7 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -35,7 +35,7 @@ 'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "], 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], - 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] + 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"], 'bbduk':['v_bbduk.txt',r"(\S+)"] } From c3e744561a70bc4845753a58998ef14016e3c84e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 22 Dec 2020 16:11:12 +0100 Subject: [PATCH 5/6] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bff232d59..12f2da760 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` +- [#640](https://github.com/nf-core/eager/issues/640) - Added pre-metagenomic screening filtering of low-sequence complexity reads with `bbduk` + ### `Fixed` - Removed leftover old DockerHub push CI commands. From 8cca08d37e304024ff7fb01c6308f77f39351c45 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 23 Dec 2020 16:42:38 +0100 Subject: [PATCH 6/6] Linting --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 262362c5b..6b4c6f17a 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,6 @@ In addition, references of tools and data used in this pipeline are as follows: * **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics, 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) * **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/) - ## Data References This repository uses test data from the following studies:
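For anyone wanting to exercise the new behaviour outside CI, a minimal sketch of a pipeline invocation combining the parameters introduced in this patch series with the existing metagenomic screening options is shown below. It follows the pattern of the CI test added in the first commit; the profile, input TSV, reference FASTA, and database path are placeholders rather than values taken from the patch, and the entropy value simply mirrors the default set in `nextflow.config`:

```bash
# Illustrative only: screen unmapped reads with MALT, first discarding
# low-complexity reads with bbduk (reads with entropy below 0.3 are removed).
nextflow run nf-core/eager -profile docker \
    --input 'samples.tsv' \
    --fasta 'reference.fasta' \
    --run_bam_filtering --bam_unmapped_type 'fastq' \
    --run_metagenomic_screening --metagenomic_tool 'malt' \
    --database '/path/to/malt/db/' \
    --metagenomic_complexity_filter \
    --metagenomic_complexity_entropy 0.3
```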
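The same filtering step can also be reproduced on a single unmapped-read FASTQ outside Nextflow, which may help when reviewing the new `metagenomic_complexity_filter` process. The sketch below mirrors the command in that process; the memory value, thread count, and file names are placeholders standing in for the `${task}` and `${fastq}` variables resolved at runtime:

```bash
# Sketch of the bbduk call made by the new process: entropymask=f discards
# (rather than masks) reads whose entropy falls below the threshold; bbduk
# writes its summary statistics to stderr, captured in the *_bbduk.stats file.
bbduk.sh -Xmx4g \
    in=sample.unmapped.fastq.gz \
    threads=4 \
    entropymask=f \
    entropy=0.3 \
    out=sample.unmapped.fastq.gz_lowcomplexityremoved.fq.gz \
    2> sample.unmapped.fastq.gz_bbduk.stats
```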