From 39b24d8bcd835cce29a7ba97ca1d98ef8c614b37 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:27:49 +0100 Subject: [PATCH 1/6] Add metagenomic filtering --- .github/workflows/ci.yml | 11 ++- assets/multiqc_config.yaml | 3 +- bin/scrape_software_versions.py | 2 + docs/output.md | 1 + environment.yml | 1 + main.nf | 150 ++++++++++++++++++++------------ nextflow.config | 6 +- nextflow_schema.json | 15 ++++ 8 files changed, 127 insertions(+), 62 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9a004f3d3..1d09323b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -146,16 +146,16 @@ jobs: nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_pmdtools - name: GENOTYPING_UG AND MULTIVCFANALYZER Test running GATK UnifiedGenotyper and MultiVCFAnalyzer, additional VCFS run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer --additional_vcf_files 'https://raw.githubusercontent.com/nf-core/test-datasets/eager/testdata/Mammoth/vcf/JK2772_CATCAGTGAGTAGA_L008_R1_001.fastq.gz.tengrand.fq.combined.fq.mapped_rmdup.bam.unifiedgenotyper.vcf.gz' --write_allele_frequencies - name: COMPLEX LANE/LIBRARY MERGING Test running lane and library merging prior to GATK UnifiedGenotyper and running MultiVCFAnalyzer run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_complex,docker --run_genotyping --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' --run_multivcfanalyzer - name: GENOTYPING_UG ON TRIMMED BAM Test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_genotyping --run_trim_bam --genotyping_source 'trimmed' --genotyping_tool 'ug' --gatk_out_mode 'EMIT_ALL_SITES' --gatk_ug_genotype_model 'SNP' - name: BAM_INPUT Run the basic pipeline with the bam input profile, skip AdapterRemoval as no convertBam run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --skip_adapterremoval - name: BAM_INPUT Run the basic pipeline with the bam input profile, convert to FASTQ for adapterremoval test and downstream run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv_bam,docker --run_convertinputbam @@ -167,6 +167,9 @@ jobs: - name: METAGENOMIC Run the basic pipeline but with unmapped reads going into MALT run: | nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database 
"/home/runner/work/eager/eager/databases/malt/" --malt_sam_output + - name: METAGENOMIC Run the basic pipeline but low-complexity filtered reads going into MALT + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test_tsv,docker --run_bam_filtering --bam_unmapped_type 'fastq' --run_metagenomic_screening --metagenomic_tool 'malt' --database "/home/runner/work/eager/eager/databases/malt/" --metagenomic_complexity_filter - name: MALTEXTRACT Download resource files run: | mkdir -p databases/maltextract diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 7fc6cabd5..c105fcb4e 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -6,7 +6,6 @@ report_comment: > This report has been generated by the nf-core/eager analysis pipeline. For information about how to interpret these results, please see the documentation. - run_modules: - adapterRemoval - bowtie2 @@ -270,4 +269,4 @@ report_section_order: nf-core-eager-summary: order: -1001 -export_plots: true +export_plots: true \ No newline at end of file diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 201df4a58..2c63320d3 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -36,6 +36,7 @@ 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] + 'bbduk':['v_bbduk.txt',r"(\S+)"] } results = OrderedDict() @@ -71,6 +72,7 @@ results['kraken'] = 'N/A' results['maltextract'] = 'N/A' results['eigenstrat_snp_coverage'] = 'N/A' +results['bbduk'] = 'N/A' # Search each file using its regex for k, v in regexes.items(): diff --git a/docs/output.md b/docs/output.md index 7c316e7db..fe8bc019b 100644 --- a/docs/output.md +++ b/docs/output.md @@ -663,6 +663,7 @@ Each module has it's own output directory which sit alongside the `MultiQC/` dir - `sex_determination/` - this contains the output for the sex determination run. This is a single `.tsv` file that includes a table with the sample name, the number of autosomal SNPs, number of SNPs on the X/Y chromosome, the number of reads mapping to the autosomes, the number of reads mapping to the X/Y chromosome, the relative coverage on the X/Y chromosomes, and the standard error associated with the relative coverages. These measures are provided for each bam file, one row per file. If the `sexdeterrmine_bedfile` option has not been provided, the error bars cannot be trusted, and runtime will be considerably longer. - `nuclear_contamination/` - this contains the output of the nuclear contamination processes. The directory contains one `*.X.contamination.out` file per individual, as well as `nuclear_contamination.txt` which is a summary table of the results for all individual. `nuclear_contamination.txt` contains a header, followed by one line per individual, comprised of the Method of Moments (MOM) and Maximum Likelihood (ML) contamination estimate (with their respective standard errors) for both Method1 and Method2. - `bedtools/` - this contains two files as the output from bedtools coverage. One file contains the 'breadth' coverage (`*.breadth.gz`). This file will have the contents of your annotation file (e.g. BED/GFF), and the following subsequent columns: no. reads on feature, # bases at depth, length of feature, and % of feature. The second file (`*.depth.gz`), contains the contents of your annotation file (e.g. BED/GFF), and an additional column which is mean depth coverage (i.e. 
average number of reads covering each position). +- `metagenomic_complexity_filter/` - this contains the output from the removal of low-sequence-complexity reads from the input to metagenomic classification, as performed by `bbduk`. This will include the filtered FASTQ files (`*_lowcomplexityremoved.fq.gz`) and the run-time log (`*_bbduk.stats`) for each sample. **Note:** there are no sections in the MultiQC report for this module, so you must check the `*_bbduk.stats` files for summary statistics of the filtering. - `metagenomic_classification/` - this contains the output for a given metagenomic classifier. - Running MALT will contain RMA6 files that can be loaded into MEGAN6 or MaltExtract for phylogenetic visualisation of read taxonomic assignments and aDNA characteristics respectively. Additional a `malt.log` file is provided which gives additional information such as run-time, memory usage and per-sample statistics of numbers of alignments with taxonomic assignment etc. This will also include gzip SAM files if requested. - Running kraken will contain the Kraken output and report files, as well as a merged Taxon count table. diff --git a/environment.yml b/environment.yml index 7204f8de7..248f02c74 100644 --- a/environment.yml +++ b/environment.yml @@ -47,3 +47,4 @@ dependencies: - conda-forge::xopen=0.9.0 - bioconda::bowtie2=2.4.1 - bioconda::eigenstratdatabasetools=1.0.2 + - bioconda::bbmap=38.87 diff --git a/main.nf b/main.nf index daad89f87..25023eae9 100644 --- a/main.nf +++ b/main.nf @@ -193,19 +193,21 @@ def helpMessage() { --contamination_chrom_name [str] The name of the X chromosome in your bam or FASTA header. 'X' for hs37d5, 'chrX' for HG19. Default: '${params.contamination_chrom_name}' Metagenomic Screening - --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads - --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.contamination_chrom_name}' - --database [dir] Specify path to classifer database directory. For Kraken2 this can also be a `.tar.gz` of the directory. - --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} - --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} - --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' - --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' - --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} - --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' - --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: Default: ${params.malt_min_support_percent} - --malt_max_queries [num] Specify the maximium number of queries a read can have for MALT. Default: ${params.malt_max_queries} - --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. 
Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' - --malt_sam_output [bool] Specify to also produce SAM alignment files. Note this includes both aligned and unaligned reads, and are gzipped. Note this will result in very large file sizes. + --metagenomic_complexity_filter [bool] Turn on removal of low-sequence-complexity reads with bbduk prior to metagenomic screening. + --metagenomic_complexity_entropy [num] Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1. Default: '${params.metagenomic_complexity_entropy}' + --run_metagenomic_screening [bool] Turn on metagenomic screening module for reference-unmapped reads + --metagenomic_tool [str] Specify which classifier to use. Options: 'malt', 'kraken'. Default: '${params.metagenomic_tool}' + --database [dir] Specify path to classifier database directory. For Kraken2 this can also be a `.tar.gz` of the directory. + --metagenomic_min_support_reads [num] Specify a minimum number of reads a taxon of sample total is required to have to be retained. Not compatible with . Default: ${params.metagenomic_min_support_reads} + --percent_identity [num] Percent identity value threshold for MALT. Default: ${params.percent_identity} + --malt_mode [str] Specify which alignment method to use for MALT. Options: 'Unknown', 'BlastN', 'BlastP', 'BlastX', 'Classifier'. Default: '${params.malt_mode}' + --malt_alignment_mode [str] Specify alignment method for MALT. Options: 'Local', 'SemiGlobal'. Default: '${params.malt_alignment_mode}' + --malt_top_percent [num] Specify the percent for LCA algorithm for MALT (see MEGAN6 CE manual). Default: ${params.malt_top_percent} + --malt_min_support_mode [str] Specify whether to use percent or raw number of reads for minimum support required for taxon to be retained for MALT. Options: 'percent', 'reads'. Default: '${params.malt_min_support_mode}' + --malt_min_support_percent [num] Specify the minimum percentage of reads a taxon of sample total is required to have to be retained for MALT. Default: ${params.malt_min_support_percent} + --malt_max_queries [num] Specify the maximum number of queries a read can have for MALT. Default: ${params.malt_max_queries} + --malt_memory_mode [str] Specify the memory load method. Do not use 'map' with GPFS file systems for MALT as can be very slow. Options: 'load', 'page', 'map'. Default: '${params.malt_memory_mode}' + --malt_sam_output [bool] Specify to also produce SAM alignment files. Note these include both aligned and unaligned reads and are gzipped. Note this will result in very large file sizes. Metagenomic Authentication --run_maltextract [bool] Turn on MaltExtract for MALT aDNA characteristics authentication @@ -506,6 +508,7 @@ if (params.run_multivcfanalyzer) { } // Metagenomic validation + if (params.run_metagenomic_screening) { if ( params.bam_unmapped_type == "discard" ) { exit 1, "[nf-core/eager] error: metagenomic classification can only run on unmapped reads. Please supply --bam_unmapped_type 'fastq'. Supplied: --bam_unmapped_type '${params.bam_unmapped_type}'." 
@@ -1079,7 +1082,7 @@ process fastp { """ } else { """ - fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_fastp.json + fastp --in1 ${r1} --in2 ${r2} --out1 "${r1.baseName}.pG.fq.gz" --out2 "${r2.baseName}.pG.fq.gz" -A -g --poly_g_min_len "${params.complexity_filter_poly_g_min}" -Q -L -w ${task.cpus} --json "${libraryid}"_L${lane}_polyg_fastp.json """ } } @@ -1853,7 +1856,7 @@ process samtools_filter { output: tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*filtered.bam"), file("*.{bai,csi}") into ch_output_from_filtering - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.fastq.gz") optional true into ch_bam_filtering_for_metagenomic,ch_metagenomic_for_skipentropyfilter tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, file("*.unmapped.bam") optional true // Using shell block rather than script because we are playing with awk @@ -2640,36 +2643,36 @@ process genotyping_pileupcaller { """ samtools mpileup -B -q 30 -Q 30 ${use_bed} -f ${fasta} ${bam_list} | pileupCaller ${caller} ${ssmode} ${transitions_mode} --sampleNames ${sample_names} ${use_snp} -e pileupcaller.${strandedness} """ - } - +} + process eigenstrat_snp_coverage { - label 'mc_tiny' - tag "${strandedness}" - publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode - - when: - params.run_genotyping && params.genotyping_tool == 'pileupcaller' - - input: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump() - - output: - tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc - path("*_eigenstrat_coverage.txt") - - script: - /* - The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. - """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json - """ - */ - """ - eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt - parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt - """ - } - + label 'mc_tiny' + tag "${strandedness}" + publishDir "${params.outdir}/genotyping", mode: params.publish_dir_mode + + when: + params.run_genotyping && params.genotyping_tool == 'pileupcaller' + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*") from ch_for_eigenstrat_snp_coverage.dump(tag:'eigenstrat_input') + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*.json") into ch_eigenstrat_snp_cov_for_multiqc + path("*_eigenstrat_coverage.txt") + + script: + /* + The following code block can be swapped in once the eigenstratdatabasetools MultiQC module becomes available. 
+ """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt -j ${strandedness}_eigenstrat_coverage_mqc.json + """ + */ + """ + eigenstrat_snp_coverage -i pileupcaller.${strandedness} -s ".txt" >${strandedness}_eigenstrat_coverage.txt + parse_snp_cov.py ${strandedness}_eigenstrat_coverage.txt + """ +} + process genotyping_angsd { label 'mc_small' tag "${samplename}" @@ -2897,22 +2900,57 @@ process print_nuclear_contamination{ /* -- METAGENOMICS-SPECIFIC ADDITIONAL STEPS -- */ ///////////////////////////////////////////////////////// +// Low entropy read filter to reduce input sequences of reads that are highly uninformative, and thus reduce runtime/false positives + +process metagenomic_complexity_filter { + label 'mc_small' + tag "${samplename}" + publishDir "${params.outdir}/metagenomic_complexity_filter/", mode: params.publish_dir_mode + + when: + params.metagenomic_complexity_filter + + input: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path(fastq) from ch_bam_filtering_for_metagenomic + + + output: + tuple samplename, libraryid, lane, seqtype, organism, strandedness, udg, path("*_lowcomplexityremoved.fq.gz") into ch_lowcomplexityfiltered_for_metagenomic + path("*_bbduk.stats") into ch_metagenomic_complexity_filter_for_multiqc + + script: + """ + bbduk.sh -Xmx${task.memory.toGiga()}g in=${fastq} threads=${task.cpus} entropymask=f entropy=${params.metagenomic_complexity_entropy} out=${fastq}_lowcomplexityremoved.fq.gz 2> ${fastq}_bbduk.stats + """ + +} + +// metagenomic complexity filter bypass + +if ( params.metagenomic_complexity_filter ) { + ch_lowcomplexityfiltered_for_metagenomic + .set{ ch_filtered_for_metagenomic } +} else { + ch_metagenomic_for_skipentropyfilter + .set{ ch_filtered_for_metagenomic } +} + // MALT is a super-fast BLAST replacement typically used for pathogen detection or microbiome profiling against large databases, here using off-target reads from mapping // As we collect all files for a all metagenomic runs, we DO NOT use the normal input/output tuple! 
if (params.metagenomic_tool == 'malt') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_malt} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_malt} - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } else if (params.metagenomic_tool == 'kraken') { - ch_bam_filtering_for_metagenomic - .set {ch_bam_filtering_for_metagenomic_kraken} + ch_filtered_for_metagenomic + .set {ch_input_for_metagenomic_kraken} - ch_bam_filtering_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() } else if ( params.metagenomic_tool == '' ) { - ch_bam_filtering_for_metagenomic_malt = Channel.empty() - ch_bam_filtering_for_metagenomic_kraken = Channel.empty() + ch_input_for_metagenomic_malt = Channel.empty() + ch_input_for_metagenomic_kraken = Channel.empty() } @@ -2925,7 +2963,7 @@ process malt { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'malt' input: - file fastqs from ch_bam_filtering_for_metagenomic_malt.map { it[7] }.collect() + file fastqs from ch_input_for_metagenomic_malt.map { it[7] }.collect() file db from ch_db_for_malt output: @@ -3043,7 +3081,7 @@ process kraken { params.run_metagenomic_screening && params.run_bam_filtering && params.bam_unmapped_type == 'fastq' && params.metagenomic_tool == 'kraken' input: - path(fastq) from ch_bam_filtering_for_metagenomic_kraken.map { it[7] } + path(fastq) from ch_input_for_metagenomic_kraken.map { it[7] } path(krakendb) from ch_krakendb output: @@ -3165,6 +3203,7 @@ process get_software_versions { pileupCaller --version &> v_sequencetools.txt 2>&1 || true bowtie2 --version | grep -a 'bowtie2-.* -fdebug' > v_bowtie2.txt || true eigenstrat_snp_coverage --version | cut -d ' ' -f2 >v_eigenstrat_snp_coverage.txt || true + bbduk.sh | grep 'Last modified' | cut -d' ' -f 3-99 > v_bbduk.txt || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -3198,6 +3237,7 @@ process multiqc { file ('mutnucratio/*') from ch_mtnucratio_for_multiqc.collect().ifEmpty([]) file ('endorspy/*') from ch_endorspy_for_multiqc.collect().ifEmpty([]) file ('multivcfanalyzer/*') from ch_multivcfanalyzer_for_multiqc.collect().ifEmpty([]) + file ('fastp_lowcomplexityfilter/*') from ch_metagenomic_complexity_filter_for_multiqc.collect().ifEmpty([]) file ('malt/*') from ch_malt_for_multiqc.collect().ifEmpty([]) file ('kraken/*') from ch_kraken_for_multiqc.collect().ifEmpty([]) file ('hops/*') from ch_hops_for_multiqc.collect().ifEmpty([]) @@ -3391,7 +3431,7 @@ def checkHostname() { def extract_data(tsvFile) { Channel.fromPath(tsvFile) .splitCsv(header: true, sep: '\t') - .dump() + .dump(tag:'tsv_extract') .map { row -> def expected_keys = ['Sample_Name', 'Library_ID', 'Lane', 'Colour_Chemistry', 'SeqType', 'Organism', 'Strandedness', 'UDG_Treatment', 'R1', 'R2', 'BAM'] diff --git a/nextflow.config b/nextflow.config index 684356b7e..90fd2d124 100644 --- a/nextflow.config +++ b/nextflow.config @@ -185,8 +185,12 @@ params { run_nuclear_contamination = false contamination_chrom_name = 'X' // Default to using hs37d5 name - // taxonomic classifer + // taxonomic classifier run_metagenomic_screening = false + + metagenomic_complexity_filter = false + metagenomic_complexity_entropy = 0.3 + metagenomic_tool = '' database = '' metagenomic_min_support_reads = 1 diff --git a/nextflow_schema.json b/nextflow_schema.json index 6c14dc9c9..09116597f 100644 --- 
a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1291,6 +1291,21 @@ "description": "Options for metagenomic screening of off-target reads.", "default": "", "properties": { + "metagenomic_complexity_filter": { + "type": "boolean", + "description": "Turn on removal of low-sequence-complexity reads with bbduk prior to metagenomic screening", + "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", + "fa_icon": "fas fa-filter" + }, + "metagenomic_complexity_filter_threshold": { + "type": "number", + "default": 0.3, + "description": "Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1.", + "minimum": 0, + "maximum": 1, + "help_text": "Specify the minimum entropy threshold; a read with entropy _below_ this value will be _removed_ from the FASTQ file that goes into metagenomic screening.\n\nA mono-nucleotide read such as GGGGGG will have an entropy of 0, while a completely random sequence has an entropy of almost 1.\n\nSee the `bbduk` [documentation](https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/-filter) on entropy for more information.\n\n> Modifies `bbduk` parameter `entropy=`", + "fa_icon": "fas fa-percent" + }, "run_metagenomic_screening": { "type": "boolean", "description": "Turn on metagenomic screening module for reference-unmapped reads.", From e20452e09530887f356d75e47ab6f048ec999278 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:34:43 +0100 Subject: [PATCH 2/6] Linting fixes and add tool citation to README --- README.md | 77 ++++++++++++++++++++------------------ nextflow_schema.json | 2 +- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index a85b30644..c071bc98d 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,39 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool nf-core/eager schematic workflow -## Pipeline steps +## Quick Start + +1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) + +2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ + +3. Download the pipeline and test it on a minimal dataset with a single command: + + ```bash + nextflow run nf-core/eager -profile test, + ``` + + > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. + +4. Start running your own analysis! 
+ + ```bash + nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' + ``` + +5. Once your run has completed successfully, clean up the intermediate files. + + ```bash + nextflow clean -f -k + ``` + +See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. + +**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` + +Modifications to the default pipeline are easily made using various options as described in the documentation. + +## Pipeline Summary ### Default Steps @@ -77,6 +109,7 @@ Additional functionality contained by the pipeline currently includes: #### Metagenomic Screening +* Low-sequence complexity filtering (`BBduk`) * Taxonomic binner with alignment (`MALT`) * Taxonomic binner without alignment (`Kraken2`) * aDNA characteristic screening of taxonomically binned data from MALT (`MaltExtract`) @@ -89,47 +122,6 @@ A graphical overview of suggested routes through the pipeline depending on conte nf-core/eager metro map -## Quick Start - -1. Install [`nextflow`](https://nf-co.re/usage/installation) (version >= 20.04.0) - -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or [`Podman`](https://podman.io/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ - -3. Download the pipeline and test it on a minimal dataset with a single command: - - ```bash - nextflow run nf-core/eager -profile test, - ``` - - > Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - -4. Start running your own analysis! - - ```bash - nextflow run nf-core/eager -profile --input '*_R{1,2}.fastq.gz' --fasta '.fasta' - ``` - -5. Once your run has completed successfully, clean up the intermediate files. - - ```bash - nextflow clean -f -k - ``` - -See [usage docs](https://nf-co.re/eager/docs/usage.md) for all of the available options when running the pipeline. - -**N.B.** You can see an overview of the run in the MultiQC report located at `./results/MultiQC/multiqc_report.html` - -Modifications to the default pipeline are easily made using various options -as described in the documentation. - -## Pipeline Summary - -By default, the pipeline currently performs the following: - - - -* Sequencing quality control (`FastQC`) -* Overall pipeline run summaries (`MultiQC`) ## Documentation The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output). @@ -236,6 +228,7 @@ In addition, references of tools and data used in this pipeline are as follows: * **Bowtie2** Langmead, B. and Salzberg, S. L. 2012 Fast gapped-read alignment with Bowtie 2. Nature methods, 9(4), p. 357–359. doi: [10.1038/nmeth.1923](https:/dx.doi.org/10.1038/nmeth.1923). * **sequenceTools** Stephan Schiffels (Unpublished). Download: [https://github.com/stschiff/sequenceTools](https://github.com/stschiff/sequenceTools) * **EigenstratDatabaseTools** Thiseas C. Lamnidis (Unpublished). Download: [https://github.com/TCLamnidis/EigenStratDatabaseTools.git](https://github.com/TCLamnidis/EigenStratDatabaseTools.git) +* **BBduk** Brian Bushnell (Unpublished). 
Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/) ## Data References diff --git a/nextflow_schema.json b/nextflow_schema.json index 09116597f..73d3e60ce 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -1297,7 +1297,7 @@ "help_text": "Turns on low-sequence complexity filtering of off-target reads using `bbduk`.\n\nThis is typically performed to reduce the number of uninformative reads or potential false-positive reads, typically for input for metagenomic screening. This thus reduces false positive species IDs and also run-time and resource requirements.\n\nSee `--metagenomic_complexity_entropy` for how complexity is calculated. **Important** There are no MultiQC output results for this module, you must check the number of reads removed with the `_bbduk.stats` output file.\n\nDefault: off\n", "fa_icon": "fas fa-filter" }, - "metagenomic_complexity_filter_threshold": { + "metagenomic_complexity_entropy": { "type": "number", "default": 0.3, "description": "Specify the entropy threshold below which a sequencing read will be filtered out as low complexity. This should be between 0 and 1.", "minimum": 0, "maximum": 1, From 82a90d89c4c4c08bb9165bdfd3b11201dc95d284 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 11:36:34 +0100 Subject: [PATCH 3/6] Markdown lint --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index c071bc98d..43935e764 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,6 @@ A graphical overview of suggested routes through the pipeline depending on conte nf-core/eager metro map - ## Documentation The nf-core/eager pipeline comes with documentation about the pipeline: [usage](https://nf-co.re/eager/usage) and [output](https://nf-co.re/eager/output). From 2faa5cee13ff990406e4e8b57d5962b4f26f5235 Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 22 Dec 2020 12:07:51 +0100 Subject: [PATCH 4/6] Fix scrap_software_versions.py --- bin/scrape_software_versions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index 2c63320d3..047b00dc7 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -35,7 +35,7 @@ 'VCF2genome':['v_vcf2genome.txt', r"VCF2Genome \(v. ([0-9].[0-9]+) "], 'endorS.py':['v_endorSpy.txt', r"endorS.py (\S+)"], 'kraken':['v_kraken.txt', r"Kraken version (\S+)"], - 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"] + 'eigenstrat_snp_coverage':['v_eigenstrat_snp_coverage.txt',r"(\S+)"], 'bbduk':['v_bbduk.txt',r"(\S+)"] } From c3e744561a70bc4845753a58998ef14016e3c84e Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 22 Dec 2020 16:11:12 +0100 Subject: [PATCH 5/6] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bff232d59..12f2da760 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### `Added` +- [#640](https://github.com/nf-core/eager/issues/640) - Added pre-metagenomic screening filtering of low-sequence complexity reads with `bbduk` + ### `Fixed` - Removed leftover old DockerHub push CI commands. From 8cca08d37e304024ff7fb01c6308f77f39351c45 Mon Sep 17 00:00:00 2001 From: "James A. 
Fellows Yates" Date: Wed, 23 Dec 2020 16:42:38 +0100 Subject: [PATCH 6/6] Linting --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 262362c5b..6b4c6f17a 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,6 @@ In addition, references of tools and data used in this pipeline are as follows: * **mapDamage2** Jónsson, H., et al 2013. mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters. Bioinformatics, 29(13), 1682–1684. [https://doi.org/10.1093/bioinformatics/btt193](https://doi.org/10.1093/bioinformatics/btt193) * **BBduk** Brian Bushnell (Unpublished). Download: [https://sourceforge.net/projects/bbmap/](https://sourceforge.net/projects/bbmap/) - ## Data References This repository uses test data from the following studies:
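For anyone wanting to exercise the new behaviour outside CI, a minimal sketch of a pipeline invocation combining the parameters introduced in this patch series with the existing metagenomic screening options is shown below. It follows the pattern of the CI test added in the first commit; the profile, input TSV, reference FASTA, and database path are placeholders rather than values taken from the patch, and the entropy value simply mirrors the default set in `nextflow.config`:

```bash
# Illustrative only: screen unmapped reads with MALT, first discarding
# low-complexity reads with bbduk (reads with entropy below 0.3 are removed).
nextflow run nf-core/eager -profile docker \
    --input 'samples.tsv' \
    --fasta 'reference.fasta' \
    --run_bam_filtering --bam_unmapped_type 'fastq' \
    --run_metagenomic_screening --metagenomic_tool 'malt' \
    --database '/path/to/malt/db/' \
    --metagenomic_complexity_filter \
    --metagenomic_complexity_entropy 0.3
```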
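The same filtering step can also be reproduced on a single unmapped-read FASTQ outside Nextflow, which may help when reviewing the new `metagenomic_complexity_filter` process. The sketch below mirrors the command in that process; the memory value, thread count, and file names are placeholders standing in for the `${task}` and `${fastq}` variables resolved at runtime:

```bash
# Sketch of the bbduk call made by the new process: entropymask=f discards
# (rather than masks) reads whose entropy falls below the threshold; bbduk
# writes its summary statistics to stderr, captured in the *_bbduk.stats file.
bbduk.sh -Xmx4g \
    in=sample.unmapped.fastq.gz \
    threads=4 \
    entropymask=f \
    entropy=0.3 \
    out=sample.unmapped.fastq.gz_lowcomplexityremoved.fq.gz \
    2> sample.unmapped.fastq.gz_bbduk.stats
```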