Merge pull request #707 from nf-core/dev

Dev -> Master for 3.4 release
nf-core · Oct 5, 2021 · 964425e · 964425e
2 parents 8094c42 + 9fa4cdd
commit 964425e
Show file tree

Hide file tree

Showing 169 changed files with 2,945 additions and 1,988 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -48,14 +48,14 @@ jobs:
       matrix:
         parameters:
           - "--skip_qc"
-          - "--remove_ribo_rna --skip_qualimap"
           - "--skip_trimming"
           - "--gtf false"
           - "--star_index false"
           - "--transcript_fasta false"
           - "--min_mapped_reads 90"
           - "--with_umi"
           - "--with_umi --skip_trimming"
+          - "--remove_ribo_rna --skip_qualimap"
           - "--bam_csi_index"
           - "--save_align_intermeds --save_reference"
           - "--featurecounts_group_type false"

diff --git a/.nf-core.yml b/.nf-core.yml
@@ -5,15 +5,15 @@ lint:
     - assets/email_template.txt
     - lib/NfcoreTemplate.groovy
     - assets/multiqc_config.yaml
+  files_exist:
+    - bin/scrape_software_versions.py
+    - modules/local/get_software_versions.nf
 update:
   nf-core/modules:
-    rseqc/bamstat: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    rseqc/inferexperiment: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    rseqc/innerdistance: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    rseqc/junctionannotation: "5dd049047d01e72c01a519422f17e203bca343ac"
-    rseqc/junctionsaturation: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    rseqc/readdistribution: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    rseqc/readduplication: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    sortmerna: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    star/align: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
-    star/genomegenerate: "e937c7950af70930d1f34bb961403d9d2aa81c7d"
+    rseqc/bamstat: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/inferexperiment: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/innerdistance: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/junctionannotation: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/junctionsaturation: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/readdistribution: "49da8642876ae4d91128168cd0db4f1c858d7792"
+    rseqc/readduplication: "49da8642876ae4d91128168cd0db4f1c858d7792"
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,12 +3,56 @@
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [[3.4](https://github.com/nf-core/rnaseq/releases/tag/3.4)] - 2021-10-05
+
+### Enhancements & fixes
+
+* Software version(s) will now be reported for every module imported during a given pipeline execution
+* Added `python3` shebang to appropriate scripts in `bin/` directory
+* [[#407](https://github.com/nf-core/rnaseq/issues/407)] - Filter mouse reads from PDX samples
+* [[#570](https://github.com/nf-core/rnaseq/issues/570)] - Update SortMeRNA to use SilvaDB 138 (for commercial use)
+* [[#690](https://github.com/nf-core/rnaseq/issues/690)] - Error with post-trimmed read 2 sample names from FastQC in MultiQC
+* [[#693](https://github.com/nf-core/rnaseq/issues/693)] - Cutadapt version missing from MultiQC report
+* [[#697](https://github.com/nf-core/rnaseq/issues/697)] - pipeline_report.{txt,html} missing from pipeline_info directory
+* [[#705](https://github.com/nf-core/rnaseq/issues/705)] - Sample sheet error check false positive
+
+### Parameters
+
+| Old parameter               | New parameter                  |
+|-----------------------------|--------------------------------|
+|                             | `--bbsplit_fasta_list`         |
+|                             | `--bbsplit_index`              |
+|                             | `--save_bbsplit_reads`         |
+|                             | `--skip_bbsplit`               |
+
+> **NB:** Parameter has been __updated__ if both old and new parameter information is present.
+> **NB:** Parameter has been __added__ if just the new parameter information is present.
+> **NB:** Parameter has been __removed__ if new parameter information isn't present.
+
+### Software dependencies
+
+Note, since the pipeline is now using Nextflow DSL2, each process will be run with its own [Biocontainer](https://biocontainers.pro/#/registry). This means that on occasion it is entirely possible for the pipeline to be using different versions of the same tool. However, the overall software dependency changes compared to the last release have been listed below for reference.
+
+| Dependency    | Old version | New version |
+|---------------|-------------|-------------|
+| `bbmap`       |             | 38.93       |
+| `hisat2`      | 2.2.0       | 2.2.1       |
+| `picard`      | 2.23.9      | 2.25.7      |
+| `salmon`      | 1.4.0       | 1.5.2       |
+| `samtools`    | 1.12        | 1.13        |
+| `sortmerna`   | 4.2.0       | 4.3.4       |
+| `trim-galore` | 0.6.6       | 0.6.7       |
+
+> **NB:** Dependency has been __updated__ if both old and new version information is present.
+> **NB:** Dependency has been __added__ if just the new version information is present.
+> **NB:** Dependency has been __removed__ if new version information isn't present.
+
 ## [[3.3](https://github.com/nf-core/rnaseq/releases/tag/3.3)] - 2021-07-29
 
 ### Enhancements & fixes
 
 * Updated pipeline template to [nf-core/tools 2.1](https://github.com/nf-core/tools/releases/tag/2.1)
-* [[#556](https://github.com/nf-core/rnaseq/issues/556)] - Genome index isn't recreated with --additional_fasta unless --star_index false
+* [[#556](https://github.com/nf-core/rnaseq/issues/556)] - Genome index is not recreated with --additional_fasta unless --star_index false
 * [[#668](https://github.com/nf-core/rnaseq/issues/668)] - Salmon quant with UMI-tools does not work
 * [[#674](https://github.com/nf-core/rnaseq/issues/674)] - Launch pipeline regex fails
 

diff --git a/CITATIONS.md b/CITATIONS.md
@@ -10,6 +10,8 @@
 
 ## Pipeline tools
 
+* [BBMap](https://sourceforge.net/projects/bbmap/)
+
 * [BEDTools](https://pubmed.ncbi.nlm.nih.gov/20110278/)
     > Quinlan AR, Hall IM. BEDTools: a flexible suite of utilities for comparing genomic features. Bioinformatics. 2010 Mar 15;26(6):841-2. doi: 10.1093/bioinformatics/btq033. Epub 2010 Jan 28. PubMed PMID: 20110278; PubMed Central PMCID: PMC2832824.
 

diff --git a/README.md b/README.md
@@ -30,24 +30,25 @@ The SRA download functionality has been removed from the pipeline (`>=3.2`) and
 2. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
 3. UMI extraction ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools))
 4. Adapter and quality trimming ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/))
-5. Removal of ribosomal RNA ([`SortMeRNA`](https://github.com/biocore/sortmerna))
-6. Choice of multiple alignment and quantification routes:
+5. Removal of genome contaminants ([`BBSplit`](http://seqanswers.com/forums/showthread.php?t=41288))
+6. Removal of ribosomal RNA ([`SortMeRNA`](https://github.com/biocore/sortmerna))
+7. Choice of multiple alignment and quantification routes:
     1. [`STAR`](https://github.com/alexdobin/STAR) -> [`Salmon`](https://combine-lab.github.io/salmon/)
     2. [`STAR`](https://github.com/alexdobin/STAR) -> [`RSEM`](https://github.com/deweylab/RSEM)
     3. [`HiSAT2`](https://ccb.jhu.edu/software/hisat2/index.shtml) -> **NO QUANTIFICATION**
-7. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
-8. UMI-based deduplication ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools))
-9. Duplicate read marking ([`picard MarkDuplicates`](https://broadinstitute.github.io/picard/))
-10. Transcript assembly and quantification ([`StringTie`](https://ccb.jhu.edu/software/stringtie/))
-11. Create bigWig coverage files ([`BEDTools`](https://github.com/arq5x/bedtools2/), [`bedGraphToBigWig`](http://hgdownload.soe.ucsc.edu/admin/exe/))
-12. Extensive quality control:
+8. Sort and index alignments ([`SAMtools`](https://sourceforge.net/projects/samtools/files/samtools/))
+9. UMI-based deduplication ([`UMI-tools`](https://github.com/CGATOxford/UMI-tools))
+10. Duplicate read marking ([`picard MarkDuplicates`](https://broadinstitute.github.io/picard/))
+11. Transcript assembly and quantification ([`StringTie`](https://ccb.jhu.edu/software/stringtie/))
+12. Create bigWig coverage files ([`BEDTools`](https://github.com/arq5x/bedtools2/), [`bedGraphToBigWig`](http://hgdownload.soe.ucsc.edu/admin/exe/))
+13. Extensive quality control:
     1. [`RSeQC`](http://rseqc.sourceforge.net/)
     2. [`Qualimap`](http://qualimap.bioinfo.cipf.es/)
     3. [`dupRadar`](https://bioconductor.org/packages/release/bioc/html/dupRadar.html)
     4. [`Preseq`](http://smithlabresearch.org/software/preseq/)
     5. [`DESeq2`](https://bioconductor.org/packages/release/bioc/html/DESeq2.html)
-13. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); *optional*)
-14. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
+14. Pseudo-alignment and quantification ([`Salmon`](https://combine-lab.github.io/salmon/); *optional*)
+15. Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks ([`MultiQC`](http://multiqc.info/), [`R`](https://www.r-project.org/))
 
 > * **NB:** Quantification isn't performed if using `--aligner hisat2` due to the lack of an appropriate option to calculate accurate expression estimates from HISAT2 derived genomic alignments. However, you can use this route if you have a preference for the alignment, QC and other types of downstream analysis compatible with the output of HISAT2.
 > * **NB:** The `--aligner star_rsem` option will require STAR indices built from version 2.7.6a or later. However, in order to support legacy usage of genomes hosted on AWS iGenomes the `--aligner star_salmon` option requires indices built with STAR 2.6.1d or earlier. Please refer to this [issue](https://github.com/nf-core/rnaseq/issues/498) for further details.
@@ -92,7 +93,7 @@ The nf-core/rnaseq pipeline comes with documentation about the pipeline [usage](
 
 These scripts were originally written for use at the [National Genomics Infrastructure](https://ngisweden.scilifelab.se), part of [SciLifeLab](http://www.scilifelab.se/) in Stockholm, Sweden, by Phil Ewels ([@ewels](https://github.com/ewels)) and Rickard Hammarén ([@Hammarn](https://github.com/Hammarn)).
 
-The pipeline was re-written in Nextflow DSL2 by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [The Bioinformatics & Biostatistics Group](https://www.crick.ac.uk/research/science-technology-platforms/bioinformatics-and-biostatistics/) at [The Francis Crick Institute](https://www.crick.ac.uk/), London.
+The pipeline was re-written in Nextflow DSL2 and is primarily maintained by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/).
 
 Many thanks to others who have helped out along the way too, including (but not limited to):
 [@Galithil](https://github.com/Galithil),

diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml
@@ -54,13 +54,12 @@ report_section_order:
 
 # Don't show % Dups in the General Stats table (we have this from Picard)
 table_columns_visible:
-    FastQC:
+    fastqc:
         percent_duplicates: False
 
 extra_fn_clean_exts:
     - '.umi_dedup'
-    - '_2_val_2'
-    - '_val_1'
+    - '_val'
 
 # Customise the module search patterns to speed up execution time
 #  - Skip module sub-tools that we are not interested in

diff --git a/assets/rrna-db-defaults.txt b/assets/rrna-db-defaults.txt
@@ -1,8 +1,8 @@
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/rfam-5.8s-database-id98.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/rfam-5s-database-id98.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-arc-16s-id95.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-arc-23s-id98.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-bac-16s-id90.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-bac-23s-id98.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-euk-18s-id95.fasta
-https://raw.githubusercontent.com/biocore/sortmerna/v4.2.0/data/rRNA_databases/silva-euk-28s-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5.8s-database-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/rfam-5s-database-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-16s-id95.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-arc-23s-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-16s-id90.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-bac-23s-id98.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-18s-id95.fasta
+https://raw.githubusercontent.com/biocore/sortmerna/v4.3.4/data/rRNA_databases/silva-euk-28s-id98.fasta
diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import os
 import sys
@@ -47,7 +47,7 @@ def check_samplesheet(file_in, file_out):
     """
 
     sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
+    with open(file_in, "r", encoding='utf-8-sig') as fin:
 
         ## Check header
         MIN_COLS = 3

diff --git a/bin/fasta2gtf.py b/bin/fasta2gtf.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 """
 Read a custom fasta file and create a custom GTF containing each entry
 """

diff --git a/bin/fastq_dir_to_samplesheet.py b/bin/fastq_dir_to_samplesheet.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 
 import os
 import sys

diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py
diff --git a/conf/base.config b/conf/base.config
@@ -47,4 +47,7 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+    withName:CUSTOM_DUMPSOFTWAREVERSIONS {
+        cache = false
+    }
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -22,26 +22,6 @@
 
 params {
     modules {
-        'sra_ids_to_runinfo' {
-            publish_dir     = 'public_data'
-            publish_files   = ['tsv':'runinfo']
-        }
-        'sra_runinfo_to_ftp' {
-            publish_dir     = 'public_data'
-            publish_files   = ['tsv':'runinfo']
-        }
-        'sra_fastq_ftp' {
-            publish_dir     = 'public_data'
-            publish_files   = ['fastq.gz':'', 'md5':'md5']
-            args            = '-C - --max-time 1200'
-        }
-        'sra_to_samplesheet' {
-            publish_dir     = 'public_data'
-            publish_files   = false
-        }
-        'sra_merge_samplesheet' {
-            publish_dir     = 'public_data'
-        }
         'gffread' {
             args            = '--keep-exon-attrs -F -T'
             publish_dir     = 'genome'
@@ -60,6 +40,18 @@ params {
             args            = ''
             publish_files   = ['log':'']
         }
+        'bbsplit_untar' {
+            publish_dir     = 'genome/index'
+        }
+        'bbsplit_index' {
+            args            = 'build=1'
+            publish_dir     = 'genome/index'
+        }
+        'bbmap_bbsplit' {
+            args            = 'build=1 ambiguous2=all maxindel=150000'
+            publish_dir     = 'bbsplit'
+            publish_files   = ['txt':'']
+        }
         'sortmerna' {
             args            = '--num_alignments 1 --fastx -v'
             publish_files   = ['log':'']
@@ -75,6 +67,9 @@ params {
         'star_salmon_quant' {
             publish_dir     = "${params.aligner}"
         }
+        'star_salmon_tx2gene' {
+            publish_dir     = "${params.aligner}"
+        }
         'star_salmon_tximport' {
             publish_dir     = "${params.aligner}"
             publish_by_meta = true
@@ -136,7 +131,12 @@ params {
             suffix          = '.umi_dedup.transcriptome.sorted'
             publish_dir     = "${params.aligner}"
         }
-        'umitools_dedup_transcriptome_sort' {
+        'umitools_dedup_transcriptome_samtools' {
+            suffix          = '.umi_dedup.transcriptome.sorted'
+            publish_files   = ['stats':'samtools_stats', 'flagstat':'samtools_stats', 'idxstats':'samtools_stats']
+            publish_dir     = "${params.aligner}"
+        }
+        'umitools_dedup_transcriptome_samtools_sort_name' {
             args            = '-n'
             suffix          = '.umi_dedup.transcriptome'
             publish_files   = false
@@ -148,6 +148,9 @@ params {
         'salmon_quant' {
             args            = ''
         }
+        'salmon_tx2gene' {
+            args            = ''
+        }
         'salmon_tximport' {
             publish_by_meta = true
         }
@@ -160,7 +163,7 @@ params {
             publish_dir     = "${params.aligner}/preseq"
         }
         'picard_markduplicates' {
-            args            = 'ASSUME_SORTED=true REMOVE_DUPLICATES=false VALIDATION_STRINGENCY=LENIENT TMP_DIR=tmp'
+            args            = '--ASSUME_SORTED true --REMOVE_DUPLICATES false --VALIDATION_STRINGENCY LENIENT --TMP_DIR tmp'
             suffix          = '.markdup.sorted'
             publish_files   = ['bam': '', 'metrics.txt':'picard_metrics']
             publish_dir     = "${params.aligner}"