Merge pull request #576 from maxplanck-ie/develop

prep for release 2.0.0
maxplanck-ie · Mar 2, 2020 · fd1d162 · fd1d162
2 parents 4a21a89 + cc5d42d
commit fd1d162
Show file tree

Hide file tree

Showing 128 changed files with 740,977 additions and 700 deletions.
diff --git a/.azure-pipelines/setup.yml b/.azure-pipelines/setup.yml
@@ -7,7 +7,7 @@ steps:
 #    installOptions: -c conda-forge -c bioconda --quiet
 #    createOptions: -c conda-forge -c bioconda --quiet --yes
 - bash: |
-    conda create -n foo -q --yes --quiet -c conda-forge -c bioconda snakemake fuzzywuzzy mock sphinx sphinx-argparse sphinx_rtd_theme flake8
+    conda create -n foo -q --yes --quiet -c conda-forge -c bioconda snakemake fuzzywuzzy mock sphinx sphinx-argparse sphinx_rtd_theme flake8 coreutils
   displayName: Installing dependencies
 - bash: |
     source activate foo

diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh
diff --git a/.ci_stuff/test_sampleSheet.tsv b/.ci_stuff/test_sampleSheet.tsv
@@ -0,0 +1,5 @@
+name	condition
+sample1	sample1	Control
+sample2	sample2	Control
+sample4	sample4	Treatment
+sample5	sample5	Treatment
diff --git a/README.rst b/README.rst
@@ -28,7 +28,8 @@ Workflows available
 
 - DNA-mapping*
 - ChIP-seq*
-- RNA-seq*
+- mRNA-seq*
+- noncoding-RNA-seq*
 - ATAC-seq*
 - scRNA-seq
 - Hi-C

diff --git a/bin/snakePipes b/bin/snakePipes
@@ -9,6 +9,8 @@ import glob
 import hashlib
 import shutil
 import snakePipes.common_functions as cof
+from snakePipes import __version__
+
 
 
 def parse_arguments():
@@ -66,6 +68,9 @@ def parse_arguments():
     subparsers.add_parser('flushOrganisms',
                           help='Flush all installed organism YAML files. This is only advisable when performing a clean installation.')
 
+    subparsers.add_parser('version',
+                          help='Print the snakePipes version')
+
     baseDir = os.path.dirname(snakePipes.__file__)
     defaults = cof.load_configfile(os.path.join(baseDir, "shared", "defaults.yaml"), False, "defaults")
     configParser = subparsers.add_parser('config',
@@ -268,8 +273,13 @@ def createCondaEnvs(args):
         # Don't actually create the env if either --info is set or it already exists and --force is NOT set
         if not args.info:
             if not os.path.exists(os.path.join(condaDirUse, h)) or args.force:
-                os.makedirs(os.path.join(condaDirUse, h), exist_ok=True)
-                subprocess.check_call(cmd)
+                try:
+                    os.makedirs(os.path.join(condaDirUse, h), exist_ok=True)
+                    subprocess.check_call(cmd)
+                except:
+                    # Ensure an environment is fully removed on error
+                    shutil.rmtree(os.path.join(condaDirUse, h), ignore_errors=False)
+                    sys.exit("There was an error when creating the environments!\n")
 
         # Ignore site-packages
         if args.noSitePackages and not args.info:
@@ -298,6 +308,10 @@ def updateConfig(args):
     cof.write_configfile(os.path.join(baseDir, "shared", "defaults.yaml"), d)
 
 
+def version():
+    print("version {}".format(__version__))
+
+
 def main(args):
     args = parse_arguments().parse_args(args)
     if args.command == 'info':
@@ -308,6 +322,8 @@ def main(args):
         flushOrganisms()
     elif args.command == 'config':
         updateConfig(args)
+    elif args.command == 'version':
+        version()
     else:
         createCondaEnvs(args)
 

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: snakepipes
-  version: 1.3.1
+  version: 2.0.0
 
 source:
   path: ../
@@ -10,7 +10,7 @@ build:
   noarch: python
 
 requirements:
-  build:
+  host:
     - python >=3
   run:
     - python >=3
@@ -19,6 +19,7 @@ requirements:
     - graphviz
     - fuzzywuzzy
     - pyyaml >=5.1
+    - coreutils
 
 test:
   commands:

diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -1,6 +1,21 @@
 snakePipes News
 ===============
 
+snakePipes 2.0.0
+----------------
+
+ * Added a noncoding-RNA-seq workflow and renamed RNA-seq to mRNA-seq for clarity. The noncoding workflow will also quantify protein coding genes, but its primary use is analyzing repeat expression.
+ * In order to use the noncoding-RNA-seq workflow organism YAML files must now include a `rmsk_file` entry.
+ * Fixed STAR on CIFS mounted VFAT file systems (issue #537).
+ * Added mode STARsolo to scRNAseq. This mode is now default.
+ * Added log fold change shrinkage with "apeglm" to DESeq2 basic in the mRNAseq workflow. Two versions of results tables (with and without shrinkage) are now written to the DESeq2 output folder.
+ * Added Genrich as peakCaller option to ChIPseq and ATACseq.
+ * Added HMMRATAC as peakCaller option to ATACseq.
+ * ATAC-seq short bam (filtered for short fragments) is now stored in a separate folder.
+
+.. note::
+   Please be aware that this version requires regeneration of STAR indices!
+
 snakePipes 1.3.2
 ----------------
 

diff --git a/docs/content/advanced_usage.rst b/docs/content/advanced_usage.rst
@@ -37,8 +37,9 @@ All files needed to be modified in order to extend/modify a workflow, are availa
         ├── createIndices
         ├── DNA-mapping
         ├── HiC
+        ├── mRNA-seq
+        ├── noncoding-RNA-seq
         ├── preprocessing
-        ├── RNA-seq
         ├── scRNAseq
         └──WGBS
 
@@ -147,17 +148,17 @@ Therefore in order to change or upgrade a tool version, all you need to do is to
 Modifying or adding new rules to the workflows
 ------------------------------------------------
 
-Modifying or adding new rules to snakePipes workflows is relatively easy. Considering you want to add a new Rscript that performs a downstream analysis on the DESeq2 output in RNA-seq workflow. These would be the steps needed:
+Modifying or adding new rules to snakePipes workflows is relatively easy. Considering you want to add a new Rscript that performs a downstream analysis on the DESeq2 output in mRNA-seq workflow. These would be the steps needed:
 
     * Test the Rscript on command line first, then move it in the ``shared/rscripts`` folder.
 
     * Add a rule that called the Rscript and put it under ``shared/rules`` folder.
 
-    * Add the corresponding ``rule all``, that defines the expected output into ``workflows/RNA-seq/Snakefile``
+    * Add the corresponding ``rule all``, that defines the expected output into ``workflows/mRNA-seq/Snakefile``
 
     * Now, for easy and reproducible execution of the rule, add a ``conda`` directive and point it to the relevant conda env under ``shared/rules/envs``. Since your rule might need a new R package, `search whether it's available <https://anaconda.org/search?q=knitr>`__ in one of the conda channels and add the package name (as indicated in the conda channel) and version under the ``dependencies`` key.
 
-    * Finally, modify the command line wrapper (``workflows/RNA-seq/RNA-seq.py``) to make this new feature available to the users!
+    * Finally, modify the command line wrapper (``workflows/mRNA-seq/mRNA-seq``) to make this new feature available to the users!
 
 
 Using AWS or other cloud platforms
@@ -170,7 +171,7 @@ There is nothing particularly special about performing computations on AWS or ot
  3. Ensure that you install snakePipes on a separate EBS (or equivalent) storage block. We found that a 200GB ``/data`` partition was most convenient. This absolutely must not be the ``/`` partition, as mounting such a persistent image on other instances will result in paths being changed, which result in needing to modify large numbers of files.
  4. It's usually sufficient to use a single large (e.g., ``m5.24xlarge``) compute node, with 100+ cores and a few hundred GB RAM. This allows one to use the ``--local`` option and not have to deal with the hassle of setting up a proper cluster on AWS. Make sure the then set ``-j`` to the number of available cores on the node, so snakePipes can make the most efficient use of the resources (and minimize your bill).
 
-Below is an example of running the RNA-seq pipeline on AWS using the resources outlined above. Note that it's best to store your input/output data on a separate storage block, since its lifetime is likely to be shorter than that of the indices.
+Below is an example of running the mRNA-seq pipeline on AWS using the resources outlined above. Note that it's best to store your input/output data on a separate storage block, since its lifetime is likely to be shorter than that of the indices.
 
 .. code:: bash
 
@@ -213,7 +214,7 @@ Below is an example of running the RNA-seq pipeline on AWS using the resources o
 
     # Update defaults.yaml to use /data/tmp for temporary space
 
-Then a larger instance can be spun up and the `RNA-seq` pipeline run as normal.
+Then a larger instance can be spun up and the `mRNA-seq` pipeline run as normal.
 
 .. code:: bash
 
@@ -222,7 +223,7 @@ Then a larger instance can be spun up and the `RNA-seq` pipeline run as normal.
     chown ec2-user /data
     export PATH=/data/snakePipes/bin:$PATH
     conda activate snakePipes
-    RNA-seq -m alignment -i /data/data -o /data/output --local -j 192 /data/indices/GRCm28.yaml
+    mRNA-seq -m alignment -i /data/data -o /data/output --local -j 192 /data/indices/GRCm28.yaml
 
 Receiving emails upon pipeline completion
 -----------------------------------------

diff --git a/docs/content/images/noncoding-RNA-seq_pipeline.png b/docs/content/images/noncoding-RNA-seq_pipeline.png
diff --git a/docs/content/running_snakePipes.rst b/docs/content/running_snakePipes.rst
@@ -57,7 +57,7 @@ The sample sheet
 ----------------
 
 Most of the workflows allow users to perform grouped operations as an option, for example
-differential expression analysis in RNA-seq workflow, differential binding analysis in
+differential expression analysis in mRNA-seq workflow, differential binding analysis in
 ChIP-Seq workflow, differential open-chromatin analysis in ATAC-seq workflow or merging of
 groups in Hi-C workflow. For all this analysis, snakePipes needs a ``sampleSheet.tsv`` file (file name is not important, but it has to be tab-separated) that contains sample grouping information. In most cases users would want to groups samples by replicates. The format of the file is as follows:
 

diff --git a/docs/content/setting_up.rst b/docs/content/setting_up.rst
@@ -188,7 +188,7 @@ Download premade indices
 For the sake of convenience, we provide premade indices for the following organisms:
 
  - `Human (GRCh38, Gencode release 29) <https://zenodo.org/record/2650763>`__
- - `Mouse (GRCm38/mm10, Gencode release m19) <https://zenodo.org/record/2650854>`__
+ - `Mouse (GRCm38/mm10, Gencode release m19) <https://zenodo.org/record/3629114>`__
  - `Mouse (GRCm37/mm9, Gencode release 1) <https://zenodo.org/record/2650849>`__
  - `Fruit fly (dm6, Ensembl release 94) <https://zenodo.org/record/2650762>`__
 

diff --git a/docs/content/workflows/ATAC-seq.rst b/docs/content/workflows/ATAC-seq.rst
@@ -47,7 +47,7 @@ If the user provides additional columns between 'name' and 'condition' in the sa
 
 The differential binding module utilizes the R package `CSAW <https://bioconductor.org/packages/release/bioc/html/csaw.html>`__ to detect significantly different peaks between two conditions.
 The analysis is performed on a union of peaks from all samples mentioned in the sample sheet. 
-This merged set of regions are provided as an output inside the **CSAW_sampleSheet** folder as the file 'DiffBinding_allregions.bed'. 
+This merged set of regions are provided as an output inside the **CSAW_MACS2_sampleSheet** folder as the file 'DiffBinding_allregions.bed'. 
 All differentially bound regions are available in 'CSAW/DiffBinding_significant.bed' . 
 Two thresholds are applied to produce ``Filtered.results.bed`` : FDR (default ``0.05`` ) as well as absolute log fold change (``1``). These can be specified either in the defaults.yaml dictionary or via commandline parameters '--FDR' and '--LFC'. Additionally, filtered results are split into up to 3 bed files, representing direction change (UP, DOWN, or MIXED).
 
@@ -82,6 +82,8 @@ There is a configuration file in ``snakePipes/workflows/ATACseq/defaults.yaml``:
     maxFragmentSize: 150
     minFragmentSize: 0
     verbose: false
+    ## which peak caller to use
+    peakCaller: 'MACS2'
     # sampleSheet_DB
     sampleSheet:
     # windowSize
@@ -119,7 +121,7 @@ Understanding the outputs
 Assuming a sample sheet is used, the following will be **added** to the working directory::
 
     .
-    ├── CSAW_sampleSheet
+    ├── CSAW_MACS2_sampleSheet
     │   ├── CSAW.log
     │   ├── CSAW.session_info.txt
     │   ├── DiffBinding_allregions.bed
@@ -134,6 +136,14 @@ Assuming a sample sheet is used, the following will be **added** to the working
     │   └── plotFingerprint
     │       ├── plotFingerprint.metrics.txt
     │       └── plotFingerprint.png
+    ├── Genrich
+    │   └── group1.narrowPeak
+    ├── HMMRATAC
+    │   ├── sample1.log
+    │   ├── sample1.model
+    │   ├── sample1_peaks.gappedPeak
+    │   ├── sample1_summits.bed
+    │   └── sample1_training.bed
     ├── MACS2
     │   ├── sample1.filtered.BAM_control_lambda.bdg
     │   ├── sample1.filtered.BAM_peaks.narrowPeak
@@ -151,8 +161,10 @@ Assuming a sample sheet is used, the following will be **added** to the working
         ├── sample1.filtered.BAM_peaks.qc.txt
         └── sample2.filtered.BAM_peaks.qc.txt
 
-Currently the ATAC-seq workflow performs detection of open chromatin regions via `MACS2 <https://github.com/taoliu/MACS>`__, and if a sample sheet is provided, the detection of differential open chromatin sites via `CSAW <https://bioconductor.org/packages/release/bioc/html/csaw.html>`__. There are additionally log files in most of the directories. The various outputs are documented in the CSAW and MACS2 documentation.
-For more information on the contents of the **CSAW_sampleSheet** folder, see section :ref:`diffOpenChrom` .
+Currently the ATAC-seq workflow performs detection of open chromatin regions via `MACS2 <https://github.com/taoliu/MACS>`__ (or `HMMRATAC <https://academic.oup.com/nar/article/47/16/e91/5519166>`__ or `Genrich <https://github.com/jsh58/Genrich>`__, if specified with ``--peakCaller``), and if a sample sheet is provided, the detection of differential open chromatin sites via `CSAW <https://bioconductor.org/packages/release/bioc/html/csaw.html>`__. There are additionally log files in most of the directories. The various outputs are documented in the CSAW and MACS2 documentation.
+For more information on the contents of the **CSAW_MACS2_sampleSheet** folder, see section :ref:`diffOpenChrom` .
+
+* **MACS2** / **HMMRATAC** / **Genrich**: Contains peaks found by the peak caller. The most useful files end in ``.narrowPeak`` or ``.gappedPeak`` and are appropriate for visualization in IGV.
 
 * **MACS2_QC**: contains a number of QC metrics that we find useful, namely :
     * the number of peaks
@@ -161,13 +173,15 @@ For more information on the contents of the **CSAW_sampleSheet** folder, see sec
 
 * **deepTools_ATAC**: contains the output of `plotFingerPrint <https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html>`__, which is a useful QC plot to assess signal enrichment between the ATAC-seq samples.
 
-.. note:: The ``_sampleSheet`` suffix for the ``CSAW_sampleSheet`` is drawn from the name of the sample sheet you use. So if you instead named the sample sheet ``mySampleSheet.txt`` then the folder would be named ``CSAW_mySampleSheet``. This facilitates using multiple sample sheets.
+.. note:: The ``_sampleSheet`` suffix for the ``CSAW_MACS2_sampleSheet`` is drawn from the name of the sample sheet you use. So if you instead named the sample sheet ``mySampleSheet.txt`` then the folder would be named ``CSAW_mySampleSheet``. This facilitates using multiple sample sheets. Similarly, ``_MACS2`` portion will be different if you use HMMRATAC or Genrich for peak calling.
+
+.. note:: The output from Genrich will be peaks called per-group if you specify a sample sheet. This is because Genrich is capable of directly using replicates during peak calling.
 
 
 Where to find final bam files and biwgwigs
 ------------------------------------------
 
-Bam files with the extention filtered.bam are only filtered for PCR duplicates. The final bam files filtered additionally for fragment size and used as direct input to MACS2 are found in the MACS2 folder with the exention ``.short.cleaned.bam``.
+Bam files with the extention filtered.bam are only filtered for PCR duplicates. The final bam files filtered additionally for fragment size and used as direct input to MACS2 are found in the short_bams folder with the exention ``.short.cleaned.bam``.
 Bigwig files calculated from these bam files are found under deepTools_ATAC/bamCompare with the extention ``.filtered.bw``.
 
 

diff --git a/docs/content/workflows/ChIP-seq.rst b/docs/content/workflows/ChIP-seq.rst
@@ -122,6 +122,8 @@ There is a configuration file in ``snakePipes/workflows/ChIP-seq/defaults.yaml``
     ## preconfigured target genomes (mm9,mm10,dm3,...) , see /path/to/snakemake_workflows/shared/organisms/
     ## Value can be also path to your own genome config file!
     genome:
+    ## Which peak caller should be used?
+    peakCaller: 'MACS2'
     ## paired end data?
     pairedEnd: true
     ## Bin size of output files in bigWig format
@@ -165,6 +167,8 @@ The ChIP-seq pipeline will generate additional output as follows::
     │   ├── sample2.filtered.histoneHMM-zinba-emfit.pdf
     │   ├── sample2.filtered.histoneHMM-zinba-params-em.RData
     │   └── sample2.filtered.histoneHMM-zinba-params-em.txt
+    ├── Genrich
+    │   └── sample2.narrowPeak
     └── MACS2
         ├── sample1.filtered.BAM_peaks.narrowPeak
         ├── sample1.filtered.BAM_peaks.qc.txt
@@ -187,7 +191,9 @@ Following up on the DNA-mapping module results (see :doc:`DNA-mapping`), the wor
 
 * **deepTools_ChIP**: Contains output from two of the deepTools modules. The `bamCompare <https://deeptools.readthedocs.io/en/develop/content/tools/bamCompare.html>`__ output contains the input-normalized coverage files for the samples, which is very useful for downstream analysis, such as visualization in IGV and plotting the heatmaps. The `plotFingerPrint <https://deeptools.readthedocs.io/en/develop/content/tools/plotFingerprint.html>`__ output is a useful QC plot to assess signal enrichment in the ChIP samples.
 
-* **MACS2**: This folder contains the output of `MACS2 <https://github.com/taoliu/MACS>`__ on the ChIP samples, MACS2 would perform either a **narrow** or **broad** peak calling on the samples, as indicated by the ChIP sample configuration file (see :ref:`ChIPconfig`). The outputs files would contain the respective tags (**narrowPeak** or **broadPeak**).
+* **Genrich**: This folder contains the output of `Genrich <https://github.com/jsh58/Genrich>`__. This will only exist IF you specified ``--peakCaller Genrich`` and you have samples with non-broad peaks. The output is in narrowPeak format, like that from MACS2.
+
+* **MACS2**: This folder contains the output of `MACS2 <https://github.com/taoliu/MACS>`__ on the ChIP samples, MACS2 would perform either a **narrow** or **broad** peak calling on the samples, as indicated by the ChIP sample configuration file (see :ref:`ChIPconfig`). The outputs files would contain the respective tags (**narrowPeak** or **broadPeak**). This folder will only exist if you have non-broad marks and use MACS2 for peak calling
 
 * **histoneHMM**: This folder contains the output of `histoneHMM <https://github.com/matthiasheinig/histoneHMM>`__. This folder will only exist if you have broad marks.
 
@@ -198,6 +204,8 @@ Following up on the DNA-mapping module results (see :doc:`DNA-mapping`), the wor
 
 .. note:: The ``_sampleSheet`` suffix for the ``CSAW_sampleSheet`` is drawn from the name of the sample sheet you use. So if you instead named the sample sheet ``mySampleSheet.txt`` then the folder would be named ``CSAW_mySampleSheet``. This facilitates using multiple sample sheets.
 
+.. note:: At the moment Genrich is NOT jointly calling peaks within a group since it's not aware of which samples contain which antibody. It is utilizing the input control if one exists.
+
 
 Command line options
 --------------------

diff --git a/docs/content/workflows/DNA-mapping.rst b/docs/content/workflows/DNA-mapping.rst
@@ -64,6 +64,11 @@ There is a configuration file in ``snakePipes/workflows/DNA-mapping/defaults.yam
     insertSizeMax: 1000
     alignerOpts:
     plotFormat: png
+    UMIBarcode: False
+    bcPattern: NNNNCCCCCCCC #default: 4 base umi barcode, 8 base cell barcode (eg. RELACS barcode)
+    UMIDedup: False
+    UMIDedupSep: "_"
+    UMIDedupOpts:
     ## Median/mean fragment length, only relevant for single-end data (default: 200)
     fragmentLength: 200
     qualimap: false