Merge pull request #366 from maxplanck-ie/develop

Develop
maxplanck-ie · Dec 7, 2018 · 898986b · 898986b
2 parents 584695c + c84c72d
commit 898986b
Show file tree

Hide file tree

Showing 33 changed files with 103 additions and 33 deletions.
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: snakepipes
-  version: 1.1.1
+  version: 1.1.2
 
 source:
   path: ../

diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -6,7 +6,8 @@ snakePipes 1.1.2
 
  * A number of minor bug fixes and enhancements in the HiC and WGBS pipelines
  * The RNA-seq pipeline now uses samtools for sorting. This should avoid issues with STAR running out of memory during the output sorting step.
- * Increased the memory allocation for MACS2 to 8GB
+ * Increased the memory allocation for MACS2 to 8GB and bamPEFragmentSize to 3G
+ * Fixed the scRNA-seq pipeline, which seems to have been broken in 1.1.1
 
 snakePipes 1.1.1
 ----------------

diff --git a/snakePipes/__init__.py b/snakePipes/__init__.py
@@ -1 +1 @@
-__version__ = '1.1.1'
+__version__ = '1.1.2'
diff --git a/snakePipes/shared/rscripts/CSAW.R b/snakePipes/shared/rscripts/CSAW.R
@@ -1,5 +1,6 @@
 #!/usr/bin/env Rscript
 ## ChIPseq differential binding workflow
+.libPaths(R.home("library"))
 
 sampleInfoFilePath <- snakemake@input[["sampleSheet"]]  #"samplesheet.tab"
 insert_size_metrics <- snakemake@input[["insert_size_metrics"]] # bamPEFragmentSize output

diff --git a/snakePipes/shared/rscripts/DB_functions.R b/snakePipes/shared/rscripts/DB_functions.R
@@ -13,6 +13,9 @@
 #' @examples
 #' readfiles_chip(csvFile = "testBAMs/testSampleSheet.csv", refAllele = "pat")
 #'
+
+.libPaths(R.home("library"))
+
 readfiles_chip <- function(sampleSheet, fragment_length, window_size, alleleSpecific = FALSE, pe.param){
 
     # check that not >2 conditions are given

diff --git a/snakePipes/shared/rscripts/DESeq2.R b/snakePipes/shared/rscripts/DESeq2.R
@@ -10,6 +10,8 @@
 # args 6 : T/F whether or not the workflow is allele-specific
 # args 7 : tx2gene file for salmon --> DESeq mode
 
+.libPaths(R.home("library"))
+
 args = commandArgs(TRUE)
 
 

diff --git a/snakePipes/shared/rscripts/DESeq2Report.Rmd b/snakePipes/shared/rscripts/DESeq2Report.Rmd
@@ -26,6 +26,7 @@ params:
 
 ```{r setup}
 #### Libraries needed
+.libPaths(R.home('library'))
 
 ## Bioconductor
 library('DESeq2')
@@ -458,16 +459,16 @@ plotCounts_gg <- function(i, dds, intgroup) {
     ## Change in version 1.15.3
     ## It might not be necessary to have any of this if else, but I'm not
     ## sure that plotCounts(returnData) will always return the 'group' variable.
-    if('group' %in% colnames(data)) {
+    if('condition' %in% colnames(data)) {
         data$group <- group
     } else {
         data <- cbind(data, data.frame('group' = group))
     }
 
-    ggplot(data, aes(x = group, y = count)) + geom_point() + ylab('Normalized count') + ggtitle(i) + coord_trans(y = "log10") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
+    ggplot(data, aes(x = group, y = count)) + geom_jitter(width=0.2) + ylab('Normalized count') + ggtitle(i) + coord_trans(y = "log10") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
 }
 for(i in head(features, nBestFeatures)) {
-    print(plotCounts_gg(i, dds = dds, intgroup = intgroup))
+    print(plotCounts_gg(i, dds = dds, intgroup = intgroup[2:length(intgroup)]))
 }
 ```
 

diff --git a/snakePipes/shared/rscripts/DE_functions.R b/snakePipes/shared/rscripts/DE_functions.R
@@ -15,6 +15,8 @@
 #' @examples
 #'
 
+.libPaths(R.home("library"))
+
 checktable <- function(countdata = NA, sampleSheet = NA, alleleSpecific = FALSE, salmon_dir = NA, tx2gene_annot = NA) {
 
   ## check whether colnames are allele-specific

diff --git a/snakePipes/shared/rscripts/WGBS_QC_report_template.Rmd b/snakePipes/shared/rscripts/WGBS_QC_report_template.Rmd
@@ -9,6 +9,8 @@ params:
 ---
 
 ```{r setup, include=FALSE}
+.libPaths(R.home("library"))
+
 knitr::opts_chunk$set(echo = FALSE)
 ```
 

diff --git a/snakePipes/shared/rscripts/WGBS_stats_report_template.Rmd b/snakePipes/shared/rscripts/WGBS_stats_report_template.Rmd
@@ -14,6 +14,8 @@ title: "`r paste0('WGBS ',params$stat_category,' stats report')`"
 ---
 
 ```{r setup, include=FALSE}
+.libPaths(R.home("library"))
+
 knitr::opts_chunk$set(echo = FALSE, out.width = '50%',out.height='50%')
 source(params$input_func)
 ```
@@ -68,4 +70,4 @@ A volcano plot was produced to visualize the effect of threshold application ont
 ```{r, fig.cap=get_fig_cap(params$outdir,"volcano")}
 volcano_plot<-dir(params$outdir,pattern="*volcano.plot.png",full.names=TRUE)
 knitr::include_graphics(volcano_plot)
-```
+```
diff --git a/snakePipes/shared/rscripts/WGBSpipe.POM.filt.R b/snakePipes/shared/rscripts/WGBSpipe.POM.filt.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 #run in R-3.3.1
 #a few lines of code from methylCtools bcall2beta by Hovestadt et al. 2014 were retained 
 #CpG position handling and coverage and Beta calculations by Katarzyna Sikora

diff --git a/snakePipes/shared/rscripts/WGBSpipe.interval_stats.limma.R b/snakePipes/shared/rscripts/WGBSpipe.interval_stats.limma.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 #run in R-3.3.1
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]

diff --git a/snakePipes/shared/rscripts/WGBSpipe.metilene_stats.limma.R b/snakePipes/shared/rscripts/WGBSpipe.metilene_stats.limma.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 #run in R-3.3.1
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]

diff --git a/snakePipes/shared/rscripts/WGBSpipe.prep_data_for_stats.R b/snakePipes/shared/rscripts/WGBSpipe.prep_data_for_stats.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 #run in R-3.3.1
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]

diff --git a/snakePipes/shared/rscripts/WGBSpipe.singleCpGstats.limma.R b/snakePipes/shared/rscripts/WGBSpipe.singleCpGstats.limma.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 #run in R-3.3.1
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]

diff --git a/snakePipes/shared/rscripts/WGBSstats_functions.R b/snakePipes/shared/rscripts/WGBSstats_functions.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 ##to be called by WGBS stats Rscripts
 
 print_sessionInfo<-function(mytext){

diff --git a/snakePipes/shared/rscripts/merge_count_tables.R b/snakePipes/shared/rscripts/merge_count_tables.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 library(tools)
 
 args <- commandArgs(trailingOnly=T)

diff --git a/snakePipes/shared/rscripts/merge_featureCounts.R b/snakePipes/shared/rscripts/merge_featureCounts.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 library(tools)
 
 args <- commandArgs(trailingOnly=T)

diff --git a/snakePipes/shared/rscripts/scRNAseq_cell_filter_monocle.R b/snakePipes/shared/rscripts/scRNAseq_cell_filter_monocle.R
@@ -1,4 +1,7 @@
 #run in R3.4.0
+
+.libPaths(R.home("library"))
+
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]
 #system(paste0('mkdir -p ',wdir)) #for debugging

diff --git a/snakePipes/shared/rscripts/scRNAseq_cell_filter_raceid.R b/snakePipes/shared/rscripts/scRNAseq_cell_filter_raceid.R
@@ -1,4 +1,7 @@
 #run in R3.4.0
+
+.libPaths(R.home("library"))
+
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]
 #system(paste0('mkdir -p ',wdir)) #for debugging

diff --git a/snakePipes/shared/rscripts/scRNAseq_monocle_stats_report.Rmd b/snakePipes/shared/rscripts/scRNAseq_monocle_stats_report.Rmd
@@ -18,6 +18,8 @@ This report summarizes statistical analyses of your single cell data stored in `
 The distribution of total transcript counts per cell (TPC) in unnormalized data was plotted.   
 
 ```{r}
+.libPaths(R.home("library"))
+
 knitr::include_graphics(file.path(params$outdir,"Expdata.ColumnSums.png"))
 ```
 

diff --git a/snakePipes/shared/rscripts/scRNAseq_raceid_stats_report.Rmd b/snakePipes/shared/rscripts/scRNAseq_raceid_stats_report.Rmd
@@ -18,6 +18,8 @@ This report summarizes statistical analyses of your single cell data stored in `
 The distribution of total transcript counts per cell (TPC) in unnormalized data was plotted.   
 
 ```{r}
+.libPaths(R.home("library"))
+
 knitr::include_graphics(file.path(params$outdir,"Expdata.ColumnSums.png"))
 ```
 

diff --git a/snakePipes/shared/rscripts/scRNAseq_select_threshold_cluster_monocle.R b/snakePipes/shared/rscripts/scRNAseq_select_threshold_cluster_monocle.R
@@ -1,4 +1,7 @@
 #run in R3.4.0
+
+.libPaths(R.home("library"))
+
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]
 #system(paste0('mkdir -p ',wdir)) #for debugging

diff --git a/snakePipes/shared/rscripts/scRNAseq_select_threshold_cluster_raceid.R b/snakePipes/shared/rscripts/scRNAseq_select_threshold_cluster_raceid.R
@@ -1,4 +1,7 @@
 #run in R3.4.0
+
+.libPaths(R.home("library"))
+
 #set working directory
 wdir<-commandArgs(trailingOnly=TRUE)[1]
 #system(paste0('mkdir -p ',wdir)) #for debugging

diff --git a/snakePipes/shared/rscripts/sleuth.R b/snakePipes/shared/rscripts/sleuth.R
@@ -1,3 +1,5 @@
+.libPaths(R.home("library"))
+
 library("sleuth")
 library("dplyr")
 #library("biomaRt")

diff --git a/snakePipes/shared/rscripts/wasabi.R b/snakePipes/shared/rscripts/wasabi.R
@@ -1,5 +1,7 @@
+.libPaths(R.home("library"))
+
 library(wasabi)
 
 args = commandArgs(trailingOnly=TRUE)
 
-prepare_fish_for_sleuth(args)
+prepare_fish_for_sleuth(args)
diff --git a/snakePipes/shared/rules/TrimGalore.snakefile b/snakePipes/shared/rules/TrimGalore.snakefile
@@ -91,16 +91,31 @@ else:
 
 ### FastQC_on_trimmed #######################################################
 
-rule FastQC_on_trimmed:
-    input:
-        fastq_dir+"/{sample}{read}.fastq.gz"
-    output:
-        "FastQC_trimmed/{sample}{read}_fastqc.html"
-    log:
-        out = "FastQC_trimmed/logs/FastQC_trimmed.{sample}{read}.out",
-        err = "FastQC_trimmed/logs/FastQC_trimmed.{sample}{read}.err"
-    benchmark:
-        "FastQC_trimmed/.benchmark/FastQC_trimmed.{sample}{read}.benchmark"
-    threads: 2
-    conda: CONDA_SHARED_ENV
-    shell: "fastqc -o FastQC_trimmed {input} > {log.out} 2> {log.err}"
+if paired:
+    rule FastQC_on_trimmed:
+        input:
+            fastq_dir+"/{sample}{read}.fastq.gz"
+        output:
+            "FastQC_trimmed/{sample}{read}_fastqc.html"
+        log:
+            out = "FastQC_trimmed/logs/FastQC_trimmed.{sample}{read}.out",
+            err = "FastQC_trimmed/logs/FastQC_trimmed.{sample}{read}.err"
+        benchmark:
+            "FastQC_trimmed/.benchmark/FastQC_trimmed.{sample}{read}.benchmark"
+        threads: 2
+        conda: CONDA_SHARED_ENV
+        shell: "fastqc -o FastQC_trimmed {input} > {log.out} 2> {log.err}"
+else:
+    rule FastQC_on_trimmed_SE:
+        input:
+            fastq_dir+"/{sample}"+reads[0]+".fastq.gz"
+        output:
+            "FastQC_trimmed/{sample}"+reads[0]+"_fastqc.html"
+        log:
+            out = "FastQC_trimmed/logs/FastQC_trimmed.{sample}"+reads[0]+".out",
+            err = "FastQC_trimmed/logs/FastQC_trimmed.{sample}"+reads[0]+".err"
+        benchmark:
+            "FastQC_trimmed/.benchmark/FastQC_trimmed.{sample}"+reads[0]+".benchmark"
+        threads: 2
+        conda: CONDA_SHARED_ENV
+        shell: "fastqc -o FastQC_trimmed {input} > {log.out} 2> {log.err}"  
diff --git a/snakePipes/shared/rules/filter_annotation.snakefile b/snakePipes/shared/rules/filter_annotation.snakefile
@@ -35,8 +35,8 @@ rule create_annotation_bed:
         """ pos=match($0,"transcript_[bio]*type.([^[:space:]]+)",a); if (pos!=0) tt=a[1]; else tt="NA"; """
         """ pos=match($0,"gene_name.([^[:space:]]+)",a);  if (pos!=0) gna=a[1]; else gna=gid; """
         """ pos=match($0,"gene_[bio]*type.([^[:space:]]+)",a); if (pos!=0) gt=a[1]; else gt="NA"; """
-        """ pos=match($0,"(transcript_support_level.[^[:space:]]+)",a); if (pos!=0) tsl=a[1]; else tsl="transcript_support_level NA"; """
-        """ pos=match($0,"[[:space:]](level.[^[:space:]]+)",a); if (pos!=0) lvl=a[1] ; else lvl="level NA"; """
+        """ pos=match($0,"transcript_support_level.([^[:space:]]+)",a); if (pos!=0) tsl=a[1]; else tsl="NA"; """
+        """ pos=match($0,"[[:space:]]level.([^[:space:]]+)",a); if (pos!=0) lvl=a[1] ; else lvl="NA"; """
         """ pos=match($0,"tag.basic"); if (lvl!~"NA"){{if (pos==0) basic="full"; else basic="basic"}} else basic="NA"; """
         """ OFS="\\t"; print tid,tna,tt,gid,gna,gt,"gencode",basic,"transcript_support_level",tsl,"level",lvl}}' | """
         """ sort | uniq | sort -k1,1) | """

diff --git a/snakePipes/shared/rules/multiQC.snakefile b/snakePipes/shared/rules/multiQC.snakefile
@@ -51,11 +51,11 @@ def multiqc_input_check(return_value):
         indir += " ".join(expand("HiC_matrices/QCplots/{sample}_QC ", sample = samples))
     elif pipeline == "scrna-seq":
         if trim:
-            infiles.append( expand("FastQC_trimmed/{sample}_fastqc.html", sample = samples) )
+            infiles.append( expand("FastQC_trimmed/{sample}"+reads[0]+"_fastqc.html", sample = samples) )
             indir += " FastQC_trimmed "
-            infiles.append( expand("FastQC/{sample}{read}_fastqc.html", sample = samples, read = reads) )
+            infiles.append( expand("FastQC/{sample}"+reads[0]+"_fastqc.html", sample = samples) )
             indir +=" FastQC "
-            infiles.append( expand(fastq_dir+"/{sample}.fastq.gz", sample = samples, read = reads) )
+            infiles.append( expand(fastq_dir+"/{sample}"+reads[0]+".fastq.gz", sample = samples) )
             indir += fastq_dir + " "
         elif fastqc:
              infiles.append( expand("FastQC/{sample}{read}_fastqc.html", sample = samples, read = reads) )

diff --git a/snakePipes/shared/rules/scRNAseq.snakefile b/snakePipes/shared/rules/scRNAseq.snakefile
@@ -1,9 +1,9 @@
 ### add barcodes from R1 to R2 #########
 
 rule fastq_barcode:
-        input:
-            R1 = "FASTQ/{sample}"+reads[0]+".fastq.gz",
-            R2 = "FASTQ/{sample}"+reads[1]+".fastq.gz"
+        input: ## remember that we swapped reads[] in internals.snakefile in this workflow!!!
+            R2 = "FASTQ/{sample}"+reads[0]+".fastq.gz",
+            R1 = "FASTQ/{sample}"+reads[1]+".fastq.gz"
         output:
             R2_barcoded = "FASTQ_barcoded/{sample}"+reads[0]+".fastq.gz"
         params:

diff --git a/snakePipes/workflows/scRNAseq/Snakefile b/snakePipes/workflows/scRNAseq/Snakefile
@@ -45,8 +45,8 @@ if trim:
 def run_Trimming(trim):
     if trim:
         file_list = [
-        expand(fastq_dir+"/{sample}.fastq.gz", sample = samples),
-        expand("FastQC_trimmed/{sample}_fastqc.html", sample = samples)
+        expand(fastq_dir+"/{sample}"+reads[0]+".fastq.gz", sample = samples),
+        expand("FastQC_trimmed/{sample}"+reads[0]+"_fastqc.html", sample = samples)
         ]
         return(file_list)
     else:
@@ -139,4 +139,4 @@ rule all:
 onsuccess:
     cf.cleanLogs(outdir)
     if "verbose" in config and config["verbose"]:
-        print("\n--- scRNAseq-mapcount workflow finished successfully! --------------------------------\n")
+        print("\n--- scRNAseq workflow finished successfully! --------------------------------\n")
diff --git a/snakePipes/workflows/scRNAseq/cluster.yaml b/snakePipes/workflows/scRNAseq/cluster.yaml
@@ -1,7 +1,9 @@
 STAR:
-    memory: 3200M
+    memory: 3500M
 sc_bam_featureCounts_genomic:
     memory: 4G
+bamPE_fragment_size:
+    memory: 3G
 combine_sample_counts:
     memory: 10G
 cluster_cells_raceid:

diff --git a/snakePipes/workflows/scRNAseq/internals.snakefile b/snakePipes/workflows/scRNAseq/internals.snakefile
@@ -43,6 +43,9 @@ if not cf.is_paired(infiles,ext,reads):
 ## After barcode transfer to R2 we have only single end data / R2
 ## but we need to keep "reads" for rule fastq_barcode
 paired = False
+## We reverse the read extensions because we continue in SE mode, but with R2:
+## some rules use a hardcoded reads[0] for SE, so reads[0] must be the R2 extension
+reads = reads[::-1]
 
 ### barcode pattern extraction #################################################
 pattern = re.compile("[N]+")