Merge pull request #725 from maxplanck-ie/develop

Develop
maxplanck-ie · Nov 16, 2020 · 3f3d2bf · 3f3d2bf
2 parents d73848c + 788163a
commit 3f3d2bf
Show file tree

Hide file tree

Showing 31 changed files with 514 additions and 44 deletions.
diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh
@@ -184,7 +184,7 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 331 ]; then exit 1 ; fi
 WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 826 ]; then exit 1 ; fi
 WC=`mRNA-seq -i PE_input -o output --rMats --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
-if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 841 ]; then exit 1 ; fi
+if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 842 ]; then exit 1 ; fi
 WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 602 ]; then exit 1 ; fi
 WC=`mRNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" -m "alignment,deepTools_qc" --trim .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
@@ -207,6 +207,9 @@ WC=`mRNA-seq -i SE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 909 ]; then exit 1 ; fi
 WC=`mRNA-seq -i BAM_input/filtered_bam -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --fromBAM .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 593 ]; then exit 1 ; fi
+#multiple comparison groups
+WC=`mRNA-seq --mode alignment,alignment-free -i PE_input -o output --rMats --sampleSheet .ci_stuff/test_sampleSheet_multiComp.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
+if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 867 ]; then exit 1 ; fi
 #allelic
 WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1,strain2 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1450 ]; then exit 1 ; fi
@@ -222,7 +225,9 @@ WC=`noncoding-RNA-seq -i SE_input -o output --sampleSheet .ci_stuff/test_sampleS
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 582 ]; then exit 1 ; fi
 WC=`noncoding-RNA-seq -i BAM_input -o output --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" --fromBAM .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 503 ]; then exit 1 ; fi
-
+#multiple comparison groups
+WC=`noncoding-RNA-seq -i PE_input -o output --sampleSheet .ci_stuff/test_sampleSheet_multiComp.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
+if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 689 ]; then exit 1 ; fi
 
 # scRNA-seq
 #WC=`scRNAseq -i PE_input -o output --mode Gruen --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`

diff --git a/.ci_stuff/test_sampleSheet_multiComp.tsv b/.ci_stuff/test_sampleSheet_multiComp.tsv
@@ -0,0 +1,7 @@
+name	condition	group
+sample1	sample1	Control	All
+sample2	sample2	Control	All
+sample3	sample3	Treatment	Group1
+sample4	sample4	Treatment	Group1
+sample5	sample5	Treatment	Group2
+sample6	sample6	Treatment	Group2
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,6 +1,6 @@
 package:
   name: snakepipes
-  version: 2.3.1
+  version: 2.4.0
 
 source:
   path: ../

diff --git a/docs/content/News.rst b/docs/content/News.rst
@@ -1,6 +1,18 @@
 snakePipes News
 ===============
 
+snakePipes 2.4.0
+----------------
+
+* Added support for multiple pairwise comparisons for DESeq2, sleuth, and rMats in the mRNA-seq workflow, as well as for DESeq2 in the noncoding-RNA-seq workflow.
+* Loompy from conda is now used in mode STARsolo in scRNA-seq workflow.
+* Added bamExt to mRNA-seq and noncoding-RNA-seq commandline arguments.
+* Added multi-thread support to rMats in mRNA-seq workflow.
+* Fixed deepTools GC bias command with SE reads.
+* Bumped HiC explorer version.
+* Fixed STARsoloCoords for Custom kit.
+
+
 snakePipes 2.3.1
 ----------------
 

diff --git a/docs/content/setting_up.rst b/docs/content/setting_up.rst
@@ -34,7 +34,7 @@ The easiest way to install snakePipes is via our conda channel. The following co
 
 .. code:: bash
 
-    conda create -n snakePipes -c mpi-ie -c conda-forge -c bioconda snakePipes==2.3.1
+    conda create -n snakePipes -c mpi-ie -c conda-forge -c bioconda snakePipes==2.4.0
 
 This way, the software used within snakePipes do not conflict with the software pre-installed on your terminal or in your python environment.
 

diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst
@@ -129,6 +129,23 @@ Complex designs with blocking factors
 
 If the user provides additional columns between 'name' and 'condition' in the sample sheet, the variables stored there will be used as blocking factors in the order they appear in the sample sheet. Eg. if the first line of your sample sheet looks like 'name	batch	condition', this will translate into a formula ``batch + condition``. 'condition' has to be the final column and it will be used for any statistical inference.
 
+Multiple pairwise comparisons
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The user may specify multiple groups of independent comparisons by providing a 'group' column after the 'condition' column. This will cause the sample sheet to be split into the groups defined in this column, and a corresponding number of independent pairwise comparisons will be run, one for each split sheet, and placed in separate output folders named accordingly. This will be applied to DESeq2, sleuth, and rMats pairwise comparisons as requested by the user.
+Specifying a value of 'All' in the 'group' column will cause that sample group to be used in all pairwise comparisons, e.g. if the same set of controls should be used for several different treatment groups.
+
+An example sample sheet with the group information provided looks like this:
+
+name	condition	group
+sample1	Control		All
+sample2	Control		All
+sample3	Treatment	Group1
+sample4	Treatment	Group1
+sample5	Treatment	Group2
+sample6	Treatment	Group2
+
+
 Analysis modes
 --------------
 

diff --git a/docs/content/workflows/noncoding-RNA-seq.rst b/docs/content/workflows/noncoding-RNA-seq.rst
@@ -102,6 +102,22 @@ Complex designs with blocking factors
 
 If the user provides additional columns between 'name' and 'condition' in the sample sheet, the variables stored there will be used as blocking factors in the order they appear in the sample sheet. Eg. if the first line of your sample sheet looks like 'name	batch	condition', this will translate into a formula ``batch + condition``. 'condition' has to be the final column and it will be used for any statistical inference.
 
+Multiple pairwise comparisons
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The user may specify multiple groups of independent comparisons by providing a 'group' column after the 'condition' column. This will cause the sample sheet to be split into the groups defined in this column, and a corresponding number of independent pairwise comparisons will be run, one for each split sheet, and placed in separate output folders named accordingly. This will be applied to DESeq2 pairwise comparison.
+Specifying a value of 'All' in the 'group' column will cause that sample group to be used in all pairwise comparisons, e.g. if the same set of controls should be used for several different treatment groups.
+
+An example sample sheet with the group information provided looks like this:
+
+name	condition	group
+sample1	Control		All
+sample2	Control		All
+sample3	Treatment	Group1
+sample4	Treatment	Group1
+sample5	Treatment	Group2
+sample6	Treatment	Group2
+
 Analysis modes
 --------------
 

diff --git a/docs/index.rst b/docs/index.rst
@@ -34,7 +34,7 @@ Quick start
 
 .. code:: bash
 
-    conda create -n snakePipes -c mpi-ie -c conda-forge -c bioconda snakePipes==2.3.1
+    conda create -n snakePipes -c mpi-ie -c conda-forge -c bioconda snakePipes==2.4.0
 
 * You can update snakePipes to the latest version available on conda with:
 

diff --git a/snakePipes/__init__.py b/snakePipes/__init__.py
@@ -1 +1 @@
-__version__ = '2.3.1'
+__version__ = '2.4.0'
diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py
@@ -22,6 +22,7 @@ def set_env_yamls():
     return {'CONDA_SHARED_ENV': 'envs/shared.yaml',
             'CONDA_CREATE_INDEX_ENV': 'envs/createIndices.yaml',
             'CONDA_RNASEQ_ENV': 'envs/rna_seq.yaml',
+            'CONDA_RMATS_ENV': 'envs/rMats.yaml',
             'CONDA_scRNASEQ_ENV': 'envs/sc_rna_seq.yaml',
             'CONDA_seurat3_ENV': 'envs/sc_rna_seq_seurat3.yaml',
             'CONDA_loompy_ENV': 'envs/sc_rna_seq_loompy.yaml',
@@ -230,6 +231,101 @@ def check_replicates(sample_info_file):
     return True
 
 
+def isMultipleComparison(sampleSheet):
+    f = open(sampleSheet)
+    nCols = None
+    d = dict()
+    for idx, line in enumerate(f):
+        cols = line.strip().split("\t")
+        if idx == 0:
+            if "group" not in cols:
+                return False
+            comparisonGroupCol = cols.index("group")
+            nCols = len(cols)
+            continue
+        elif idx == 1:
+            # Sometimes there's a column of row names, which lack a header
+            if len(cols) - 1 == nCols:
+                comparisonGroupCol += 1
+        if not len(line.strip()) == 0:
+            if cols[comparisonGroupCol] not in d:
+                d[cols[comparisonGroupCol]] = 0
+            d[cols[comparisonGroupCol]] += 1
+    f.close()
+
+    if len(d) > 1:
+        return True
+
+
+def splitSampleSheet(sampleSheet, destination_pfx):
+    f = open(sampleSheet)
+    conditionCol = None
+    nameCol = None
+    comparisonGroupCol = None
+    nCols = None
+    d = dict()
+    for idx, line in enumerate(f):
+        cols = line.strip().split("\t")
+        if idx == 0:
+            conditionCol = cols.index("condition")
+            nameCol = cols.index("name")
+            comparisonGroupCol = cols.index("group")
+            nCols = len(cols)
+            continue
+        elif idx == 1:
+            # Sometimes there's a column of row names, which lack a header
+            if len(cols) != nCols and len(cols) - 1 != nCols:
+                sys.exit("ERROR: there's a mismatch between the number of columns in the header and body of {}!\n".format(sampleSheet))
+            if len(cols) - 1 == nCols:
+                conditionCol += 1
+                nameCol += 1
+                comparisonGroupCol += 1
+        if not len(line.strip()) == 0:
+            if cols[comparisonGroupCol] not in d:
+                d[cols[comparisonGroupCol]] = []
+            d[cols[comparisonGroupCol]].append([cols[nameCol], cols[conditionCol]])
+
+    f.close()
+    for k in d.keys():
+        if k != "All" and "All" in d.keys():
+            d[k].extend(d['All'])
+            outfile = os.path.join("splitSampleSheets", '.'.join([os.path.basename(destination_pfx), k, 'tsv']))
+            with open(outfile, 'w') as of:
+                of.write('name\tcondition\n')
+                for item in d[k]:
+                    of.write('\t'.join(item) + '\n')
+
+    return
+
+
+def returnComparisonGroups(sampleSheet):
+    f = open(sampleSheet)
+    nCols = None
+    d = dict()
+    for idx, line in enumerate(f):
+        cols = line.strip().split("\t")
+        if idx == 0:
+            if "group" not in cols:
+                return False
+            comparisonGroupCol = cols.index("group")
+            nCols = len(cols)
+            continue
+        elif idx == 1:
+            # Sometimes there's a column of row names, which lack a header
+            if len(cols) - 1 == nCols:
+                comparisonGroupCol += 1
+        if not len(line.strip()) == 0:
+            if cols[comparisonGroupCol] not in d:
+                d[cols[comparisonGroupCol]] = 0
+            d[cols[comparisonGroupCol]] += 1
+    f.close()
+
+    if "All" in d.keys():
+        del d['All']
+
+    return d.keys()
+
+
 def sampleSheetGroups(sampleSheet):
     """
     Parse a sampleSheet and return a dictionary with keys the group and values the sample names

diff --git a/snakePipes/shared/rules/DESeq2.multipleComp.snakefile b/snakePipes/shared/rules/DESeq2.multipleComp.snakefile
@@ -0,0 +1,91 @@
+## function to get the name of the samplesheet and extend the name of the folder DESeq2 to DESeq2_[name]
+def get_outdir(folder_name,sampleSheet):
+    sample_name = os.path.splitext(os.path.basename(str(sampleSheet)))[0]
+
+    return("{}_{}".format(folder_name, sample_name))
+
+checkpoint split_sampleSheet:
+    input:
+        sampleSheet = sampleSheet
+    output:
+        splitSheets = os.path.join("splitSampleSheets",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")
+    params:
+        splitSheetPfx = os.path.join("splitSampleSheets",os.path.splitext(os.path.basename(str(sampleSheet)))[0])
+    run:
+        if isMultipleComparison:
+            cf.splitSampleSheet(input.sampleSheet,params.splitSheetPfx)
+
+
+## DESeq2 (on featureCounts)
+rule DESeq2:
+    input:
+        counts_table = lambda wildcards : "featureCounts/counts_allelic.tsv" if 'allelic-mapping' in mode else "featureCounts/counts.tsv",
+        sampleSheet = lambda wildcards: checkpoints.split_sampleSheet.get(compGroup=wildcards.compGroup).output,
+        symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file
+    output:
+         "{}/DESeq2.session_info.txt".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    benchmark:
+        "{}/.benchmark/DESeq2.featureCounts.benchmark".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    params:
+        script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        outdir = lambda wildcards,input: get_outdir("DESeq2",input.sampleSheet),
+        sampleSheet = lambda wildcards,input: os.path.join(outdir,str(input.sampleSheet)),
+        fdr = 0.05,
+        importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
+        allele_info = lambda wildcards : 'TRUE' if 'allelic-mapping' in mode else 'FALSE',
+        tx2gene_file = 'NA',
+        rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")
+    log:
+        out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
+        err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    conda: CONDA_RNASEQ_ENV
+    shell:
+        "cd {params.outdir} && "
+        "Rscript {params.script} "
+        "{params.sampleSheet} " # 1
+        "../{input.counts_table} " # 2
+        "{params.fdr} " # 3
+        "../{input.symbol_file} " # 4
+        "{params.importfunc} " # 5
+        "{params.allele_info} " # 6
+        "{params.tx2gene_file} " # 7
+        "{params.rmdTemplate} " # 8
+        " > ../{log.out} 2> ../{log.err}"
+
+
+## DESeq2 (on Salmon)
+rule DESeq2_Salmon:
+    input:
+        counts_table = "Salmon/counts.transcripts.tsv",
+        sampleSheet = lambda wildcards: checkpoints.split_sampleSheet.get(compGroup=wildcards.compGroup).output,
+        tx2gene_file = "Annotation/genes.filtered.t2g",
+        symbol_file = "Annotation/genes.filtered.symbol" #get_symbol_file
+    output:
+        "{}/DESeq2.session_info.txt".format(get_outdir("DESeq2_Salmon",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    log:
+        out = "{}/logs/DESeq2.out".format(get_outdir("DESeq2_Salmon",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv")),
+        err = "{}/logs/DESeq2.err".format(get_outdir("DESeq2_Salmon",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    benchmark:
+        "{}/.benchmark/DESeq2.Salmon.benchmark".format(get_outdir("DESeq2_Salmon",os.path.splitext(os.path.basename(str(sampleSheet)))[0]+".{compGroup}.tsv"))
+    params:
+        script=os.path.join(maindir, "shared", "rscripts", "DESeq2.R"),
+        outdir = lambda wildcards,input: get_outdir("DESeq2_Salmon",input.sampleSheet),
+        sampleSheet = lambda wildcards,input: os.path.join(outdir,str(input.sampleSheet)),
+        fdr = 0.05,
+        importfunc = os.path.join(maindir, "shared", "rscripts", "DE_functions.R"),
+        allele_info = 'FALSE',
+        tx2gene_file = "Annotation/genes.filtered.t2g",
+        rmdTemplate = os.path.join(maindir, "shared", "rscripts", "DESeq2Report.Rmd")
+    conda: CONDA_RNASEQ_ENV
+    shell:
+        "cd {params.outdir} && "
+        "Rscript {params.script} "
+        "{params.sampleSheet} " # 1
+        "../{input.counts_table} " # 2
+        "{params.fdr} " # 3
+        "../{input.symbol_file} " # 4
+        "{params.importfunc} " # 5
+        "{params.allele_info} " # 6
+        "../{input.tx2gene_file} " # 7
+        "{params.rmdTemplate} " # 8
+        " > ../{log.out} 2> ../{log.err}"
diff --git a/snakePipes/shared/rules/DESeq2.snakefile → .../shared/rules/DESeq2.singleComp.snakefile b/snakePipes/shared/rules/DESeq2.snakefile → .../shared/rules/DESeq2.singleComp.snakefile
diff --git a/snakePipes/shared/rules/deepTools_qc.snakefile b/snakePipes/shared/rules/deepTools_qc.snakefile
@@ -188,7 +188,7 @@ rule computeGCBias:
         genome_size = int(genome_size),
         genome_2bit = genome_2bit,
         blacklist = "--blackListFileName {}".format(blacklist_bed) if blacklist_bed else "",
-        median_fragment_length = "" if pairedEnd else "-fragmentLength {}".format(fragmentLength),
+        median_fragment_length = "" if pairedEnd else "--fragmentLength {}".format(fragmentLength),
         sampleSize = downsample if downsample and downsample < 10000000 else 10000000
     log:
         out = "deepTools_qc/logs/computeGCBias.{sample}.filtered.out",

diff --git a/snakePipes/shared/rules/envs/hic.yaml b/snakePipes/shared/rules/envs/hic.yaml
@@ -3,6 +3,6 @@ channels:
  - conda-forge
  - bioconda
 dependencies:
- - hicexplorer = 3.5.3
+ - hicexplorer = 3.6
  - bwa = 0.7.17
  - samtools = 1.10
diff --git a/snakePipes/shared/rules/envs/rMats.yaml b/snakePipes/shared/rules/envs/rMats.yaml
@@ -0,0 +1,8 @@
+name: snakepipes_rMats_environment_0.1
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ - python >= 3
+ - rmats = 4.1.0
+ - samtools = 1.10
diff --git a/snakePipes/shared/rules/envs/rna_seq.yaml b/snakePipes/shared/rules/envs/rna_seq.yaml
@@ -3,6 +3,7 @@ channels:
  - conda-forge
  - bioconda
 dependencies:
+# - python >= 3
  - bedtools = 2.29.2
  - samtools = 1.10
  - subread = 2.0.0
@@ -24,4 +25,4 @@ dependencies:
  - bioconductor-tximport = 1.14.0
  - r-knitcitations
  - bioconductor-apeglm
- - rmats
+# - rmats = 4.1.0
diff --git a/snakePipes/shared/rules/envs/sc_rna_seq_loompy.yaml b/snakePipes/shared/rules/envs/sc_rna_seq_loompy.yaml
@@ -4,5 +4,4 @@ channels:
  - bioconda
 dependencies:
  - python=3.8
- - pip:
-   - loompy
+ - loompy=3.0.6