Merge pull request #891 from maxplanck-ie/3primeseq

3primeseq
maxplanck-ie · Apr 21, 2023 · e61c3bd · e61c3bd
2 parents 8ab7b69 + 5671e10
commit e61c3bd
Show file tree

Hide file tree

Showing 19 changed files with 1,142 additions and 12 deletions.
diff --git a/.ci_stuff/organism.yaml b/.ci_stuff/organism.yaml
@@ -1,7 +1,7 @@
 genome_size: 2652783500
 genome_fasta: "/tmp/genome.fa"
 genome_index: "/tmp/genome.fa.fai"
-genome_2bit: ".ci_stuff/genome_fasta/genome.2bit"
+genome_2bit: "/tmp/genome.2bit"
 bowtie2_index: ".ci_stuff/BowtieIndex/genome"
 hisat2_index: ".ci_stuff/HISAT2Index/genome"
 bwa_index: ".ci_stuff/BWAindex/genome.fa"

diff --git a/.ci_stuff/test_dag.sh b/.ci_stuff/test_dag.sh
@@ -106,7 +106,7 @@ touch allelic_BAM_input/allelic_bams/sample1.genome1.sorted.bam \
       allelic_BAM_input/bamCoverage/allele_specific/sample5.genome1.seq_depth_norm.bw \
       allelic_BAM_input/bamCoverage/allele_specific/sample6.genome1.seq_depth_norm.bw
 mkdir -p output
-touch /tmp/genes.gtf /tmp/genome.fa /tmp/genome.fa.fai /tmp/rmsk.txt /tmp/genes.bed /tmp/spikein_genes.gtf
+touch /tmp/genes.gtf /tmp/genome.fa /tmp/genome.fa.fai /tmp/rmsk.txt /tmp/genes.bed /tmp/spikein_genes.gtf /tmp/genome.2bit
 mkdir -p allelic_input
 mkdir -p allelic_input/Ngenome
 touch allelic_input/file.vcf.gz allelic_input/snpfile.txt
@@ -214,6 +214,9 @@ if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 657 ]; then exit 1 ; fi
 #multiple comparison groups
 WC=`mRNA-seq --mode alignment,alignment-free -i PE_input -o output --rMats --sampleSheet .ci_stuff/test_sampleSheet_multiComp.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 869 ]; then exit 1 ; fi
+# three prime sequencing
+WC=`mRNA-seq -i PE_input -o output --three-prime-seq --sampleSheet .ci_stuff/test_sampleSheet.tsv --snakemakeOptions " --dryrun --conda-prefix /tmp" .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
+if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1332 ]; then exit 1 ; fi
 #allelic
 WC=`mRNA-seq -m allelic-mapping,deepTools_qc -i PE_input -o output --snakemakeOptions " --dryrun --conda-prefix /tmp" --VCFfile allelic_input/file.vcf.gz --strains strain1,strain2 .ci_stuff/organism.yaml | tee >(cat 1>&2) | grep -v "Conda environment" | sed '/^\s*$/d' | wc -l`
 if [ ${PIPESTATUS[0]} -ne 0 ] || [ $WC -ne 1357 ]; then exit 1 ; fi

diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
@@ -38,7 +38,7 @@ jobs:
     - name: flake
       run: |
         micromamba activate snakePipes_CI
-        flake8 --ignore=E501,E722,E402 --exclude docs/conf.py .
+        flake8 --ignore=E501,E722,E402 --exclude docs/conf.py,build/lib/snakePipes/shared/tools/three_prime_seq,snakePipes/shared/tools/three_prime_seq .
   CI:
     runs-on: ubuntu-latest
     steps:

diff --git a/.gitignore b/.gitignore
@@ -26,3 +26,6 @@ output
 
 # pip stuff
 snakePipes.egg-info
+
+# misc
+.vscode/
diff --git a/docs/content/workflows/mRNA-seq.rst b/docs/content/workflows/mRNA-seq.rst
@@ -187,6 +187,28 @@ using the **deepTools_qc** mode. It's a very useful add-on with any of the other
 
 .. note:: Since most deeptools functions require an aligned (BAM) file, the deepTools_qc mode will additionally perform the alignment of the fastq files. However this would not interfere with operations of the other modes.
 
+"threePrimeSeq"
+~~~~~~~~~~~~~~~
+
+**threePrimeSeq** uses a pipeline developed by the Hilgers lab to annotate and
+count clusters of reads that map to the three prime ends of genes, for libraries
+prepared with poly(T)VN-primed 3' sequencing kits such as Lexogen's 3' mRNA-seq kit.
+In this mode, reads are first trimmed with **fastp** using preset parameters and
+then mapped with **STAR**.
+
+First, a blacklist of possible internal priming sites is generated for the
+given organism. Next, the mapped regions are filtered against this
+blacklist and associated with the nearest gene within a certain window.
+For all samples within the run, a database of PAS sites is generated
+and the read counts are aggregated for each site. These counts are then
+summarized on a metagene level and written to a ``counts.tsv`` file for
+further downstream analysis.
+
+The output for this mode will be stored in the ``three_prime_seq/`` subfolder. 
+
+.. note:: The ``--three-prime-seq`` option must be passed to use this mode: it sets the mode to threePrimeSeq and configures **fastp** and **STAR** with the appropriate parameters.
+
+
 Understanding the outputs
 ---------------------------
 

diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py
@@ -581,8 +581,6 @@ def commonYAMLandLogs(baseDir, workflowDir, defaults, args, callingScript):
     and create the DAG
     """
     workflowName = os.path.basename(callingScript)
-    snakemake_path = os.path.dirname(os.path.abspath(callingScript))
-
     os.makedirs(args.outdir, exist_ok=True)
 
     if isinstance(args.snakemakeOptions, list):
@@ -632,9 +630,8 @@ def commonYAMLandLogs(baseDir, workflowDir, defaults, args, callingScript):
         args.snakemakeOptions += " --notemp"
 
     snakemake_cmd = """
-                    TMPDIR={tempDir} PYTHONNOUSERSITE=True {snakemake} {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
-                    """.format(snakemake=os.path.join(snakemake_path, "snakemake"),
-                               latency_wait=cluster_config["snakemake_latency_wait"],
+                    TMPDIR={tempDir} PYTHONNOUSERSITE=True snakemake {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
+                    """.format(latency_wait=cluster_config["snakemake_latency_wait"],
                                snakefile=os.path.join(workflowDir, "Snakefile"),
                                maxJobs=args.maxJobs,
                                workingdir=args.workingdir,

diff --git a/snakePipes/shared/rules/envs/shared.yaml b/snakePipes/shared/rules/envs/shared.yaml
@@ -15,3 +15,5 @@ dependencies:
  - multiqc = 1.12
  - fastp = 0.23.2
  - umi_tools = 1.1.2
+ - pybigwig = 0.3.18
+