major update, including dedup_clusterPAS.py for postprocessing and adding pyBigWig to shared YAML (pushed to nodes already)
David Koppstein committed Apr 17, 2023
1 parent c0a1fe9 commit 3beb258
Showing 7 changed files with 211 additions and 61 deletions.
5 changes: 2 additions & 3 deletions snakePipes/common_functions.py
@@ -632,9 +632,8 @@ def commonYAMLandLogs(baseDir, workflowDir, defaults, args, callingScript):
args.snakemakeOptions += " --notemp"

snakemake_cmd = """
TMPDIR={tempDir} PYTHONNOUSERSITE=True {snakemake} {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
""".format(snakemake=os.path.join(snakemake_path, "snakemake"),
latency_wait=cluster_config["snakemake_latency_wait"],
TMPDIR={tempDir} PYTHONNOUSERSITE=True snakemake {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
""".format(latency_wait=cluster_config["snakemake_latency_wait"],
snakefile=os.path.join(workflowDir, "Snakefile"),
maxJobs=args.maxJobs,
workingdir=args.workingdir,
2 changes: 2 additions & 0 deletions snakePipes/shared/rules/envs/shared.yaml
@@ -15,3 +15,5 @@ dependencies:
- multiqc = 1.12
- fastp = 0.23.2
- umi_tools = 1.1.2
- pybigwig = 0.3.18
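
A minimal sketch of the kind of pyBigWig call the new three_prime_seq tools (filterBW.py, countReadEnds.py) presumably make against the bigWig tracks produced below. The filename and interval are illustrative only, not taken from the pipeline:

import pyBigWig

# Open a bigWig coverage track (hypothetical filename).
bw = pyBigWig.open("coverage.bw")

# Chromosome names and lengths from the header.
print(bw.chroms())

# Per-base values over an interval; positions with no data come back as NaN.
values = bw.values("chr2L", 0, 100)

bw.close()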

197 changes: 146 additions & 51 deletions snakePipes/shared/rules/three_prime_seq.snakefile
@@ -1,38 +1,62 @@
# adapted from Andrew Rezansoff's 3' seq pipeline tools
# /data/hilgers/group/rezansoff/3seq_pipeline_tools/

from pathlib import Path
import pandas as pd

# prevent wildcards from picking up directories
wildcard_constraints: sample="[^(\/)]+"

# read in sampleSheet metadata to merge replicates for preprocess_cluster_pas
sample_metadata = pd.read_table(sampleSheet, index_col=None)

# given a condition, return a list of all samples associated with it
# check that there are not overlapping samples
def samples_from_condition(condition):
sub_df = sample_metadata[sample_metadata['condition'] == condition]
assert(set(sub_df['name']) == set(sub_df['name'].unique()))
return list(sub_df['name'])

# the opposite of the above
def condition_from_sample(sample):
sub_df = sample_metadata[sample_metadata['name'] == sample]
return sub_df['condition'].iloc[0]
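
# For illustration only (not part of the pipeline): with a toy sample sheet,
# the two helpers above behave as follows. Sample and condition names here
# are made up.
#
# >>> sample_metadata = pd.DataFrame({"name": ["wt_rep1", "wt_rep2", "mut_rep1"],
# ...                                 "condition": ["WT", "WT", "mutant"]})
# >>> samples_from_condition("WT")
# ['wt_rep1', 'wt_rep2']
# >>> condition_from_sample("mut_rep1")
# 'mutant'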

# TODO:
# 1. pass three_prime_seq fastp arguments to fastp
# 2. specify all output in mRNA-seq snakefile -> how to pass --three-prime-seq?
# 3. check why cmatrix_filtered is commented out
# 3. check why cmatrix_filtered is commented out - OK
# 5. re-implement clusterPAS?
# 6. implement filtering of column 4 of the geneAssociation output - things with multiple hits (i.e. commas) should be filtered out [x]
# possibly assign cluster to "next" gene annotation (i.e. the closest?)
# this would be in signal2gene.py
# 7. assign unique IDs with _0, _1 to 4th column in clusterPAS output in 5'->3' order to create unique
# example is /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric.txt ->
# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric_uniqIDs.txt
# this will be run by countReadEnds
# then possibly DESeq on countReadEnds output
# understand cmatrix steps?

tools_dir = Path(maindir) / "shared" / "tools"

# trimming done using STAR and fastp

rule all_three_prime_samples:
input:

rule clusterPAS:
input:
"three_prime_seq/SampleAll.txt"
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "clusterPAS.py"),
windowsize=config["clusterPASWindowSize"], # 10
# rule clusterPAS:
# input:
# "three_prime_seq/SampleAll.txt"
# conda:
# CONDA_SHARED_ENV


rule polyAT:
input:
twobit=genome_2bit,
two_bit=genome_2bit,
bed=genes_bed
output:
"three_prime_seq/poly{base}.bed"
params:
script=(tools_dir / "three_prime_seq" / "findSitesMM.py"),
minlength=config["polyAT"]["minlength"],
mindistance=config["polyAT"]["mindistance"],
extend=config["polyAT"]["extend"],
extension=config["polyAT"]["extend"],
windowlength=config["polyAT"]["windowlength"],
percbase=config["polyAT"]["percbase"],
conda:
@@ -43,13 +67,14 @@ rule polyAT:
"--bed {input.bed} "
"--minLength {params.minlength} "
"--base {wildcards.base} "
"--extend {params.extend} "
"--minDistance {params.minDistance} "
"--windowLength {params.windowLength} "
"--percBase {params.percBase} "
"--extend {params.extension} "
"--minDistance {params.mindistance} "
"--windowLength {params.windowlength} "
"--percBase {params.percbase} "

# exclude second in pair, only 5' signal of R1
def bamcov_filter_opts(wc):
s = ("--filterRNAstrand {direction} --samFlagExclude 128
s = ("--filterRNAstrand {direction} --samFlagExclude 128 "
"--Offset 1 -bs 1 --skipNAs")
return s.format(direction=wc.direction)
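
# For illustration only: the rendered options per direction wildcard, with
# types.SimpleNamespace standing in for Snakemake's wildcards object.
# --samFlagExclude 128 drops second-in-pair reads and --Offset 1 keeps only
# each read's first (5') base, matching the comment above.
#
# >>> from types import SimpleNamespace
# >>> bamcov_filter_opts(SimpleNamespace(direction="forward"))
# '--filterRNAstrand forward --samFlagExclude 128 --Offset 1 -bs 1 --skipNAs'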

@@ -58,7 +83,7 @@ rule three_prime_seq_bam_cov:
bam=aligner + "/{sample}.sorted.bam",
bai=aligner + "/{sample}.sorted.bam.bai"
output:
"three_prime_seq/{sample}_{direction}.bw"
"three_prime_seq/raw/{sample}_direction-{direction}.bw"
params:
filterOpt=bamcov_filter_opts
threads:
@@ -73,17 +98,17 @@ rule three_prime_seq_bam_cov:


def filterbw_which_bed(wc):
return ("three_prime_seq/polyA.bed" if wildcards.direction == "forward"
return ("three_prime_seq/polyA.bed" if wc.direction == "forward"
else "three_prime_seq/polyT.bed")

rule filterBW:
input:
bigwig="three_prime_seq/{sample}_{direction}.bw",
bigwig="three_prime_seq/raw/{sample}_direction-{direction}.bw",
bed=filterbw_which_bed
output:
"three_prime_seq/{sample}_{direction}.filtered.bw"
"three_prime_seq/filtered/{sample}_direction-{direction}.bw"
log:
stdout="three_prime_seq/logs/{sample}_{direction}.stdout,
stdout="three_prime_seq/logs/{sample}_{direction}.stdout",
stderr="three_prime_seq/logs/{sample}_{direction}.stderr"
params:
script=(tools_dir / "three_prime_seq" / "filterBW.py")
@@ -94,55 +119,122 @@ rule filterBW:
"> {log.stdout} "
"2> {log.stderr} "

# previously done by hand
# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt
rule count_read_ends:
input:
expand("three_prime_seq/{{sample}}_{direction}.filtered.bw",
direction=["forward", "reverse"])
output:
ids="three_prime_seq/{sample}_uniqIDs.txt",
counts="three_prime_seq/{sample}_uniqcounts.txt"
params:
script=(tools_dir / "three_prime_seq / "countReadEnds.py")
shell:
"{params.script} {input} {output}"

# Associate signal with each gene (flank by some amount)
# Note how many bases couldn't be associated with any gene
rule geneAssociation:
input:
expand("three_prime_seq/{{sample}}_{direction}.filtered.bw",
direction=["forward", "reverse"])
expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw", direction=["forward", "reverse"])
output: "three_prime_seq/{sample}_polyA_annotation.txt"
threads: 16
params:
extend=config["geneAssociation"]["extend"], # 2000
extension=config["geneAssociation"]["extend"], # 500
gtf=genes_gtf,
script=(tools_dir / "three_prime_seq" / "signal2gene.py")
conda:
CONDA_SHARED_ENV
shell:
"{params.script} --extend {params.extend} "
"{params.script} --extend {params.extension} "
"--threads {threads} "
"{input} {params.gtf} {output} "

# return list of replicates for each condition output by geneAssociation
def find_replicates_cluster_pas(wc):
_samples = samples_from_condition(str(wc.condition))
return expand("three_prime_seq/{sample}_polyA_annotation.txt", sample=_samples)

# here we need to merge replicates of geneAssociation -> see
# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/combined_cluster_andGrep_commands
# also sort by start position
rule preprocess_cluster_pas:
input:
find_replicates_cluster_pas
output:
temp("three_prime_seq/tmp/condition-{condition}_preprocessed.txt")
shell:
"cat {input} | "
"sed '/^[ ]*Chrom/ d' | "
"sort -k1,1 -k2,2n "
"> {output} "

rule clusterPAS:
# input is preprocessed output from geneAssoc
input:
"three_prime_seq/tmp/condition-{condition}_preprocessed.txt"
output:
temp("three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt")
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "clusterPAS.py"),
windowsize=config["clusterPAS"]["window"], # 15
shell:
"python {params.script} {input} {output}"


# awk command: remove entries with multiple genes in 4th column (must be unambiguous)
# python script: add "_1", "_2", to each cluster label (4th column) to make each
# unique for each genomic position
rule postprocess_cluster_pas:
input:
"three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt"
output:
"three_prime_seq/db/condition-{condition}_clusterPAS_db.txt"
conda:
CONDA_SHARED_ENV
params:
dedup_script=(tools_dir / "three_prime_seq" / "dedup_clusterPAS.py")
shell:
"""
grep -v "5'UTR" {input} | \
grep -v "CDS" | \
grep -v "exon" | \
awk '!($4 ~ /[,]/)' | \
python {params.dedup_script} > {output}
"""


# return clusterPAS db for corresponding condition given sample
def find_cluster_pas_db(wc):
condition = condition_from_sample(str(wc.sample))
return ("three_prime_seq/db/condition-{condition}_clusterPAS_db.txt"
.format(condition=condition))


# previously done by hand
# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt
# each line of output of clusterPAS must be unique'd
rule count_read_ends:
input:
bws=expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw",
direction=["forward", "reverse"]),
bed=find_cluster_pas_db, # this is the "database" of PAS sites
output:
counts="three_prime_seq/{sample}_uniqcounts.txt"
wildcard_constraints:
sample="[^\/]+" # no /
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "countReadEnds.py")
shell:
"python {params.script} {input} {output}"


## Need a better regex filter ".+(?<!\.filtered)"
rule cmatrix_raw:
input:
expand("three_prime_seq/{sample}_{{direction}}.bw", sample = samples)
expand("three_prime_seq/raw/{sample}_direction-{{direction}}.bw", sample=samples)
output:
temp("three_prime_seq/cmatrix_raw_direction-{direction}.mat.gz")
threads:
16
conda:
CONDA_SHARED_ENV
params:
upstream=config["geneAssociation"]["upstream"], #500
downstream=config["geneAssociation"]["downstream"], #500
upstream=config["cmatrix_raw"]["upstream"], #500
downstream=config["cmatrix_raw"]["downstream"], #500
labels=samples,
bed=genes_bed
bed=genes_bed,
shell:
"computeMatrix scale-regions "
"-S {input} "
@@ -156,11 +248,12 @@ rule cmatrix_raw:
"-bs 5 "
"-o {output} "


rule cmatrix_filtered:
input:
"three_prime_seq/cmatrix_raw_direction-{direction}.mat.gz"
output:
temp("three_prime_seq/cmatrix_raw_direction-{direction}.filtered.mat.gz")
temp("three_prime_seq/cmatrix_filtered_direction-{direction}.mat.gz")
threads: 8
params:
strand=lambda wc: "+" if wc.direction == "forward" else "-"
@@ -172,9 +265,10 @@ rule cmatrix_filtered:
"-o {output} "
"--strand '{params.strand}' "


rule merge_matrix:
input:
expand("three_prime_seq/cmatrix_raw_direction-{direction}.filtered.mat.gz",
expand("three_prime_seq/cmatrix_filtered_direction-{direction}.mat.gz",
direction=["forward", "reverse"])
output:
"three_prime_seq/combined_polyA.mat.gz"
@@ -192,4 +286,5 @@ rule heatmap:
conda:
CONDA_SHARED_ENV
shell:
"plotProfile -m {input} -o {output}"
"plotProfile -m {input} -o {output}"

38 changes: 38 additions & 0 deletions snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python

import pandas as pd
import sys

"""
Read in post-processed output of clusterPAS from STDIN, add header,
deduplicate the Gene column (4th), write to STDOUT.
"""


HEADERS = [
"Chromosome",
"Start",
"End",
"Gene",
"Counts",
"Strand",
"Annotation",
"Summit",
]


def dedup(sub_df):
new_ids = ["_".join([str(gene), str(i)]) for (i, gene) in enumerate(sub_df['Gene'])]
sub_df['Gene'] = new_ids
return sub_df


def main():
df = pd.read_table(sys.stdin, index_col=None, header=None)
df.columns = HEADERS
df = df.groupby("Gene").apply(dedup)
df.to_csv(sys.stdout, sep="\t", index=False)


if __name__ == "__main__":
main()
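
For a concrete sense of the dedup step above: within each Gene group, labels get a positional suffix in input order, so a gene that appears twice becomes geneA_0 and geneA_1. A self-contained toy run of the same logic (values made up; group_keys=False is used here only to keep a flat index for display):

import pandas as pd

df = pd.DataFrame({"Gene": ["geneA", "geneA", "geneB"],
                   "Counts": [10, 4, 7]})

def dedup(sub_df):
    sub_df["Gene"] = ["_".join([str(g), str(i)])
                      for (i, g) in enumerate(sub_df["Gene"])]
    return sub_df

print(df.groupby("Gene", group_keys=False).apply(dedup))
#       Gene  Counts
# 0  geneA_0      10
# 1  geneA_1       4
# 2  geneB_0       7
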
12 changes: 9 additions & 3 deletions snakePipes/workflows/mRNA-seq/Snakefile
@@ -111,6 +111,10 @@ else:
## non-allelic featureCounts
include: os.path.join(maindir, "shared", "rules", "featureCounts.snakefile")

# Three Prime Sequencing mode
if "threePrimeSeq" in mode:
include: os.path.join(maindir, "shared", "rules", "three_prime_seq.snakefile")

##Genomic_contamination
include: os.path.join(maindir, "shared", "rules", "GenomicContamination.snakefile")

@@ -287,11 +291,13 @@ def run_deepTools_qc():


def run_threePrimeSeq():
file_list = []
if "threePrimeSeq" in mode:
file_list = [
expand(),
expand(),
file_list += [
"three_prime_seq/combined_polyA.mat.gz",
expand("three_prime_seq/{sample}_uniqcounts.txt", sample=samples),
]
return file_list


def run_GenomicContamination():
