From 3beb258fc926e63ab12641fbb84d1df756d00528 Mon Sep 17 00:00:00 2001 From: David Koppstein Date: Mon, 17 Apr 2023 14:18:11 +0200 Subject: [PATCH] major update, including dedup_clusterPAS.py for postprocessing and adding pyBigWig to shared YAML (pushed to nodes already) --- snakePipes/common_functions.py | 5 +- snakePipes/shared/rules/envs/shared.yaml | 2 + .../shared/rules/three_prime_seq.snakefile | 197 +++++++++++++----- .../tools/three_prime_seq/dedup_clusterPAS.py | 38 ++++ snakePipes/workflows/mRNA-seq/Snakefile | 12 +- snakePipes/workflows/mRNA-seq/defaults.yaml | 11 +- snakePipes/workflows/mRNA-seq/mRNA-seq | 7 +- 7 files changed, 211 insertions(+), 61 deletions(-) create mode 100644 snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py diff --git a/snakePipes/common_functions.py b/snakePipes/common_functions.py index 3e878ae38..d16d51742 100644 --- a/snakePipes/common_functions.py +++ b/snakePipes/common_functions.py @@ -632,9 +632,8 @@ def commonYAMLandLogs(baseDir, workflowDir, defaults, args, callingScript): args.snakemakeOptions += " --notemp" snakemake_cmd = """ - TMPDIR={tempDir} PYTHONNOUSERSITE=True {snakemake} {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir} - """.format(snakemake=os.path.join(snakemake_path, "snakemake"), - latency_wait=cluster_config["snakemake_latency_wait"], + TMPDIR={tempDir} PYTHONNOUSERSITE=True snakemake {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir} + """.format(latency_wait=cluster_config["snakemake_latency_wait"], snakefile=os.path.join(workflowDir, "Snakefile"), maxJobs=args.maxJobs, workingdir=args.workingdir, diff --git a/snakePipes/shared/rules/envs/shared.yaml b/snakePipes/shared/rules/envs/shared.yaml index fe22c5e10..4bf8573ca 100755 --- a/snakePipes/shared/rules/envs/shared.yaml +++ b/snakePipes/shared/rules/envs/shared.yaml @@ -15,3 +15,5 @@ dependencies: - multiqc = 1.12 - fastp = 0.23.2 - umi_tools = 1.1.2 + - pybigwig = 0.3.18 + \ No newline at end of file diff --git a/snakePipes/shared/rules/three_prime_seq.snakefile b/snakePipes/shared/rules/three_prime_seq.snakefile index 85ee4c6cc..ae572059e 100644 --- a/snakePipes/shared/rules/three_prime_seq.snakefile +++ b/snakePipes/shared/rules/three_prime_seq.snakefile @@ -1,30 +1,54 @@ # adapted from Andrew Rezansoff's 3' seq pipeline tools # /data/hilgers/group/rezansoff/3seq_pipeline_tools/ +from pathlib import Path +import pandas as pd + +# prevent wildcards from picking up directories +wildcard_constraints: sample="[^(\/)]+" + +# read in sampleSheet metadata to merge replicates for preprocess_cluster_pas +sample_metadata = pd.read_table(sampleSheet, index_col=None) + +# given a condition, return a list of all samples associated with it +# check that there are not overlapping samples +def samples_from_condition(condition): + sub_df = sample_metadata[sample_metadata['condition'] == condition] + assert(set(sub_df['name']) == set(sub_df['name'].unique())) + return list(sub_df['name']) + +# the opposite of the above +def condition_from_sample(sample): + sub_df = sample_metadata[sample_metadata['name'] == sample] + return sub_df['condition'].iloc[0] + # TODO: -# 1. pass three_prime_seq fastp arguments to fastp -# 2. specify all output in mRNA-seq snakefile -> how to pass --three-prime-seq? -# 3. check why cmatrix_filtered is commented out +# 3. check why cmatrix_filtered is commented out - OK +# 5. re-implement clusterPAS? +# 6. implement filtering of column 4 of the geneAssociation output - things with multiple hits (i.e. commas) should be filtered out [x] +# possibly assign cluster to "next" gene annotation (i.e. the closest?) +# this would be in signal2gene.py +# 7. assign unique IDs with _0, _1 to 4th column in clusterPAS output in 5'->3' order to create unique +# example is /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric.txt -> +# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric_uniqIDs.txt +# this will be run by countReadEnds +# then possibly DESeq on countReadEnds output +# understand cmatrix steps? tools_dir = Path(maindir) / "shared" / "tools" # trimming done using STAR and fastp -rule all_three_prime_samples: - input: - -rule clusterPAS: - input: - "three_prime_seq/SampleAll.txt" - conda: - CONDA_SHARED_ENV - params: - script=(tools_dir / "three_prime_seq" / "clusterPAS.py"), - windowsize=config["clusterPASWindowSize"], # 10 +# rule clusterPAS: +# input: +# "three_prime_seq/SampleAll.txt" +# conda: +# CONDA_SHARED_ENV + rule polyAT: input: - twobit=genome_2bit, + two_bit=genome_2bit, bed=genes_bed output: "three_prime_seq/poly{base}.bed" @@ -32,7 +56,7 @@ rule polyAT: script=(tools_dir / "three_prime_seq" / "findSitesMM.py"), minlength=config["polyAT"]["minlength"], mindistance=config["polyAT"]["mindistance"], - extend=config["polyAT"]["extend"], + extension=config["polyAT"]["extend"], windowlength=config["polyAT"]["windowlength"], percbase=config["polyAT"]["percbase"], conda: @@ -43,13 +67,14 @@ rule polyAT: "--bed {input.bed} " "--minLength {params.minlength} " "--base {wildcards.base} " - "--extend {params.extend} " - "--minDistance {params.minDistance} " - "--windowLength {params.windowLength} " - "--percBase {params.percBase} " + "--extend {params.extension} " + "--minDistance {params.mindistance} " + "--windowLength {params.windowlength} " + "--percBase {params.percbase} " +# exclude second in pair, only 5' signal of R1 def bamcov_filter_opts(wc): - s = ("--filterRNAstrand {direction} --samFlagExclude 128 + s = ("--filterRNAstrand {direction} --samFlagExclude 128 " "--Offset 1 -bs 1 --skipNAs") return s.format(direction=wc.direction) @@ -58,7 +83,7 @@ rule three_prime_seq_bam_cov: bam=aligner + "/{sample}.sorted.bam", bai=aligner + "/{sample}.sorted.bam.bai" output: - "three_prime_seq/{sample}_{direction}.bw" + "three_prime_seq/raw/{sample}_direction-{direction}.bw" params: filterOpt=bamcov_filter_opts threads: @@ -73,17 +98,17 @@ rule three_prime_seq_bam_cov: def filterbw_which_bed(wc): - return ("three_prime_seq/polyA.bed" if wildcards.direction == "forward" + return ("three_prime_seq/polyA.bed" if wc.direction == "forward" else "three_prime_seq/polyT.bed") rule filterBW: input: - bigwig="three_prime_seq/{sample}_{direction}.bw", + bigwig="three_prime_seq/raw/{sample}_direction-{direction}.bw", bed=filterbw_which_bed output: - "three_prime_seq/{sample}_{direction}.filtered.bw" + "three_prime_seq/filtered/{sample}_direction-{direction}.bw" log: - stdout="three_prime_seq/logs/{sample}_{direction}.stdout, + stdout="three_prime_seq/logs/{sample}_{direction}.stdout", stderr="three_prime_seq/logs/{sample}_{direction}.stderr" params: script=(tools_dir / "three_prime_seq" / "filterBW.py") @@ -94,44 +119,111 @@ rule filterBW: "> {log.stdout} " "2> {log.stderr} " -# previously done by hand -# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt -rule count_read_ends: - input: - expand("three_prime_seq/{{sample}}_{direction}.filtered.bw", - direction=["forward", "reverse"]) - output: - ids="three_prime_seq/{sample}_uniqIDs.txt", - counts="three_prime_seq/{sample}_uniqcounts.txt" - params: - script=(tools_dir / "three_prime_seq / "countReadEnds.py") - shell: - "{params.script} {input} {output}" # Associate signal with each gene (flank by some amount) # Note how many bases couldn't be associated with any gene rule geneAssociation: input: - expand("three_prime_seq/{{sample}}_{direction}.filtered.bw", - direction=["forward", "reverse"]) + expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw", direction=["forward", "reverse"]) output: "three_prime_seq/{sample}_polyA_annotation.txt" threads: 16 params: - extend=config["geneAssociation"]["extend"], # 2000 + extension=config["geneAssociation"]["extend"], # 500 gtf=genes_gtf, script=(tools_dir / "three_prime_seq" / "signal2gene.py") conda: CONDA_SHARED_ENV shell: - "{params.script} --extend {params.extend} " + "{params.script} --extend {params.extension} " "--threads {threads} " "{input} {params.gtf} {output} " +# return list of replicates for each condition output by geneAssociation +def find_replicates_cluster_pas(wc): + _samples = samples_from_condition(str(wc.condition)) + return expand("three_prime_seq/{sample}_polyA_annotation.txt", sample=_samples) + +# here we need to merge replicates of geneAssociation -> see +# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/combined_cluster_andGrep_commands +# also sort by start position +rule preprocess_cluster_pas: + input: + find_replicates_cluster_pas + output: + temp("three_prime_seq/tmp/condition-{condition}_preprocessed.txt") + shell: + "cat {input} | " + "sed '/^[ ]*Chrom/ d' | " + "sort -k1,1 -k2,2n " + "> {output} " + +rule clusterPAS: + # input is preprocessed output from geneAssoc + input: + "three_prime_seq/tmp/condition-{condition}_preprocessed.txt" + output: + temp("three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt") + conda: + CONDA_SHARED_ENV + params: + script=(tools_dir / "three_prime_seq" / "clusterPAS.py"), + windowsize=config["clusterPAS"]["window"], # 15 + shell: + "python {params.script} {input} {output}" + + +# awk command: remove entries with multiple genes in 4th column (must be unambiguous) +# python script: add "_1", "_2", to each cluster label (4th column) to make each +# unique for each genomic position +rule postprocess_cluster_pas: + input: + "three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt" + output: + "three_prime_seq/db/condition-{condition}_clusterPAS_db.txt" + conda: + CONDA_SHARED_ENV + params: + dedup_script=(tools_dir / "three_prime_seq" / "dedup_clusterPAS.py") + shell: + """ + grep -v "5'UTR" {input} | \ + grep -v "CDS" | \ + grep -v "exon" | \ + awk '!($4 ~ /[,]/)' | \ + python {params.dedup_script} > {output} + """ + + +# return clusterPAS db for corresponding condition given sample +def find_cluster_pas_db(wc): + condition = condition_from_sample(str(wc.sample)) + return ("three_prime_seq/db/condition-{condition}_clusterPAS_db.txt" + .format(condition=condition)) + + +# previously done by hand +# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt +# each line of output of clusterPAS must be unique'd +rule count_read_ends: + input: + bws=expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw", + direction=["forward", "reverse"]), + bed=find_cluster_pas_db, # this is the "database" of PAS sites + output: + counts="three_prime_seq/{sample}_uniqcounts.txt" + wildcard_constraints: + sample="[^\/]+" # no / + conda: + CONDA_SHARED_ENV + params: + script=(tools_dir / "three_prime_seq" / "countReadEnds.py") + shell: + "python {params.script} {input} {output}" + -## Need a better regex filter ".+(?