major update, including dedup_clusterPAS.py for postprocessing and adding pyBigWig to shared YAML (pushed to nodes already)
David Koppstein committed Apr 17, 2023
1 parent c0a1fe9 commit 3beb258
Showing 7 changed files with 211 additions and 61 deletions.
5 changes: 2 additions & 3 deletions snakePipes/common_functions.py
@@ -632,9 +632,8 @@ def commonYAMLandLogs(baseDir, workflowDir, defaults, args, callingScript):
args.snakemakeOptions += " --notemp"

snakemake_cmd = """
TMPDIR={tempDir} PYTHONNOUSERSITE=True {snakemake} {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
""".format(snakemake=os.path.join(snakemake_path, "snakemake"),
latency_wait=cluster_config["snakemake_latency_wait"],
TMPDIR={tempDir} PYTHONNOUSERSITE=True snakemake {snakemakeOptions} --latency-wait {latency_wait} --snakefile {snakefile} --jobs {maxJobs} --directory {workingdir} --configfile {configFile} --keep-going --use-conda --conda-prefix {condaEnvDir}
""".format(latency_wait=cluster_config["snakemake_latency_wait"],
snakefile=os.path.join(workflowDir, "Snakefile"),
maxJobs=args.maxJobs,
workingdir=args.workingdir,
2 changes: 2 additions & 0 deletions snakePipes/shared/rules/envs/shared.yaml
@@ -15,3 +15,5 @@ dependencies:
- multiqc = 1.12
- fastp = 0.23.2
- umi_tools = 1.1.2
- pybigwig = 0.3.18
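
A minimal sketch of the kind of pyBigWig call the new three_prime_seq tools (filterBW.py, countReadEnds.py) presumably make against the bigWig tracks produced below. The filename and interval are illustrative only, not taken from the pipeline:

import pyBigWig

# Open a bigWig coverage track (hypothetical filename).
bw = pyBigWig.open("coverage.bw")

# Chromosome names and lengths from the header.
print(bw.chroms())

# Per-base values over an interval; positions with no data come back as NaN.
values = bw.values("chr2L", 0, 100)

bw.close()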

197 changes: 146 additions & 51 deletions snakePipes/shared/rules/three_prime_seq.snakefile
@@ -1,38 +1,62 @@
# adapted from Andrew Rezansoff's 3' seq pipeline tools
# /data/hilgers/group/rezansoff/3seq_pipeline_tools/

from pathlib import Path
import pandas as pd

# prevent wildcards from picking up directories
wildcard_constraints: sample="[^(\/)]+"

# read in sampleSheet metadata to merge replicates for preprocess_cluster_pas
sample_metadata = pd.read_table(sampleSheet, index_col=None)

# given a condition, return a list of all samples associated with it
# check that there are not overlapping samples
def samples_from_condition(condition):
sub_df = sample_metadata[sample_metadata['condition'] == condition]
assert(set(sub_df['name']) == set(sub_df['name'].unique()))
return list(sub_df['name'])

# the opposite of the above
def condition_from_sample(sample):
sub_df = sample_metadata[sample_metadata['name'] == sample]
return sub_df['condition'].iloc[0]
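
# For illustration only (not part of the pipeline): with a toy sample sheet,
# the two helpers above behave as follows. Sample and condition names here
# are made up.
#
# >>> sample_metadata = pd.DataFrame({"name": ["wt_rep1", "wt_rep2", "mut_rep1"],
# ...                                 "condition": ["WT", "WT", "mutant"]})
# >>> samples_from_condition("WT")
# ['wt_rep1', 'wt_rep2']
# >>> condition_from_sample("mut_rep1")
# 'mutant'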

# TODO:
# 1. pass three_prime_seq fastp arguments to fastp
# 2. specify all output in mRNA-seq snakefile -> how to pass --three-prime-seq?
# 3. check why cmatrix_filtered is commented out
# 3. check why cmatrix_filtered is commented out - OK
# 5. re-implement clusterPAS?
# 6. implement filtering of column 4 of the geneAssociation output - things with multiple hits (i.e. commas) should be filtered out [x]
# possibly assign cluster to "next" gene annotation (i.e. the closest?)
# this would be in signal2gene.py
# 7. assign unique IDs with _0, _1 to 4th column in clusterPAS output in 5'->3' order to create unique
# example is /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric.txt ->
# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/AllSamples_clustered_strict1_fromSortedNumeric_uniqIDs.txt
# this will be run by countReadEnds
# then possibly DESeq on countReadEnds output
# understand cmatrix steps?

tools_dir = Path(maindir) / "shared" / "tools"

# trimming done using STAR and fastp

rule all_three_prime_samples:
input:

rule clusterPAS:
input:
"three_prime_seq/SampleAll.txt"
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "clusterPAS.py"),
windowsize=config["clusterPASWindowSize"], # 10
# rule clusterPAS:
# input:
# "three_prime_seq/SampleAll.txt"
# conda:
# CONDA_SHARED_ENV


rule polyAT:
input:
twobit=genome_2bit,
two_bit=genome_2bit,
bed=genes_bed
output:
"three_prime_seq/poly{base}.bed"
params:
script=(tools_dir / "three_prime_seq" / "findSitesMM.py"),
minlength=config["polyAT"]["minlength"],
mindistance=config["polyAT"]["mindistance"],
extend=config["polyAT"]["extend"],
extension=config["polyAT"]["extend"],
windowlength=config["polyAT"]["windowlength"],
percbase=config["polyAT"]["percbase"],
conda:
@@ -43,13 +67,14 @@ rule polyAT:
"--bed {input.bed} "
"--minLength {params.minlength} "
"--base {wildcards.base} "
"--extend {params.extend} "
"--minDistance {params.minDistance} "
"--windowLength {params.windowLength} "
"--percBase {params.percBase} "
"--extend {params.extension} "
"--minDistance {params.mindistance} "
"--windowLength {params.windowlength} "
"--percBase {params.percbase} "

# exclude second in pair, only 5' signal of R1
def bamcov_filter_opts(wc):
s = ("--filterRNAstrand {direction} --samFlagExclude 128
s = ("--filterRNAstrand {direction} --samFlagExclude 128 "
"--Offset 1 -bs 1 --skipNAs")
return s.format(direction=wc.direction)
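
# For illustration only: the rendered options per direction wildcard, with
# types.SimpleNamespace standing in for Snakemake's wildcards object.
# --samFlagExclude 128 drops second-in-pair reads and --Offset 1 keeps only
# each read's first (5') base, matching the comment above.
#
# >>> from types import SimpleNamespace
# >>> bamcov_filter_opts(SimpleNamespace(direction="forward"))
# '--filterRNAstrand forward --samFlagExclude 128 --Offset 1 -bs 1 --skipNAs'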

@@ -58,7 +83,7 @@ rule three_prime_seq_bam_cov:
bam=aligner + "/{sample}.sorted.bam",
bai=aligner + "/{sample}.sorted.bam.bai"
output:
"three_prime_seq/{sample}_{direction}.bw"
"three_prime_seq/raw/{sample}_direction-{direction}.bw"
params:
filterOpt=bamcov_filter_opts
threads:
@@ -73,17 +98,17 @@ rule three_prime_seq_bam_cov:


def filterbw_which_bed(wc):
return ("three_prime_seq/polyA.bed" if wildcards.direction == "forward"
return ("three_prime_seq/polyA.bed" if wc.direction == "forward"
else "three_prime_seq/polyT.bed")

rule filterBW:
input:
bigwig="three_prime_seq/{sample}_{direction}.bw",
bigwig="three_prime_seq/raw/{sample}_direction-{direction}.bw",
bed=filterbw_which_bed
output:
"three_prime_seq/{sample}_{direction}.filtered.bw"
"three_prime_seq/filtered/{sample}_direction-{direction}.bw"
log:
stdout="three_prime_seq/logs/{sample}_{direction}.stdout,
stdout="three_prime_seq/logs/{sample}_{direction}.stdout",
stderr="three_prime_seq/logs/{sample}_{direction}.stderr"
params:
script=(tools_dir / "three_prime_seq" / "filterBW.py")
@@ -94,55 +119,122 @@ rule filterBW:
"> {log.stdout} "
"2> {log.stderr} "

# previously done by hand
# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt
rule count_read_ends:
input:
expand("three_prime_seq/{{sample}}_{direction}.filtered.bw",
direction=["forward", "reverse"])
output:
ids="three_prime_seq/{sample}_uniqIDs.txt",
counts="three_prime_seq/{sample}_uniqcounts.txt"
params:
script=(tools_dir / "three_prime_seq / "countReadEnds.py")
shell:
"{params.script} {input} {output}"

# Associate signal with each gene (flank by some amount)
# Note how many bases couldn't be associated with any gene
rule geneAssociation:
input:
expand("three_prime_seq/{{sample}}_{direction}.filtered.bw",
direction=["forward", "reverse"])
expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw", direction=["forward", "reverse"])
output: "three_prime_seq/{sample}_polyA_annotation.txt"
threads: 16
params:
extend=config["geneAssociation"]["extend"], # 2000
extension=config["geneAssociation"]["extend"], # 500
gtf=genes_gtf,
script=(tools_dir / "three_prime_seq" / "signal2gene.py")
conda:
CONDA_SHARED_ENV
shell:
"{params.script} --extend {params.extend} "
"{params.script} --extend {params.extension} "
"--threads {threads} "
"{input} {params.gtf} {output} "

# return list of replicates for each condition output by geneAssociation
def find_replicates_cluster_pas(wc):
_samples = samples_from_condition(str(wc.condition))
return expand("three_prime_seq/{sample}_polyA_annotation.txt", sample=_samples)

# here we need to merge replicates of geneAssociation -> see
# /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/polyA_annotation_polysome2K/combined_cluster_andGrep_commands
# also sort by start position
rule preprocess_cluster_pas:
input:
find_replicates_cluster_pas
output:
temp("three_prime_seq/tmp/condition-{condition}_preprocessed.txt")
shell:
"cat {input} | "
"sed '/^[ ]*Chrom/ d' | "
"sort -k1,1 -k2,2n "
"> {output} "

rule clusterPAS:
# input is preprocessed output from geneAssoc
input:
"three_prime_seq/tmp/condition-{condition}_preprocessed.txt"
output:
temp("three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt")
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "clusterPAS.py"),
windowsize=config["clusterPAS"]["window"], # 15
shell:
"python {params.script} {input} {output}"


# awk command: remove entries with multiple genes in 4th column (must be unambiguous)
# python script: add "_1", "_2", to each cluster label (4th column) to make each
# unique for each genomic position
rule postprocess_cluster_pas:
input:
"three_prime_seq/tmp/condition-{condition}_clusterPAS_tmpdb.txt"
output:
"three_prime_seq/db/condition-{condition}_clusterPAS_db.txt"
conda:
CONDA_SHARED_ENV
params:
dedup_script=(tools_dir / "three_prime_seq" / "dedup_clusterPAS.py")
shell:
"""
grep -v "5'UTR" {input} | \
grep -v "CDS" | \
grep -v "exon" | \
awk '!($4 ~ /[,]/)' | \
python {params.dedup_script} > {output}
"""


# return clusterPAS db for corresponding condition given sample
def find_cluster_pas_db(wc):
condition = condition_from_sample(str(wc.sample))
return ("three_prime_seq/db/condition-{condition}_clusterPAS_db.txt"
.format(condition=condition))


# previously done by hand
# e.g. /data/hilgers/group2/rezansoff/sakshiProj/2507_3seq/countReadEnds_commands_real.txt
# each line of output of clusterPAS must be unique'd
rule count_read_ends:
input:
bws=expand("three_prime_seq/filtered/{{sample}}_direction-{direction}.bw",
direction=["forward", "reverse"]),
bed=find_cluster_pas_db, # this is the "database" of PAS sites
output:
counts="three_prime_seq/{sample}_uniqcounts.txt"
wildcard_constraints:
sample="[^\/]+" # no /
conda:
CONDA_SHARED_ENV
params:
script=(tools_dir / "three_prime_seq" / "countReadEnds.py")
shell:
"python {params.script} {input} {output}"


## Need a better regex filter ".+(?<!\.filtered)"
rule cmatrix_raw:
input:
expand("three_prime_seq/{sample}_{{direction}}.bw", sample = samples)
expand("three_prime_seq/raw/{sample}_direction-{{direction}}.bw", sample=samples)
output:
temp("three_prime_seq/cmatrix_raw_direction-{direction}.mat.gz")
threads:
16
conda:
CONDA_SHARED_ENV
params:
upstream=config["geneAssociation"]["upstream"], #500
downstream=config["geneAssociation"]["downstream"], #500
upstream=config["cmatrix_raw"]["upstream"], #500
downstream=config["cmatrix_raw"]["downstream"], #500
labels=samples,
bed=genes_bed
bed=genes_bed,
shell:
"computeMatrix scale-regions "
"-S {input} "
@@ -156,11 +248,12 @@ rule cmatrix_raw:
"-bs 5 "
"-o {output} "


rule cmatrix_filtered:
input:
"three_prime_seq/cmatrix_raw_direction-{direction}.mat.gz"
output:
temp("three_prime_seq/cmatrix_raw_direction-{direction}.filtered.mat.gz")
temp("three_prime_seq/cmatrix_filtered_direction-{direction}.mat.gz")
threads: 8
params:
strand=lambda wc: "+" if wc.direction == "forward" else "-"
@@ -172,9 +265,10 @@ rule cmatrix_filtered:
"-o {output} "
"--strand '{params.strand}' "


rule merge_matrix:
input:
expand("three_prime_seq/cmatrix_raw_direction-{direction}.filtered.mat.gz",
expand("three_prime_seq/cmatrix_filtered_direction-{direction}.mat.gz",
direction=["forward", "reverse"])
output:
"three_prime_seq/combined_polyA.mat.gz"
@@ -192,4 +286,5 @@ rule heatmap:
conda:
CONDA_SHARED_ENV
shell:
"plotProfile -m {input} -o {output}"
"plotProfile -m {input} -o {output}"

38 changes: 38 additions & 0 deletions snakePipes/shared/tools/three_prime_seq/dedup_clusterPAS.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python

import pandas as pd
import sys

"""
Read in post-processed output of clusterPAS from STDIN, add header,
deduplicate the Gene column (4th), write to STDOUT.
"""


HEADERS = [
"Chromosome",
"Start",
"End",
"Gene",
"Counts",
"Strand",
"Annotation",
"Summit",
]


def dedup(sub_df):
new_ids = ["_".join([str(gene), str(i)]) for (i, gene) in enumerate(sub_df['Gene'])]
sub_df['Gene'] = new_ids
return sub_df


def main():
df = pd.read_table(sys.stdin, index_col=None, header=None)
df.columns = HEADERS
df = df.groupby("Gene").apply(dedup)
df.to_csv(sys.stdout, sep="\t", index=False)


if __name__ == "__main__":
main()
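
For a concrete sense of the dedup step above: within each Gene group, labels get a positional suffix in input order, so a gene that appears twice becomes geneA_0 and geneA_1. A self-contained toy run of the same logic (values made up; group_keys=False is used here only to keep a flat index for display):

import pandas as pd

df = pd.DataFrame({"Gene": ["geneA", "geneA", "geneB"],
                   "Counts": [10, 4, 7]})

def dedup(sub_df):
    sub_df["Gene"] = ["_".join([str(g), str(i)])
                      for (i, g) in enumerate(sub_df["Gene"])]
    return sub_df

print(df.groupby("Gene", group_keys=False).apply(dedup))
#       Gene  Counts
# 0  geneA_0      10
# 1  geneA_1       4
# 2  geneB_0       7
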
12 changes: 9 additions & 3 deletions snakePipes/workflows/mRNA-seq/Snakefile
@@ -111,6 +111,10 @@ else:
## non-allelic featureCounts
include: os.path.join(maindir, "shared", "rules", "featureCounts.snakefile")

# Three Prime Sequencing mode
if "threePrimeSeq" in mode:
include: os.path.join(maindir, "shared", "rules", "three_prime_seq.snakefile")

##Genomic_contamination
include: os.path.join(maindir, "shared", "rules", "GenomicContamination.snakefile")

@@ -287,11 +291,13 @@ def run_deepTools_qc():


def run_threePrimeSeq():
file_list = []
if "threePrimeSeq" in mode:
file_list = [
expand(),
expand(),
file_list += [
"three_prime_seq/combined_polyA.mat.gz",
expand("three_prime_seq/{sample}_uniqcounts.txt", sample=samples),
]
return file_list


def run_GenomicContamination():
