Skip to content

Commit

Permalink
Merge pull request #49 from neherlab/feat/finalize-paper
Browse files Browse the repository at this point in the history
Feat/finalize paper
  • Loading branch information
mmolari committed Dec 13, 2022
2 parents a97b7b9 + e8962ca commit d704cd2
Show file tree
Hide file tree
Showing 9 changed files with 35 additions and 34 deletions.
11 changes: 5 additions & 6 deletions script/rules/marginalization.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rule MG_projection_full_graph:
"Creating projection graph for species {wildcards.species}"
input:
lambda w: expand(
"panx_data/{{species}}/fa/{acc}.fa", acc=MG_opt[w.species]["strains"]
"panx_results/fasta/{{species}}/{acc}.fa", acc=MG_opt[w.species]["strains"]
),
output:
"projections/{species}/full/pangraph_{kind}.json",
Expand All @@ -51,8 +51,8 @@ rule MG_pairwise_graphs:
message:
"Building pairwise graph for strains {wildcards.s1} - {wildcards.s2} ({wildcards.species})"
input:
"panx_data/{species}/fa/{s1}.fa",
"panx_data/{species}/fa/{s2}.fa",
"panx_results/fasta/{species}/{s1}.fa",
"panx_results/fasta/{species}/{s2}.fa",
output:
"projections/{species}/pairwise/pangraph_{kind}__{s1}-{s2}.json",
params:
Expand Down Expand Up @@ -85,8 +85,8 @@ rule MG_shared_kmers_pair:
message:
"species {wildcards.species} - evaluating the number of shared kmers {wildcards.s1}|{wildcards.s2}"
input:
s1="panx_data/{species}/fa/{s1}.fa",
s2="panx_data/{species}/fa/{s2}.fa",
s1="panx_results/fasta/{species}/{s1}.fa",
s2="panx_results/fasta/{species}/{s2}.fa",
output:
temp("projections/{species}/kmers/{s1}-{s2}.json"),
params:
Expand Down Expand Up @@ -128,7 +128,6 @@ rule MG_shared_kmers_summary:
pd.DataFrame(data).to_csv(output.csv, index=False)



rule MG_compare_projection_pairwise:
# Compares the pairwise and the marginalized graph, checking on what fraction of the genome
# the two agree. Results are saved in a json file
Expand Down
39 changes: 21 additions & 18 deletions script/rules/panx.smk
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ wildcard_constraints:
# final benchmark plots
rule PX_benchmark_all:
input:
"panx_data/benchmark/benchmark_compression.csv",
"panx_data/benchmark/benchmark_summary.csv",
"panx_results/benchmark/benchmark_compression.csv",
"panx_results/benchmark/benchmark_summary.csv",


# convert genbank files to fasta
Expand All @@ -49,7 +49,7 @@ rule PX_gbk_to_fa:
input:
"panx_data/{species}/input_GenBank/{acc}.gbk",
output:
"panx_data/{species}/fa/{acc}.fa",
"panx_results/fasta/{species}/{acc}.fa",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -65,10 +65,12 @@ rule PX_build_full_pangraph:
message:
"building full pangraph ({wildcards.kind}) for {wildcards.species}"
input:
lambda w: expand("panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]),
lambda w: expand(
"panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
),
output:
pg="panx_data/{species}/pangraphs/pangraph-{kind}.json",
bm="panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
pg="panx_results/pangraphs/{species}/pangraph-{kind}.json",
bm="panx_results/pangraphs/{species}/benchmark/pangraph-{kind}.txt",
params:
opt=lambda w: PX_ker_opt[w.kind],
conda:
Expand All @@ -89,13 +91,13 @@ rule PX_summary_performance_benchmark:
"Summary of pangraph performances"
input:
expand(
"panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
rules.PX_build_full_pangraph.output.bm,
species=PX_species,
kind=PX_ker_names,
),
output:
csv="panx_data/benchmark/benchmark_summary.csv",
pdf="panx_data/benchmark/benchmark_summary.pdf",
csv="panx_results/benchmark/benchmark_summary.csv",
pdf="panx_results/benchmark/benchmark_summary.pdf",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -113,13 +115,14 @@ rule PX_compression_benchmark:
"Compression performances for species {wildcards.species}"
input:
pang=expand(
"panx_data/{{species}}/pangraphs/pangraph-{kind}.json", kind=PX_ker_names
"panx_results/pangraphs/{{species}}/pangraph-{kind}.json",
kind=PX_ker_names,
),
fa=lambda w: expand(
"panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]
"panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
),
output:
json="panx_data/benchmark/{species}/compression.json",
json="panx_results/benchmark/{species}/compression.json",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -135,10 +138,10 @@ rule PX_summary_compression_benchmark:
message:
"Summary of compression performances"
input:
expand("panx_data/benchmark/{species}/compression.json", species=PX_species),
expand(rules.PX_compression_benchmark.output.json, species=PX_species),
output:
csv="panx_data/benchmark/benchmark_compression.csv",
pdf="panx_data/benchmark/benchmark_compression.pdf",
csv="panx_results/benchmark/benchmark_compression.csv",
pdf="panx_results/benchmark/benchmark_compression.pdf",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand Down Expand Up @@ -225,7 +228,7 @@ def IS_select_strains(w):
s = w.size
t = w.trial
strains = PX_IS_strains[s][t]
return [f"panx_data/escherichia_coli/fa/{s}.fa" for s in strains]
return [f"panx_results/fasta/escherichia_coli/{s}.fa" for s in strains]


# rule to build a pangenome graph for the selected set of strains, specified by the size + trial dictionary
Expand Down Expand Up @@ -268,9 +271,9 @@ rule PX_IS_extract_stats_full_graph:
message:
"Extracting stats for full pangenome graph"
input:
pang="panx_data/escherichia_coli/pangraphs/pangraph-minimap20-std.json",
pang="panx_results/pangraphs/escherichia_coli/pangraph-minimap20-std.json",
fasta=expand(
"panx_data/escherichia_coli/fa/{acc}.fa",
"panx_results/fasta/escherichia_coli/{acc}.fa",
acc=PX_IS_allstrains,
),
output:
Expand Down
4 changes: 2 additions & 2 deletions script/rules/paper_figs.smk
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ rule PF_panx_compression_plot:
message:
"Creating plots for PanX compression performances"
input:
comp="panx_data/benchmark/benchmark_compression.csv",
summ="panx_data/benchmark/benchmark_summary.csv",
comp=rules.PX_summary_compression_benchmark.output.csv,
summ=rules.PX_summary_performance_benchmark.output.csv,
output:
main="figs/paper/panx/main.pdf",
suppl="figs/paper/panx/suppl.pdf",
Expand Down
2 changes: 1 addition & 1 deletion script/rules/size_benchmark.smk
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ rule SB_summary_dataframe:
n=SB_trials,
),
output:
"figs/size-benchmark/size_vs_time.csv",
"size-benchmark/summary/size_vs_time.csv",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand Down
2 changes: 1 addition & 1 deletion script/workflow_scripts/compression_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def single_fasta_stats(fa):

def extract_species(fname):
"""Extract species from the fasta filename input"""
return re.match("panx_data/([^/]+)/", fname).group(1)
return re.match("panx_results/fasta/([^/]+)/", fname).group(1)


def summary_fasta_stats(fas):
Expand Down
1 change: 0 additions & 1 deletion script/workflow_scripts/gbk_to_fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# sequence is saved. The fasta record id is the same as the genbank filename.
# The script checks that the original sequence id is compatible with this name.

from selectors import EpollSelector
from Bio import SeqIO
import argparse
import re
Expand Down
2 changes: 1 addition & 1 deletion script/workflow_scripts/paper_figs/accuracy_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def misplacement_plot(costs, savename, stat):
df,
args.pdf_snps,
kernel_title=titles,
fit_max_snps=0.002,
fit_max_snps=0.001,
)

# extract breakpoint misplacement distance, keep only relevant distance,
Expand Down
4 changes: 2 additions & 2 deletions script/workflow_scripts/plot-accuracy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,10 @@ function main(path, destdir, snps_keep)

data = load(path)
snps_keep = [parse(Float64,s) for s in snps_keep]
remove_extra_snps!(data, snps_keep)
grid = plotgrid(entropy(data)...; group=group(path))
remove_extra_snps!(data, snps_keep)
cdfs = plotcdfs(accuracy(data)...; group=group(path))

save(base, name) = savefig("$(destdir)/$(base)-$(name).png")
save(plot, base, name) = savefig(plot, "$(destdir)/$(base)-$(name).png")

Expand Down
4 changes: 2 additions & 2 deletions script/workflow_scripts/summary_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ def parsetime(x):
"species": str,
"kind": str,
"n-isolates": lambda x: len(
[y for y in x.split(" ") if y.startswith("panx_data/")]
[y for y in x.split(" ") if y.startswith("panx_results/fasta/")]
),
"mem": lambda x: int(x) / 1e6,
"wall-time": parsetime,
"usr-time": lambda x: float(x) / (3600),
"sys-time": lambda x: float(x) / (3600),
"cpu-percent": int,
"command": lambda x: " ".join(
[y for y in x.split(" ") if not y.startswith("panx_data/")]
[y for y in x.split(" ") if not y.startswith("panx_results/fasta/")]
),
}

Expand Down

0 comments on commit d704cd2

Please sign in to comment.