Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/finalize paper #49

Merged
merged 11 commits into from
Dec 13, 2022
2 changes: 1 addition & 1 deletion script/conda_envs/bioinfo_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,5 +126,5 @@ dependencies:
- zstd=1.5.2=h8a70e8d_2
- pip:
- networkx==2.8.4
- git+https://github.com/mmolari/pypangraph
- git+https://github.com/mmolari/pypangraph@0.1.0
prefix: /scicore/home/neher/molari0000/miniconda3/envs/bioinfo
11 changes: 5 additions & 6 deletions script/rules/marginalization.smk
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rule MG_projection_full_graph:
"Creating projection graph for species {wildcards.species}"
input:
lambda w: expand(
"panx_data/{{species}}/fa/{acc}.fa", acc=MG_opt[w.species]["strains"]
"panx_results/fasta/{{species}}/{acc}.fa", acc=MG_opt[w.species]["strains"]
),
output:
"projections/{species}/full/pangraph_{kind}.json",
Expand All @@ -51,8 +51,8 @@ rule MG_pairwise_graphs:
message:
"Building pairwise graph for strains {wildcards.s1} - {wildcards.s2} ({wildcards.species})"
input:
"panx_data/{species}/fa/{s1}.fa",
"panx_data/{species}/fa/{s2}.fa",
"panx_results/fasta/{species}/{s1}.fa",
"panx_results/fasta/{species}/{s2}.fa",
output:
"projections/{species}/pairwise/pangraph_{kind}__{s1}-{s2}.json",
params:
Expand Down Expand Up @@ -85,8 +85,8 @@ rule MG_shared_kmers_pair:
message:
"species {wildcards.species} - evaluating the number of shared kmers {wildcards.s1}|{wildcards.s2}"
input:
s1="panx_data/{species}/fa/{s1}.fa",
s2="panx_data/{species}/fa/{s2}.fa",
s1="panx_results/fasta/{species}/{s1}.fa",
s2="panx_results/fasta/{species}/{s2}.fa",
output:
temp("projections/{species}/kmers/{s1}-{s2}.json"),
params:
Expand Down Expand Up @@ -128,7 +128,6 @@ rule MG_shared_kmers_summary:
pd.DataFrame(data).to_csv(output.csv, index=False)



rule MG_compare_projection_pairwise:
# Compares the pairwise and the marginalized graph, checking on what fraction of the genome
# the two agree. Results are saved in a json file
Expand Down
39 changes: 21 additions & 18 deletions script/rules/panx.smk
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ wildcard_constraints:
# final benchmark plots
rule PX_benchmark_all:
input:
"panx_data/benchmark/benchmark_compression.csv",
"panx_data/benchmark/benchmark_summary.csv",
"panx_results/benchmark/benchmark_compression.csv",
"panx_results/benchmark/benchmark_summary.csv",


# convert genbank files to fasta
Expand All @@ -49,7 +49,7 @@ rule PX_gbk_to_fa:
input:
"panx_data/{species}/input_GenBank/{acc}.gbk",
output:
"panx_data/{species}/fa/{acc}.fa",
"panx_results/fasta/{species}/{acc}.fa",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -65,10 +65,12 @@ rule PX_build_full_pangraph:
message:
"building full pangraph ({wildcards.kind}) for {wildcards.species}"
input:
lambda w: expand("panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]),
lambda w: expand(
"panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
),
output:
pg="panx_data/{species}/pangraphs/pangraph-{kind}.json",
bm="panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
pg="panx_results/pangraphs/{species}/pangraph-{kind}.json",
bm="panx_results/pangraphs/{species}/benchmark/pangraph-{kind}.txt",
params:
opt=lambda w: PX_ker_opt[w.kind],
conda:
Expand All @@ -89,13 +91,13 @@ rule PX_summary_performance_benchmark:
"Summary of pangraph performances"
input:
expand(
"panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
rules.PX_build_full_pangraph.output.bm,
species=PX_species,
kind=PX_ker_names,
),
output:
csv="panx_data/benchmark/benchmark_summary.csv",
pdf="panx_data/benchmark/benchmark_summary.pdf",
csv="panx_results/benchmark/benchmark_summary.csv",
pdf="panx_results/benchmark/benchmark_summary.pdf",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -113,13 +115,14 @@ rule PX_compression_benchmark:
"Compression performances for species {wildcards.species}"
input:
pang=expand(
"panx_data/{{species}}/pangraphs/pangraph-{kind}.json", kind=PX_ker_names
"panx_results/pangraphs/{{species}}/pangraph-{kind}.json",
kind=PX_ker_names,
),
fa=lambda w: expand(
"panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]
"panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
),
output:
json="panx_data/benchmark/{species}/compression.json",
json="panx_results/benchmark/{species}/compression.json",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand All @@ -135,10 +138,10 @@ rule PX_summary_compression_benchmark:
message:
"Summary of compression performances"
input:
expand("panx_data/benchmark/{species}/compression.json", species=PX_species),
expand(rules.PX_compression_benchmark.output.json, species=PX_species),
output:
csv="panx_data/benchmark/benchmark_compression.csv",
pdf="panx_data/benchmark/benchmark_compression.pdf",
csv="panx_results/benchmark/benchmark_compression.csv",
pdf="panx_results/benchmark/benchmark_compression.pdf",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand Down Expand Up @@ -225,7 +228,7 @@ def IS_select_strains(w):
s = w.size
t = w.trial
strains = PX_IS_strains[s][t]
return [f"panx_data/escherichia_coli/fa/{s}.fa" for s in strains]
return [f"panx_results/fasta/escherichia_coli/{s}.fa" for s in strains]


# rule to build a pangenome graph for the selected set of strains, specified by the size + trial dictionary
Expand Down Expand Up @@ -268,9 +271,9 @@ rule PX_IS_extract_stats_full_graph:
message:
"Extracting stats for full pangenome graph"
input:
pang="panx_data/escherichia_coli/pangraphs/pangraph-minimap20-std.json",
pang="panx_results/pangraphs/escherichia_coli/pangraph-minimap20-std.json",
fasta=expand(
"panx_data/escherichia_coli/fa/{acc}.fa",
"panx_results/fasta/escherichia_coli/{acc}.fa",
acc=PX_IS_allstrains,
),
output:
Expand Down
4 changes: 2 additions & 2 deletions script/rules/paper_figs.smk
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ rule PF_panx_compression_plot:
message:
"Creating plots for PanX compression performances"
input:
comp="panx_data/benchmark/benchmark_compression.csv",
summ="panx_data/benchmark/benchmark_summary.csv",
comp=rules.PX_summary_compression_benchmark.output.csv,
summ=rules.PX_summary_performance_benchmark.output.csv,
output:
main="figs/paper/panx/main.pdf",
suppl="figs/paper/panx/suppl.pdf",
Expand Down
2 changes: 1 addition & 1 deletion script/rules/size_benchmark.smk
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ rule SB_summary_dataframe:
n=SB_trials,
),
output:
"figs/size-benchmark/size_vs_time.csv",
"size-benchmark/summary/size_vs_time.csv",
conda:
"../conda_envs/bioinfo_env.yml"
shell:
Expand Down
2 changes: 1 addition & 1 deletion script/workflow_scripts/compression_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def single_fasta_stats(fa):

def extract_species(fname):
"""Extract species from the fasta filename input"""
return re.match("panx_data/([^/]+)/", fname).group(1)
return re.match("panx_results/fasta/([^/]+)/", fname).group(1)


def summary_fasta_stats(fas):
Expand Down
1 change: 0 additions & 1 deletion script/workflow_scripts/gbk_to_fa.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
# sequence is saved. The fasta record id is the same as the genbank filename.
# The script checks that the original sequence id is compatible with this name.

from selectors import EpollSelector
from Bio import SeqIO
import argparse
import re
Expand Down
2 changes: 1 addition & 1 deletion script/workflow_scripts/paper_figs/accuracy_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def misplacement_plot(costs, savename, stat):
df,
args.pdf_snps,
kernel_title=titles,
fit_max_snps=0.002,
fit_max_snps=0.001,
)

# extract breakpoint misplacement distance, keep only relevant distance,
Expand Down
4 changes: 2 additions & 2 deletions script/workflow_scripts/plot-accuracy.jl
Original file line number Diff line number Diff line change
Expand Up @@ -149,10 +149,10 @@ function main(path, destdir, snps_keep)

data = load(path)
snps_keep = [parse(Float64,s) for s in snps_keep]
remove_extra_snps!(data, snps_keep)
grid = plotgrid(entropy(data)...; group=group(path))
remove_extra_snps!(data, snps_keep)
cdfs = plotcdfs(accuracy(data)...; group=group(path))

save(base, name) = savefig("$(destdir)/$(base)-$(name).png")
save(plot, base, name) = savefig(plot, "$(destdir)/$(base)-$(name).png")

Expand Down
4 changes: 2 additions & 2 deletions script/workflow_scripts/summary_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ def parsetime(x):
"species": str,
"kind": str,
"n-isolates": lambda x: len(
[y for y in x.split(" ") if y.startswith("panx_data/")]
[y for y in x.split(" ") if y.startswith("panx_results/fasta/")]
),
"mem": lambda x: int(x) / 1e6,
"wall-time": parsetime,
"usr-time": lambda x: float(x) / (3600),
"sys-time": lambda x: float(x) / (3600),
"cpu-percent": int,
"command": lambda x: " ".join(
[y for y in x.split(" ") if not y.startswith("panx_data/")]
[y for y in x.split(" ") if not y.startswith("panx_results/fasta/")]
),
}

Expand Down