Merge pull request #49 from neherlab/feat/finalize-paper

Feat/finalize paper
neherlab · Dec 13, 2022 · d704cd2 · d704cd2
2 parents a97b7b9 + e8962ca
commit d704cd2
Show file tree

Hide file tree

Showing 9 changed files with 35 additions and 34 deletions.
diff --git a/script/rules/marginalization.smk b/script/rules/marginalization.smk
@@ -31,7 +31,7 @@ rule MG_projection_full_graph:
         "Creating projection graph for species {wildcards.species}"
     input:
         lambda w: expand(
-            "panx_data/{{species}}/fa/{acc}.fa", acc=MG_opt[w.species]["strains"]
+            "panx_results/fasta/{{species}}/{acc}.fa", acc=MG_opt[w.species]["strains"]
         ),
     output:
         "projections/{species}/full/pangraph_{kind}.json",
@@ -51,8 +51,8 @@ rule MG_pairwise_graphs:
     message:
         "Building pairwise graph for strains {wildcards.s1} - {wildcards.s2} ({wildcards.species})"
     input:
-        "panx_data/{species}/fa/{s1}.fa",
-        "panx_data/{species}/fa/{s2}.fa",
+        "panx_results/fasta/{species}/{s1}.fa",
+        "panx_results/fasta/{species}/{s2}.fa",
     output:
         "projections/{species}/pairwise/pangraph_{kind}__{s1}-{s2}.json",
     params:
@@ -85,8 +85,8 @@ rule MG_shared_kmers_pair:
     message:
         "species {wildcards.species} - evaluating the number of shared kmers {wildcards.s1}|{wildcards.s2}"
     input:
-        s1="panx_data/{species}/fa/{s1}.fa",
-        s2="panx_data/{species}/fa/{s2}.fa",
+        s1="panx_results/fasta/{species}/{s1}.fa",
+        s2="panx_results/fasta/{species}/{s2}.fa",
     output:
         temp("projections/{species}/kmers/{s1}-{s2}.json"),
     params:
@@ -128,7 +128,6 @@ rule MG_shared_kmers_summary:
         pd.DataFrame(data).to_csv(output.csv, index=False)
 
 
-
 rule MG_compare_projection_pairwise:
     # Compares the pairwise and the marginalized graph, checking on what fraction of the genome
     # the two agree. Results are saved in a json file

diff --git a/script/rules/panx.smk b/script/rules/panx.smk
@@ -38,8 +38,8 @@ wildcard_constraints:
 # final benchmark plots
 rule PX_benchmark_all:
     input:
-        "panx_data/benchmark/benchmark_compression.csv",
-        "panx_data/benchmark/benchmark_summary.csv",
+        "panx_results/benchmark/benchmark_compression.csv",
+        "panx_results/benchmark/benchmark_summary.csv",
 
 
 # convert genbank files to fasta
@@ -49,7 +49,7 @@ rule PX_gbk_to_fa:
     input:
         "panx_data/{species}/input_GenBank/{acc}.gbk",
     output:
-        "panx_data/{species}/fa/{acc}.fa",
+        "panx_results/fasta/{species}/{acc}.fa",
     conda:
         "../conda_envs/bioinfo_env.yml"
     shell:
@@ -65,10 +65,12 @@ rule PX_build_full_pangraph:
     message:
         "building full pangraph ({wildcards.kind}) for {wildcards.species}"
     input:
-        lambda w: expand("panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]),
+        lambda w: expand(
+            "panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
+        ),
     output:
-        pg="panx_data/{species}/pangraphs/pangraph-{kind}.json",
-        bm="panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
+        pg="panx_results/pangraphs/{species}/pangraph-{kind}.json",
+        bm="panx_results/pangraphs/{species}/benchmark/pangraph-{kind}.txt",
     params:
         opt=lambda w: PX_ker_opt[w.kind],
     conda:
@@ -89,13 +91,13 @@ rule PX_summary_performance_benchmark:
         "Summary of pangraph performances"
     input:
         expand(
-            "panx_data/{species}/pangraphs/benchmark/pangraph-{kind}.txt",
+            rules.PX_build_full_pangraph.output.bm,
             species=PX_species,
             kind=PX_ker_names,
         ),
     output:
-        csv="panx_data/benchmark/benchmark_summary.csv",
-        pdf="panx_data/benchmark/benchmark_summary.pdf",
+        csv="panx_results/benchmark/benchmark_summary.csv",
+        pdf="panx_results/benchmark/benchmark_summary.pdf",
     conda:
         "../conda_envs/bioinfo_env.yml"
     shell:
@@ -113,13 +115,14 @@ rule PX_compression_benchmark:
         "Compression performances for species {wildcards.species}"
     input:
         pang=expand(
-            "panx_data/{{species}}/pangraphs/pangraph-{kind}.json", kind=PX_ker_names
+            "panx_results/pangraphs/{{species}}/pangraph-{kind}.json",
+            kind=PX_ker_names,
         ),
         fa=lambda w: expand(
-            "panx_data/{{species}}/fa/{acc}.fa", acc=PX_accnums[w.species]
+            "panx_results/fasta/{{species}}/{acc}.fa", acc=PX_accnums[w.species]
         ),
     output:
-        json="panx_data/benchmark/{species}/compression.json",
+        json="panx_results/benchmark/{species}/compression.json",
     conda:
         "../conda_envs/bioinfo_env.yml"
     shell:
@@ -135,10 +138,10 @@ rule PX_summary_compression_benchmark:
     message:
         "Summary of compression performances"
     input:
-        expand("panx_data/benchmark/{species}/compression.json", species=PX_species),
+        expand(rules.PX_compression_benchmark.output.json, species=PX_species),
     output:
-        csv="panx_data/benchmark/benchmark_compression.csv",
-        pdf="panx_data/benchmark/benchmark_compression.pdf",
+        csv="panx_results/benchmark/benchmark_compression.csv",
+        pdf="panx_results/benchmark/benchmark_compression.pdf",
     conda:
         "../conda_envs/bioinfo_env.yml"
     shell:
@@ -225,7 +228,7 @@ def IS_select_strains(w):
     s = w.size
     t = w.trial
     strains = PX_IS_strains[s][t]
-    return [f"panx_data/escherichia_coli/fa/{s}.fa" for s in strains]
+    return [f"panx_results/fasta/escherichia_coli/{s}.fa" for s in strains]
 
 
 # rule to build a pangenome graph for the selected set of strains, specified by the size + trial dictionary
@@ -268,9 +271,9 @@ rule PX_IS_extract_stats_full_graph:
     message:
         "Extracting stats for full pangenome graph"
     input:
-        pang="panx_data/escherichia_coli/pangraphs/pangraph-minimap20-std.json",
+        pang="panx_results/pangraphs/escherichia_coli/pangraph-minimap20-std.json",
         fasta=expand(
-            "panx_data/escherichia_coli/fa/{acc}.fa",
+            "panx_results/fasta/escherichia_coli/{acc}.fa",
             acc=PX_IS_allstrains,
         ),
     output:

diff --git a/script/rules/paper_figs.smk b/script/rules/paper_figs.smk
@@ -121,8 +121,8 @@ rule PF_panx_compression_plot:
     message:
         "Creating plots for PanX compression performances"
     input:
-        comp="panx_data/benchmark/benchmark_compression.csv",
-        summ="panx_data/benchmark/benchmark_summary.csv",
+        comp=rules.PX_summary_compression_benchmark.output.csv,
+        summ=rules.PX_summary_performance_benchmark.output.csv,
     output:
         main="figs/paper/panx/main.pdf",
         suppl="figs/paper/panx/suppl.pdf",

diff --git a/script/rules/size_benchmark.smk b/script/rules/size_benchmark.smk
@@ -79,7 +79,7 @@ rule SB_summary_dataframe:
             n=SB_trials,
         ),
     output:
-        "figs/size-benchmark/size_vs_time.csv",
+        "size-benchmark/summary/size_vs_time.csv",
     conda:
         "../conda_envs/bioinfo_env.yml"
     shell:

diff --git a/script/workflow_scripts/compression_benchmark.py b/script/workflow_scripts/compression_benchmark.py
@@ -38,7 +38,7 @@ def single_fasta_stats(fa):
 
 def extract_species(fname):
     """Extract species from the fasta filename input"""
-    return re.match("panx_data/([^/]+)/", fname).group(1)
+    return re.match("panx_results/fasta/([^/]+)/", fname).group(1)
 
 
 def summary_fasta_stats(fas):

diff --git a/script/workflow_scripts/gbk_to_fa.py b/script/workflow_scripts/gbk_to_fa.py
@@ -2,7 +2,6 @@
 # sequence is saved. The fasta record id is the same as the genbank filename.
 # The script checks that the original sequence id is compatible with this name.
 
-from selectors import EpollSelector
 from Bio import SeqIO
 import argparse
 import re

diff --git a/script/workflow_scripts/paper_figs/accuracy_plots.py b/script/workflow_scripts/paper_figs/accuracy_plots.py
@@ -128,7 +128,7 @@ def misplacement_plot(costs, savename, stat):
         df,
         args.pdf_snps,
         kernel_title=titles,
-        fit_max_snps=0.002,
+        fit_max_snps=0.001,
     )
 
     # extract breakpoint misplacement distance, keep only relevant distance,

diff --git a/script/workflow_scripts/plot-accuracy.jl b/script/workflow_scripts/plot-accuracy.jl
@@ -149,10 +149,10 @@ function main(path, destdir, snps_keep)
 
     data = load(path)
     snps_keep = [parse(Float64,s) for s in snps_keep]
-    remove_extra_snps!(data, snps_keep)
     grid = plotgrid(entropy(data)...;  group=group(path))
+    remove_extra_snps!(data, snps_keep)
     cdfs = plotcdfs(accuracy(data)...; group=group(path))
-
+    
     save(base, name) = savefig("$(destdir)/$(base)-$(name).png")
     save(plot, base, name) = savefig(plot, "$(destdir)/$(base)-$(name).png")
 

diff --git a/script/workflow_scripts/summary_benchmark.py b/script/workflow_scripts/summary_benchmark.py
@@ -31,15 +31,15 @@ def parsetime(x):
     "species": str,
     "kind": str,
     "n-isolates": lambda x: len(
-        [y for y in x.split(" ") if y.startswith("panx_data/")]
+        [y for y in x.split(" ") if y.startswith("panx_results/fasta/")]
     ),
     "mem": lambda x: int(x) / 1e6,
     "wall-time": parsetime,
     "usr-time": lambda x: float(x) / (3600),
     "sys-time": lambda x: float(x) / (3600),
     "cpu-percent": int,
     "command": lambda x: " ".join(
-        [y for y in x.split(" ") if not y.startswith("panx_data/")]
+        [y for y in x.split(" ") if not y.startswith("panx_results/fasta/")]
     ),
 }