In [2]:
import pandas as pd
import altair as alt
import glob
import os
from humanfriendly import parse_size

In [3]:
def convert_to_timedelta(duration_str, unit="m"):
    """Convert a duration string such as ``"1h 5m 30s"`` into a number.

    The string is split on whitespace and every token is read as a number
    followed by one of the suffixes ``d``, ``h``, ``m``, ``s`` or ``ms``.
    Tokens without a recognised suffix (for example ``"-"``) contribute
    nothing to the total.

    Args:
        duration_str: Whitespace-separated duration tokens, e.g. ``"2d 3h"``
            or ``"500ms"``.
        unit: Unit of the returned value: ``"d"``, ``"h"``, ``"m"`` (default),
            ``"s"`` or ``"ms"``.

    Returns:
        float: The total duration expressed in ``unit``.

    Raises:
        KeyError: If ``unit`` is not one of the supported suffixes.
        ValueError: If the numeric part of a token cannot be parsed.
    """
    seconds_per = {"d": 86400, "h": 3600, "m": 60, "s": 1, "ms": 0.001}
    # "ms" has to be tried first: a millisecond token also ends with "s" and
    # would otherwise be parsed as seconds with a dangling "m".
    suffixes = ("ms", "d", "h", "m", "s")

    total_seconds = 0.0
    for token in duration_str.split():
        for suffix in suffixes:
            if token.endswith(suffix):
                total_seconds += float(token[: -len(suffix)]) * seconds_per[suffix]
                break
    return total_seconds / seconds_per[unit]

In [4]:
# Root directory of the benchmark run to analyse. Set the MESS_BENCHMARK_RUN
# environment variable to point the notebook at another run without editing
# this cell; the original location is kept as the default.
run = os.environ.get(
    "MESS_BENCHMARK_RUN",
    "/home/fchaaban/asterix/scratch/hdd2/farid/2024_MeSS/benchmark/run1",
)

In [5]:
# Every *.txt file directly under <run>/pipeline_info is treated as an
# execution trace to load.
trace_pattern = os.path.join(run, "pipeline_info", "*.txt")
files = glob.glob(trace_pattern)

In [6]:
# Show which trace files matched the glob pattern.
files

['/home/fchaaban/asterix/scratch/hdd2/farid/2024_MeSS/benchmark/run1/pipeline_info/execution_trace_2024-04-25_17-45-11.txt']

In [7]:
# Read each tab-separated trace file and tag its rows with a "run" label:
# the third-from-last path component, i.e. the directory that contains
# "pipeline_info" for the layout globbed above.
tsvs = [
    pd.read_csv(trace_path, sep="\t").assign(run=trace_path.split("/")[-3])
    for trace_path in files
]
df = pd.concat(tsvs)

In [8]:
# Derive task-level columns from the trace "name" column.
# task_name is the second ":"-separated component of the name; tool is the
# part of task_name before the first "_".
df["task_name"] = df["name"].str.split(":").str[1]
df["tool"] = df["task_name"].str.split("_").str[0]

# Keep only the simulation tasks. na=False drops names without a ":" (whose
# task_name is missing) instead of failing on a NaN mask, and .copy() gives
# an independent frame so the assignments below do not write into a slice.
df = df[df["task_name"].str.contains("SIMULATE", na=False)].copy()

# The second space-separated token of task_name is the genome count wrapped
# in parentheses, e.g. "(5)". regex=False treats "(" and ")" as literal
# characters regardless of the pandas version's default.
df["genomes"] = (
    df["task_name"]
    .str.split(" ")
    .str[1]
    .str.replace("(", "", regex=False)
    .str.replace(")", "", regex=False)
    .astype(int)
)

# Wall-clock ("duration") and execution ("realtime") times, in minutes.
df["duration_min"] = df["duration"].apply(convert_to_timedelta)
df["realtime_min"] = df["realtime"].apply(convert_to_timedelta)

In [9]:
# Replace the raw tool identifiers with versioned display labels.
tool_labels = {"MESS": "MeSS v0.8.0", "CAMISIM": "CAMISIM v1.3"}
df["tool"] = df["tool"].replace(tool_labels)

## Add genome metadata


In [10]:
# Load every per-sample processing table under <run>/mess/*/processing and
# label each table's rows with the number of rows it contains ("genomes").
# NOTE(review): presumably one row per genome, which is what makes the row
# count usable as the merge key later on; confirm against the table schema.
meta_pattern = os.path.join(run, "mess", "*", "processing", "*.tsv")
meta_files = glob.glob(meta_pattern)
meta_tables = [pd.read_csv(meta_path, sep="\t") for meta_path in meta_files]
meta_df = pd.concat([table.assign(genomes=len(table)) for table in meta_tables])

In [11]:
# Collapse the metadata to one row per genome count by summing the size
# columns.
summed_columns = ["total_sequence_length", "number_of_contigs", "bases", "reads"]
meta_df = meta_df.groupby("genomes", as_index=False)[summed_columns].sum()

In [12]:
# Preview the first rows of the metadata totals per genome count.
meta_df.head()

Unnamed: 0,genomes,total_sequence_length,number_of_contigs,bases,reads
0,1,3502975,2,3502975.0,17514.875
1,2,8160866,3,8160866.0,40804.33
2,3,6774179,9,6774179.0,33870.895
3,4,10175797,8,10175797.0,50878.985
4,5,21563198,7,21563198.0,107815.99


In [13]:
# Attach the summed metadata to each task row, matching on the genome count.
# The default inner join drops tasks whose genome count has no metadata row.
df = pd.merge(df, meta_df, on="genomes")

In [14]:
# Order the rows by tool, then by total number of bases.
sort_keys = ["tool", "bases"]
df = df.sort_values(by=sort_keys)

In [15]:
# List the columns available after merging the metadata.
df.columns

Index(['task_id', 'hash', 'native_id', 'name', 'status', 'exit', 'submit',
       'duration', 'realtime', '%cpu', 'peak_rss', 'peak_vmem', 'rchar',
       'wchar', 'run', 'task_name', 'tool', 'genomes', 'duration_min',
       'realtime_min', 'total_sequence_length', 'number_of_contigs', 'bases',
       'reads'],
      dtype='object')

In [17]:
# Derive the resource metrics from the raw trace columns.

# RAM: peak_rss parsed by humanfriendly.parse_size, divided by 10**9.
df["RAM"] = df["peak_rss"].map(parse_size) / 10**9

# "%cpu" is overwritten in place, so it holds strings like "108.8%" on the
# first run and floats afterwards. Going through str keeps the cell
# re-runnable instead of failing on float.replace.
df["%cpu"] = df["%cpu"].astype(str).str.replace("%", "", regex=False).astype(float)

# cpu: the percentage expressed as a fraction (100% -> 1.0).
df["cpu"] = df["%cpu"] / 100

# calc_time: execution time in minutes scaled by the cpu fraction; plotted
# as "CPU time (min)".
df["calc_time"] = df["realtime_min"] * df["cpu"]

In [67]:
# Average the resource metrics per tool.
metric_columns = ["RAM", "calc_time", "cpu"]
mean_df = df.groupby("tool", as_index=False)[metric_columns].mean()

In [68]:
# Display the per-tool means of RAM, calc_time and cpu.
mean_df

Unnamed: 0,tool,RAM,calc_time,cpu
0,CAMISIM v1.3,70.03125,249.048303,1.088688
1,MeSS v0.8.0,4.194575,24.570161,5.719625


In [72]:
# Mean RAM ratio, CAMISIM / MeSS. The operands are the mean_df RAM values
# typed in by hand and rounded to three decimals.
70.031 / 4.195

16.69392133492253

In [73]:
# Mean calc_time ratio, CAMISIM / MeSS. The operands are the mean_df
# calc_time values typed in by hand and rounded to three decimals.
249.048 / 24.570

10.136263736263736

In [74]:
# Mean cpu ratio, MeSS / CAMISIM (note the direction is the reverse of the
# RAM and calc_time ratios). The operands are hand-rounded mean_df cpu values.
5.72 / 1.089

5.252525252525253

In [49]:
# Boxplot of RAM per tool; the colour legend is hidden because the x axis
# already names the tools.
ram_data = df[["tool", "RAM"]]
bp = (
    alt.Chart(ram_data)
    .mark_boxplot(size=90)
    .encode(
        x=alt.X("tool:N", axis=alt.Axis(labelAngle=0)),
        y=alt.Y("RAM:Q").title("RAM (GB)"),
        color=alt.Color("tool:N").legend(None),
    )
    .properties(width=600, height=300)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
bp

In [50]:
# Write the RAM boxplot to ram-usage.svg (relative path, so it lands in the
# current working directory).
# NOTE(review): ppi is presumably a raster-export setting with no effect on
# SVG output; confirm.
bp.save('ram-usage.svg', ppi=400)

In [51]:
# Boxplot of the cpu fraction per tool; the colour legend is hidden because
# the x axis already names the tools. Reuses the name bp from the RAM plot.
cpu_data = df[["tool", "cpu"]]
bp = (
    alt.Chart(cpu_data)
    .mark_boxplot(size=90)
    .encode(
        x=alt.X("tool:N", axis=alt.Axis(labelAngle=0)),
        y=alt.Y("cpu:Q").title("CPU usage"),
        color=alt.Color("tool:N").legend(None),
    )
    .properties(width=600, height=300)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
bp

In [52]:
# Write the CPU-usage boxplot to cpu-usage.svg (relative path, so it lands in
# the current working directory).
# NOTE(review): ppi is presumably a raster-export setting with no effect on
# SVG output; confirm.
bp.save('cpu-usage.svg', ppi=400)

In [77]:
# Line chart of wall time (duration_min) against genome count, one line per
# tool; the metadata columns are carried along only for the tooltip.
wall_time_columns = ["genomes","duration_min","tool","total_sequence_length","number_of_contigs","bases","reads"]
line = (
    alt.Chart(df[wall_time_columns])
    .mark_line(point=True)
    .encode(
        alt.X("genomes"),
        alt.Y("duration_min").title("Wall time (min)"),
        color="tool",
        tooltip=["duration_min","genomes","total_sequence_length","number_of_contigs","bases","reads"],
    )
    .properties(width=600, height=300)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
line

In [55]:
# Write the wall-time chart to wall-time.svg (relative path, so it lands in
# the current working directory).
# NOTE(review): ppi is presumably a raster-export setting with no effect on
# SVG output; confirm.
line.save('wall-time.svg', ppi=400)

In [78]:
# Line chart of CPU time (calc_time) against genome count, one line per tool;
# the metadata columns are carried along only for the tooltip. Reuses the
# name line from the wall-time chart.
cpu_time_columns = ["genomes","calc_time","tool","total_sequence_length","number_of_contigs","bases","reads"]
line = (
    alt.Chart(df[cpu_time_columns])
    .mark_line(point=True)
    .encode(
        alt.X("genomes"),
        alt.Y("calc_time").title("CPU time (min)"),
        color="tool",
        tooltip=["calc_time","genomes","total_sequence_length","number_of_contigs","bases","reads"],
    )
    .properties(width=600, height=300)
    .configure_axis(labelFontSize=12, titleFontSize=12)
)
line

In [79]:
# Scratch calculation: 2394167000 expressed in units of 10**9.
# NOTE(review): the origin of this number is not shown in the notebook;
# presumably a byte or base count read off a chart tooltip. Confirm or remove.
2394167000 / 10**9

2.394167

In [64]:
# Write the chart currently bound to line to cpu-time.svg (relative path, so
# it lands in the current working directory). The wall-time and CPU-time
# charts share the name line, so this cell must run after the CPU-time chart
# cell.
# NOTE(review): ppi is presumably a raster-export setting with no effect on
# SVG output; confirm.
line.save('cpu-time.svg', ppi=400)