In [None]:
import altair
import pandas
import pathlib

# Root directory of the benchmark outputs; the cells below read the
# `memory/`, `time/` and `data/` sub-directories beneath it. The path is
# relative, so it is resolved against the kernel's current working directory.
workdir = pathlib.Path("workdir")

# Tree building run time and memory usage

In [None]:
# Benchmarked inputs: each name maps to one CSV per measurement directory
# and one `<dataset>.bed` file under `data/`.
datasets = ["clinvar", "gnomad", "hg001", "hg002", "hg003", "hg004", "hg006", "hg007"]

dfs = []
for dataset in datasets:
    memory_path = workdir / "memory" / "build"  / f"{dataset}.csv"
    time_path = workdir / "time" / "build" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    memory = pandas.read_csv(memory_path)
    # Only the median timing is kept; it is renamed so the plot can refer to `time`.
    time = pandas.read_csv(time_path, usecols=["command", "median"]).rename(columns={"median": "time"})

    # No explicit key is given, so pandas joins on the columns both frames share.
    # NOTE(review): presumably that is only `command` - confirm against the CSV schema.
    df = memory.merge(time)
    # Remove the literal `_build` marker from the command names
    # (regex=False keeps this a plain-text replacement on every pandas version).
    df["command"] = df["command"].str.replace("_build", "", regex=False)

    # Count every line of the BED file (header/comment lines, if any, included).
    # The context manager closes the handle instead of leaking one per dataset.
    with open(input_path) as bed_file:
        df["lines"] = sum(1 for _ in bed_file)

    dfs.append(df)

df = pandas.concat(dfs)
df

In [None]:
def make_selection():
    """Create the click selection shared by the charts and its derived encodings.

    Returns:
        A ``(selection, color, opacity)`` tuple: a point selection on the
        ``command`` field, a colour encoding that uses the per-``command``
        colour (legend hidden) for selected marks and light gray otherwise,
        and an opacity encoding of 1.0 for selected marks and 0.25 otherwise.
    """
    by_command = altair.selection_point(fields=['command'])

    selected_color = altair.Color('command:N').legend(None)
    muted_color = altair.value('lightgray')
    highlight_color = altair.condition(by_command, selected_color, muted_color)

    highlight_opacity = altair.condition(
        by_command,
        altair.value(1.0),
        altair.value(0.25),
    )

    return (by_command, highlight_color, highlight_opacity)

def make_plot(data: pandas.DataFrame, y, y_title, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable legend chart.

    Args:
        data: Frame providing the ``lines`` and ``command`` columns and the
            column referenced by ``y``.
        y: Field (encoding shorthand) plotted on the y axis.
        y_title: Title shown on the y axis.
        selection: Selection parameter; it is attached to the legend chart
            and used as a filter on the line chart.
        color: Colour encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries the selection. The annotation is a string so that it is never
        evaluated at definition time.
    """
    line = altair.Chart(data).mark_line().encode(
        x=altair.X("lines", title="# bed records"),
        y=altair.Y(y, title=y_title),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        # Restrict the line chart to the rows matching the selection.
        selection
    )

    # A column of points, one row per command, acting as the legend.
    legend = altair.Chart(data).mark_point().encode(
        y=altair.Y('command:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return line, legend

# One shared selection drives both panels and the legend.
selection, color, opacity = make_selection()

plot_memory, legend = make_plot(df, "memory", "memory (Kb)", selection, color, opacity)
# The legend is built from the same data, colour and selection in both calls,
# so rebinding `legend` here is harmless.
plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Memory panel, time panel and clickable legend laid out side by side.
plot = (plot_memory | plot_time | legend).properties(
    title="Tree building",
)

plot

# Query time compared to input size

In [None]:
# Benchmarked inputs: each name maps to `time/query/<dataset>.csv`
# and `data/<dataset>.bed`.
datasets = ["clinvar", "gnomad", "hg001", "hg002", "hg003", "hg004", "hg006", "hg007"]

dfs = []
for dataset in datasets:
    time_path = workdir / "time" / "query" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    df = pandas.read_csv(time_path)
    # NOTE(review): presumably each recorded time covers 100 queries and this
    # converts it to a per-query time - confirm against the benchmark harness.
    df["time"] = df["time"] / 100
    # Collapse the repeated measurements to one median row per command.
    df = df.groupby("command").agg("median")

    # Count every line of the BED file (header/comment lines, if any, included).
    # The context manager closes the handle instead of leaking one per dataset.
    with open(input_path) as bed_file:
        df["lines"] = sum(1 for _ in bed_file)

    dfs.append(df)

df = pandas.concat(dfs)
# The groupby made `command` the index; turn it back into a regular column.
df = df.reset_index()
df

In [None]:
# NOTE(review): `make_selection` is defined in several cells of this notebook
# with equivalent bodies; consider keeping a single definition.
def make_selection():
    """Create the click selection shared by the charts and its derived encodings.

    Returns:
        A ``(selection, color, opacity)`` tuple: a point selection on the
        ``command`` field, a colour encoding that uses the per-``command``
        colour (legend hidden) for selected marks and light gray otherwise,
        and an opacity encoding of 1.0 for selected marks and 0.25 otherwise.
    """
    by_command = altair.selection_point(fields=['command'])

    selected_color = altair.Color('command:N').legend(None)
    muted_color = altair.value('lightgray')
    highlight_color = altair.condition(by_command, selected_color, muted_color)

    highlight_opacity = altair.condition(
        by_command,
        altair.value(1.0),
        altair.value(0.25),
    )

    return (by_command, highlight_color, highlight_opacity)

def make_plot(data: pandas.DataFrame, y, y_title, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable legend chart.

    Args:
        data: Frame providing the ``lines`` and ``command`` columns and the
            column referenced by ``y``.
        y: Field (encoding shorthand) plotted on the y axis.
        y_title: Title shown on the y axis.
        selection: Selection parameter; it is attached to the legend chart
            and used as a filter on the line chart.
        color: Colour encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries the selection. The annotation is a string so that it is never
        evaluated at definition time.
    """
    line = altair.Chart(data).mark_line().encode(
        x=altair.X("lines", title="# bed records"),
        y=altair.Y(y, title=y_title),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        # Restrict the line chart to the rows matching the selection.
        selection
    )

    # A column of points, one row per command, acting as the legend.
    legend = altair.Chart(data).mark_point().encode(
        y=altair.Y('command:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return line, legend

# A fresh selection for this figure, shared by the time panel and its legend.
(selection, color, opacity) = make_selection()

plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Time panel and clickable legend side by side.
plot = plot_time | legend
plot = plot.properties(
    # Title wording corrected ("Quering ... in function of").
    title="Querying time as a function of tree size",
)

plot

# Query time compared to query size

In [None]:
# Benchmarked inputs: each name maps to `time/annotate/<dataset>.csv`
# and `data/<dataset>.bed`.
datasets = ["clinvar", "gnomad", "hg001", "hg002", "hg003", "hg004", "hg006", "hg007"]

dfs = []
for dataset in datasets:
    time_path = workdir / "time" / "annotate" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    # Only the median timing is kept; it is renamed so the plot can refer to `time`.
    df = pandas.read_csv(time_path, usecols=["command", "median"]).rename(columns={"median": "time"})

    # Count every line of the BED file (header/comment lines, if any, included).
    # The context manager closes the handle instead of leaking one per dataset.
    with open(input_path) as bed_file:
        df["lines"] = sum(1 for _ in bed_file)

    # Remove the literal `_annotate` marker from the command names
    # (regex=False keeps this a plain-text replacement on every pandas version).
    df["command"] = df["command"].str.replace("_annotate", "", regex=False)

    dfs.append(df)

# Renumber the rows while concatenating. A bare `reset_index()` here would
# instead copy each file's row numbers into a stray `index` column.
df = pandas.concat(dfs, ignore_index=True)
df

In [None]:
# NOTE(review): `make_selection` is defined in several cells of this notebook
# with equivalent bodies; consider keeping a single definition.
def make_selection():
    """Create the click selection shared by the charts and its derived encodings.

    Returns:
        A ``(selection, color, opacity)`` tuple: a point selection on the
        ``command`` field, a colour encoding that uses the per-``command``
        colour (legend hidden) for selected marks and light gray otherwise,
        and an opacity encoding of 1.0 for selected marks and 0.25 otherwise.
    """
    by_command = altair.selection_point(fields=['command'])

    selected_color = altair.Color('command:N').legend(None)
    muted_color = altair.value('lightgray')
    highlight_color = altair.condition(by_command, selected_color, muted_color)

    highlight_opacity = altair.condition(
        by_command,
        altair.value(1.0),
        altair.value(0.25),
    )

    return (by_command, highlight_color, highlight_opacity)

def make_plot(data: pandas.DataFrame, y, y_title, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable legend chart.

    Args:
        data: Frame providing the ``lines`` and ``command`` columns and the
            column referenced by ``y``.
        y: Field (encoding shorthand) plotted on the y axis.
        y_title: Title shown on the y axis.
        selection: Selection parameter; it is attached to the legend chart
            and used as a filter on the line chart.
        color: Colour encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries the selection. The annotation is a string so that it is never
        evaluated at definition time.
    """
    line = altair.Chart(data).mark_line().encode(
        x=altair.X("lines", title="# bed records"),
        y=altair.Y(y, title=y_title),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        # Restrict the line chart to the rows matching the selection.
        selection
    )

    # A column of points, one row per command, acting as the legend.
    legend = altair.Chart(data).mark_point().encode(
        y=altair.Y('command:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return line, legend

# A fresh selection for this figure, shared by the time panel and its legend.
(selection, color, opacity) = make_selection()

plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Time panel and clickable legend side by side.
plot = plot_time | legend
plot = plot.properties(
    # Title wording corrected ("Quering ... in function of").
    title="Querying time as a function of input size",
)

plot