# Run benchmark

In [None]:
# Shell escape: run make inside the experiment/ directory with the `benchmark` target.
# NOTE(review): presumably this produces the CSV files under experiment/result that
# the cells below read — confirm against the Makefile.
!make -C experiment benchmark

# Create plot

In [None]:
import pandas
import pathlib

# Root of the experiment tree (relative to the working directory), plus the
# sub-directories the cells below read benchmark CSVs and BED inputs from.
workdir = pathlib.Path("experiment")
result = workdir.joinpath("result")
dataset = workdir.joinpath("dataset")

def build_data(path: pathlib.Path, input_path: pathlib.Path) -> pandas.DataFrame:
    """Load a benchmark CSV and tag every row with the size of its input.

    Args:
        path: CSV file to load with ``pandas.read_csv``.
        input_path: Text file whose number of lines is stored in the
            ``lines`` column (callers in this notebook pass the BED file the
            benchmark was run on).

    Returns:
        The loaded frame with an additional ``lines`` column holding the
        same line count on every row.
    """
    data = pandas.read_csv(path)

    # Count lines inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_path) as handle:
        data["lines"] = sum(1 for _ in handle)

    return data

def query_data(path: pathlib.Path, input_path: pathlib.Path) -> pandas.DataFrame:
    """Load a query benchmark CSV and summarise the timings per command.

    Args:
        path: CSV file with at least ``command`` and ``time`` columns.
        input_path: Text file whose number of lines is stored in the
            ``lines`` column (callers in this notebook pass the BED file the
            benchmark was run on).

    Returns:
        One row per ``command`` with the columns ``command``, ``lines``,
        ``mean`` and ``median``, where ``mean``/``median`` are computed over
        the scaled ``time`` values.
    """
    data = pandas.read_csv(path)

    # Count lines inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(input_path) as handle:
        data["lines"] = sum(1 for _ in handle)

    # NOTE(review): the divisor 100 is presumably the number of queries in one
    # measured run (giving a per-query time) — confirm against the benchmark.
    data["time"] = data["time"] / 100

    # Named aggregation yields the flat lines/mean/median columns directly,
    # without renaming a MultiIndex afterwards.
    summary = data.groupby("command").agg(
        lines=("lines", "first"),
        mean=("time", "mean"),
        median=("time", "median"),
    )

    return summary.reset_index()

# Stack the build results of every dataset into one frame; each entry pairs
# result/<name>_build.csv with the dataset/<name>.bed it was measured on.
build = pandas.concat([
    build_data(result / f"{name}_build.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg38",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "hg007",
        "combine_shuffle",
    )
])

# Stack the per-command query summaries of every dataset into one frame; each
# entry pairs result/<name>_query.csv with the matching dataset/<name>.bed.
# NOTE(review): hg007 is loaded for the build results in this cell but is not
# listed here — confirm whether hg007_query.csv is intentionally left out.
query = pandas.concat([
    query_data(result / f"{name}_query.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg38",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "combine_shuffle",
    )
])

In [None]:
import altair

# Pass theme='dark' as an embed option to Altair's renderers, so the charts
# displayed below are embedded with that theme.
altair.renderers.set_embed_options(theme='dark')

def make_selection():
    """Create a point selection and the encodings that react to it.

    Returns:
        A ``(selection, color, opacity)`` tuple:

        * a point selection on the ``command`` field,
        * a color condition: the nominal ``command`` color (legend disabled)
          for selected items, ``lightgray`` otherwise,
        * an opacity condition: 1.0 for selected items, 0.25 otherwise.
    """
    picked = altair.selection_point(fields=['command'])

    highlighted = altair.Color('command:N').legend(None)
    muted = altair.value('lightgray')
    tint = altair.condition(picked, highlighted, muted)

    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, tint, fade

def make_plot(data: pandas.DataFrame, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``median`` against ``lines`` and a clickable legend.

    Args:
        data: Frame providing the ``lines``, ``median`` and ``command``
            columns used by the encodings.
        selection: Selection parameter used to filter the line chart; it is
            attached to the legend chart.
        color: Color encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        A ``(lines_chart, legend_chart)`` tuple. The previous annotation
        declared a single ``altair.Chart``, which did not match the two
        charts this function has always returned.
    """
    lines_chart = altair.Chart(data).mark_line().encode(
        x=altair.X("lines", title="# bed records"),
        y=altair.Y("median", title="wall time(ns)"),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        # Keep only the rows matching the current selection.
        selection
    )

    # Stand-in for the disabled color legend: one point per command. The
    # selection parameter is registered on this chart.
    legend_chart = altair.Chart(data).mark_point().encode(
        y=altair.Y('command:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return lines_chart, legend_chart

# One selection shared by both charts so that the legend drives them together.
selection, color, opacity = make_selection()

plot_build, legend = make_plot(build, selection, color, opacity)
plot_build = plot_build.properties(title="Tree building")

# The legend returned here replaces the one from the build chart above.
query_build, legend = make_plot(query, selection, color, opacity)
query_build = query_build.properties(title="Tree query")

# `&` binds tighter than `|`: combine the two charts first, then add the legend.
(plot_build & query_build) | legend

In [None]:
# Stack the annotation results of every dataset into one frame; each entry
# pairs result/annotation_variant_<name>.csv with dataset/<name>.bed.
annotation_variant = pandas.concat([
    build_data(result / f"annotation_variant_{name}.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "hg007",
    )
])

# Fresh selection for this figure.
selection, color, opacity = make_selection()

plot_build, legend = make_plot(annotation_variant, selection, color, opacity)
plot_build | legend