# Run benchmark

In [None]:
# Shell escape: run the `benchmark` make target, with `-C` pointing make at the experiment directory.
!make -C experiment benchmark

## Get data

In [None]:
import pandas
import pathlib

# Root directory of the benchmark, with the two sub-directories used by the
# cells below: result CSV files and the .bed input datasets.
workdir = pathlib.Path("experiment")
result = workdir.joinpath("result")
dataset = workdir.joinpath("dataset")

## Build

In [None]:
def build_data(path: pathlib.Path, input_path: pathlib.Path) -> pandas.DataFrame:
    """Load a benchmark result CSV and tag every row with its input size.

    Args:
        path: CSV file holding the benchmark results.
        input_path: Input file of the benchmark; only its line count is used.

    Returns:
        The CSV content with an extra ``lines`` column holding the number of
        lines found in ``input_path`` (the same value on every row).
    """
    data = pandas.read_csv(path)

    # Count inside a context manager so the file handle is closed as soon as
    # counting ends instead of lingering until garbage collection.
    with open(input_path) as records:
        data["lines"] = sum(1 for _ in records)

    return data

# One build-benchmark frame per dataset, stacked in the order listed here.
build = pandas.concat([
    build_data(result / f"build_{name}.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg38",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "hg007",
    )
])

## Query

In [None]:
def query_data(path: pathlib.Path, input_path: pathlib.Path) -> pandas.DataFrame:
    """Summarise the query timings of one dataset per command.

    Args:
        path: CSV file with at least the ``command`` and ``time`` columns.
        input_path: Input file of the benchmark; only its line count is used.

    Returns:
        One row per command with the columns ``command``, ``lines`` (number
        of lines in ``input_path``), ``mean`` and ``median`` (both computed
        on the rescaled ``time`` column).
    """
    data = pandas.read_csv(path)

    # Count inside a context manager so the file handle is closed as soon as
    # counting ends instead of lingering until garbage collection.
    with open(input_path) as records:
        data["lines"] = sum(1 for _ in records)

    # NOTE(review): presumably each measurement covers 100 queries, which
    # would make this a per-query time -- confirm against the benchmark.
    data["time"] = data["time"] / 100

    # Named aggregation yields the flat lines/mean/median columns directly,
    # with no MultiIndex to rename afterwards.
    summary = data.groupby("command").agg(
        lines=("lines", "first"),
        mean=("time", "mean"),
        median=("time", "median"),
    )

    return summary.reset_index()


# One per-command query summary per dataset, stacked in the order listed here.
# NOTE(review): hg007 is not listed here although the build benchmark loads
# it -- confirm that this is intentional.
query = pandas.concat([
    query_data(result / f"query_{name}.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg38",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
    )
])

## Annotation variant

In [None]:
# Variant-annotation results are loaded with build_data, so each frame is the
# raw CSV plus the `lines` column.
# NOTE(review): hg38 is not listed here although the build benchmark loads
# it -- confirm that this is intentional.
annotation_variant = pandas.concat([
    build_data(result / f"annotation_variant_{name}.csv", dataset / f"{name}.bed")
    for name in (
        "clinvar",
        "gnomad",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "hg007",
    )
])

## Plot

In [None]:
import altair

# Ask altair's renderers to embed charts with the 'dark' theme.
altair.renderers.set_embed_options(theme='dark')

In [None]:
def make_selection():
    """Create the point selection on ``command`` and the encodings it drives.

    Returns:
        A ``(selection, color, opacity)`` tuple: the point selection, a color
        condition (per-command color without legend when selected,
        ``lightgray`` otherwise) and an opacity condition (1.0 when selected,
        0.25 otherwise).
    """
    picked = altair.selection_point(fields=['command'])

    per_command = altair.Color('command:N').legend(None)
    tint = altair.condition(picked, per_command, altair.value('lightgray'))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, tint, fade

def make_plot(data: pandas.DataFrame, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build the timing line chart and the command chart used as its legend.

    Args:
        data: Frame providing the ``lines``, ``median`` and ``command`` columns.
        selection: Selection parameter; it filters the line chart and is
            registered on the legend chart.
        color: Color encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        A ``(chart, legend)`` pair -- two charts, not a single one, which is
        why the return annotation is a tuple.
    """
    # Median wall time against input size, restricted to the selected commands.
    line = altair.Chart(data).mark_line().encode(
        x=altair.X("lines", title="# bed records"),
        y=altair.Y("median", title="wall time(ns)"),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        selection
    )

    # One point per command; the selection parameter is attached here.
    legend = altair.Chart(data).mark_point().encode(
        y=altair.Y('command:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return line, legend

# Both charts share one selection, color and opacity encoding.
selection, color, opacity = make_selection()

plot_build, legend = make_plot(build, selection, color, opacity)
plot_build = plot_build.properties(title="Tree building")

plot_query, legend = make_plot(query, selection, color, opacity)
plot_query = plot_query.properties(title="Tree query")

# Variant-annotation chart, currently disabled:
# plot_av, legend = make_plot(annotation_variant, selection, color, opacity)
# plot_av = plot_av.properties(title="Variant annotation")

# `&` binds tighter than `|`: the two charts are combined first, then the
# legend (built from the query data) is added.  Disabled tail: & plot_av | legend
(plot_build & plot_query) | legend

## Thread effect

In [None]:
def thread_effect_data(path: pathlib.Path, input_path: str) -> pandas.DataFrame:
    """Load a thread-effect result CSV and split its command names.

    The ``command`` values are matched against three ``_``-separated parts;
    the middle part becomes the domain and the last part the thread count.

    Args:
        path: CSV file with at least the ``command`` and ``median`` columns.
        input_path: Dataset label used as prefix of ``dataset_domain``.
            Callers pass a plain string such as ``"clinvar"``; any object is
            accepted and formatted with ``str()``.

    Returns:
        A frame with the columns ``dataset_domain`` (``<label>_<domain>``),
        ``thread`` (as integer) and ``median``.
    """
    data = pandas.read_csv(path)

    command = data["command"]
    # expand=False returns a Series rather than a one-column DataFrame, so a
    # plain column is assigned below.
    domain = command.str.extract(r".+_(.+)_.+", expand=False)
    thread = command.str.extract(r".+_.+_(.+)", expand=False)

    # Formatting the label first avoids a TypeError when a pathlib.Path is
    # passed, since Path does not support `+` with a string.
    data["dataset_domain"] = f"{input_path}_" + domain
    data["thread"] = thread.astype(int)

    return data[["dataset_domain", "thread", "median"]]

In [None]:
# One thread-effect frame per dataset; the dataset name doubles as the label
# used as prefix of `dataset_domain`.
thread_effect = pandas.concat([
    thread_effect_data(result / f"thread_effect_{name}.csv", name)
    for name in (
        "clinvar",
        "gnomad",
        "hg001",
        "hg002",
        "hg003",
        "hg004",
        "hg006",
        "hg007",
    )
])

In [None]:
def make_selection():
    """Create the point selection on ``dataset_domain`` and its encodings.

    Returns:
        A ``(selection, color, opacity)`` tuple: the point selection, a color
        condition (per-``dataset_domain`` color without legend when selected,
        ``lightgray`` otherwise) and an opacity condition (1.0 when selected,
        0.25 otherwise).
    """
    picked = altair.selection_point(fields=['dataset_domain'])

    per_domain = altair.Color('dataset_domain:N').legend(None)
    tint = altair.condition(picked, per_domain, altair.value('lightgray'))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, tint, fade

def make_plot(data: pandas.DataFrame, selection, color, opacity) -> "tuple[altair.Chart, altair.Chart]":
    """Build the thread-effect line chart and the chart used as its legend.

    Args:
        data: Frame providing the ``thread``, ``median`` and
            ``dataset_domain`` columns.
        selection: Selection parameter; it filters the line chart and is
            registered on the legend chart.
        color: Color encoding applied to both charts.
        opacity: Opacity encoding applied to the line chart.

    Returns:
        A ``(chart, legend)`` pair -- two charts, not a single one, which is
        why the return annotation is a tuple.
    """
    # Median wall time per thread count (encoded as a nominal axis),
    # restricted to the selected dataset/domain pairs.
    line = altair.Chart(data).mark_line().encode(
        x=altair.X("thread:N", title="number of thread"),
        y=altair.Y("median", title="wall time(ns)"),
        color=color,
        opacity=opacity,
    ).properties(
        width=800,
        height=800,
    ).transform_filter(
        selection
    )

    # One point per dataset/domain pair; the selection parameter is attached here.
    legend = altair.Chart(data).mark_point().encode(
        y=altair.Y('dataset_domain:N').axis(orient='right'),
        color=color
    ).add_params(
        selection
    )

    return line, legend
    
# Selection, color and opacity come from the make_selection defined just above.
selection, color, opacity = make_selection()

plot_thread_effect, legend = make_plot(thread_effect, selection, color, opacity)
plot_thread_effect = plot_thread_effect.properties(title="Thread effect")

# Cell output: the line chart combined with its legend chart via `|`.
plot_thread_effect | legend