In [None]:
import altair
import pandas
import pathlib

# Root directory under which the cells below look for the benchmark CSV files
# and the input BED files.
workdir = pathlib.Path("workdir")

# Dataset identifiers; each one is used as a file stem (``<name>.csv`` or
# ``<name>.bed``) when building paths below ``workdir``.
datasets = [
    "clinvar",
    "gnomad",
    "hg001",
    "hg002",
    "hg003",
    "hg004",
    "hg006",
    "hg007",
]

# Tree building run time and memory usage

In [None]:
# Collect, for every dataset that has both a memory and a time report for the
# build step, one frame of per-command measurements tagged with the number of
# lines of the corresponding input BED file.
dfs = []
for dataset in datasets:
    memory_path = workdir / "memory" / "build" / f"{dataset}.csv"
    time_path = workdir / "time" / "build" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    # Skip datasets for which either report is missing.
    if not memory_path.is_file():
        continue
    if not time_path.is_file():
        continue

    memory = pandas.read_csv(memory_path)
    timing = pandas.read_csv(time_path, usecols=["command", "median"]).rename(columns={"median": "time"})

    # No explicit key is given, so the merge uses the columns both frames share.
    build_stats = memory.merge(timing)
    build_stats["command"] = build_stats["command"].str.replace("_build", "")

    # Count the lines of the input file; the context manager guarantees the
    # file handle is closed instead of being left to the garbage collector.
    with open(input_path) as bed:
        build_stats["lines"] = sum(1 for _ in bed)

    dfs.append(build_stats)

df = pandas.concat(dfs)

In [None]:
def make_selection():
    """Create the point selection on ``command`` and the encodings driven by it.

    Returns
    -------
    tuple
        ``(selection, color, opacity)`` where ``color`` uses a per-command
        nominal colour (built-in legend disabled) for selected marks and
        light gray otherwise, and ``opacity`` is 1.0 for selected marks and
        0.25 otherwise.
    """
    picked = altair.selection_point(fields=["command"])

    command_color = altair.Color("command:N").legend(None)
    highlight = altair.condition(picked, command_color, altair.value("lightgray"))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, highlight, fade

def make_plot(
    data: pandas.DataFrame, y, y_title, selection, color, opacity
) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable command legend.

    Parameters
    ----------
    data
        Frame providing the ``lines`` and ``command`` columns and the column
        named by ``y``.
    y
        Name of the column plotted on the y axis.
    y_title
        Title displayed on the y axis.
    selection
        Selection parameter; it filters the line chart and is attached to the
        legend chart.
    color, opacity
        Encodings applied to the marks (``opacity`` only on the line chart).

    Returns
    -------
    tuple
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries ``selection``.
    """
    line = (
        altair.Chart(data)
        .mark_line()
        .encode(
            x=altair.X("lines", title="# bed records"),
            y=altair.Y(y, title=y_title),
            color=color,
            opacity=opacity,
        )
        .properties(
            width=800,
            height=800,
        )
        # Restrict the plotted rows to those matching the selection.
        .transform_filter(selection)
    )

    # Stand-in legend: one point per command; the selection parameter is
    # registered on this chart.
    legend = (
        altair.Chart(data)
        .mark_point()
        .encode(
            y=altair.Y("command:N").axis(orient="right"),
            color=color,
        )
        .add_params(selection)
    )

    return line, legend

# One selection is shared by both panels and the legend.
selection, color, opacity = make_selection()

# Each call rebuilds the legend from the same frame; only the second one is kept.
plot_memory, legend = make_plot(df, "memory", "memory (Kb)", selection, color, opacity)
plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Memory panel, time panel and legend side by side under a common title.
plot = (plot_memory | plot_time | legend).properties(title="Tree building")

plot

# Query time compared to input size

In [None]:

# For every dataset with a query timing report, compute the per-command median
# and tag it with the number of lines of the corresponding input BED file.
dfs = []
for dataset in datasets:
    time_path = workdir / "time" / "query" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    if not time_path.is_file():
        continue

    query_times = pandas.read_csv(time_path)
    # NOTE(review): the divisor 100 is presumably the number of queries per
    # measured run -- confirm against the benchmark driver.
    query_times["time"] = query_times["time"] / 100
    query_times = query_times.groupby("command").agg("median")

    # Count the lines of the input file; the context manager guarantees the
    # file handle is closed instead of being left to the garbage collector.
    with open(input_path) as bed:
        query_times["lines"] = sum(1 for _ in bed)

    dfs.append(query_times)

# reset_index turns the ``command`` group key back into a regular column.
df = pandas.concat(dfs).reset_index()

In [None]:
def make_selection():
    """Create the point selection on ``command`` and the encodings driven by it.

    Returns
    -------
    tuple
        ``(selection, color, opacity)`` where ``color`` uses a per-command
        nominal colour (built-in legend disabled) for selected marks and
        light gray otherwise, and ``opacity`` is 1.0 for selected marks and
        0.25 otherwise.
    """
    picked = altair.selection_point(fields=["command"])

    command_color = altair.Color("command:N").legend(None)
    highlight = altair.condition(picked, command_color, altair.value("lightgray"))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, highlight, fade

def make_plot(
    data: pandas.DataFrame, y, y_title, selection, color, opacity
) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable command legend.

    Parameters
    ----------
    data
        Frame providing the ``lines`` and ``command`` columns and the column
        named by ``y``.
    y
        Name of the column plotted on the y axis.
    y_title
        Title displayed on the y axis.
    selection
        Selection parameter; it filters the line chart and is attached to the
        legend chart.
    color, opacity
        Encodings applied to the marks (``opacity`` only on the line chart).

    Returns
    -------
    tuple
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries ``selection``.
    """
    line = (
        altair.Chart(data)
        .mark_line()
        .encode(
            x=altair.X("lines", title="# bed records"),
            y=altair.Y(y, title=y_title),
            color=color,
            opacity=opacity,
        )
        .properties(
            width=800,
            height=800,
        )
        # Restrict the plotted rows to those matching the selection.
        .transform_filter(selection)
    )

    # Stand-in legend: one point per command; the selection parameter is
    # registered on this chart.
    legend = (
        altair.Chart(data)
        .mark_point()
        .encode(
            y=altair.Y("command:N").axis(orient="right"),
            color=color,
        )
        .add_params(selection)
    )

    return line, legend

# Selection shared by the time panel and its legend.
selection, color, opacity = make_selection()

plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Time panel and legend side by side; the title spelling is corrected
# ("Quering ... in function of" -> "Querying ... as a function of").
plot = (plot_time | legend).properties(
    title="Querying time as a function of tree size",
)

plot

# Query time compared to query size

In [None]:
# For every dataset with an annotate timing report, keep the per-command
# median time and tag it with the number of lines of the input BED file.
dfs = []
for dataset in datasets:
    time_path = workdir / "time" / "annotate" / f"{dataset}.csv"
    input_path = workdir / "data" / f"{dataset}.bed"

    if not time_path.is_file():
        continue

    annotate_times = pandas.read_csv(time_path, usecols=["command", "median"]).rename(columns={"median": "time"})

    # Count the lines of the input file; the context manager guarantees the
    # file handle is closed instead of being left to the garbage collector.
    with open(input_path) as bed:
        annotate_times["lines"] = sum(1 for _ in bed)
    annotate_times["command"] = annotate_times["command"].str.replace("_annotate", "")

    dfs.append(annotate_times)

# reset_index replaces the repeated per-file row labels with a fresh range
# index; the old labels are kept in an ``index`` column.
df = pandas.concat(dfs).reset_index()

In [None]:
def make_selection():
    """Create the point selection on ``command`` and the encodings driven by it.

    Returns
    -------
    tuple
        ``(selection, color, opacity)`` where ``color`` uses a per-command
        nominal colour (built-in legend disabled) for selected marks and
        light gray otherwise, and ``opacity`` is 1.0 for selected marks and
        0.25 otherwise.
    """
    picked = altair.selection_point(fields=["command"])

    command_color = altair.Color("command:N").legend(None)
    highlight = altair.condition(picked, command_color, altair.value("lightgray"))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, highlight, fade

def make_plot(
    data: pandas.DataFrame, y, y_title, selection, color, opacity
) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``lines`` plus a clickable command legend.

    Parameters
    ----------
    data
        Frame providing the ``lines`` and ``command`` columns and the column
        named by ``y``.
    y
        Name of the column plotted on the y axis.
    y_title
        Title displayed on the y axis.
    selection
        Selection parameter; it filters the line chart and is attached to the
        legend chart.
    color, opacity
        Encodings applied to the marks (``opacity`` only on the line chart).

    Returns
    -------
    tuple
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries ``selection``.
    """
    line = (
        altair.Chart(data)
        .mark_line()
        .encode(
            x=altair.X("lines", title="# bed records"),
            y=altair.Y(y, title=y_title),
            color=color,
            opacity=opacity,
        )
        .properties(
            width=800,
            height=800,
        )
        # Restrict the plotted rows to those matching the selection.
        .transform_filter(selection)
    )

    # Stand-in legend: one point per command; the selection parameter is
    # registered on this chart.
    legend = (
        altair.Chart(data)
        .mark_point()
        .encode(
            y=altair.Y("command:N").axis(orient="right"),
            color=color,
        )
        .add_params(selection)
    )

    return line, legend

# Selection shared by the time panel and its legend.
selection, color, opacity = make_selection()

plot_time, legend = make_plot(df, "time", "time (s)", selection, color, opacity)

# Time panel and legend side by side; the title spelling is corrected
# ("Quering ... in function of" -> "Querying ... as a function of").
plot = (plot_time | legend).properties(
    title="Querying time as a function of input size",
)

plot

## Build thread effect

In [None]:
# For every dataset with a thread timing report, keep the per-command median
# time and remember which dataset it was measured on.
dfs = []
for dataset in datasets:
    time_path = workdir / "thread" / f"{dataset}.csv"

    if not time_path.is_file():
        continue

    thread_times = pandas.read_csv(time_path, usecols=["command", "median"]).rename(columns={"median": "time"})
    thread_times["input"] = dataset
    dfs.append(thread_times)

df = pandas.concat(dfs).reset_index()

# The text after the last underscore of the command name is taken as the
# thread count; a suffix equal to "build" is mapped to 0.
df["threads"] = df["command"].str.extract(r"_([^_]+)$")
df["threads"] = df["threads"].replace("build", "0")
# The command name becomes everything before that last underscore.
df["command"] = df["command"].str.extract(r"^(.+)_")
df = df.astype({"threads": "int64"})
# Rows whose stripped command is "clairiere_interpolate_build" are forced to 0 threads.
df.loc[df["command"] == "clairiere_interpolate_build", "threads"] = 0

In [None]:
def make_selection():
    """Create the point selection on ``command`` and the encodings driven by it.

    Returns
    -------
    tuple
        ``(selection, color, opacity)`` where ``color`` uses a per-command
        nominal colour (built-in legend disabled) for selected marks and
        light gray otherwise, and ``opacity`` is 1.0 for selected marks and
        0.25 otherwise.
    """
    picked = altair.selection_point(fields=["command"])

    command_color = altair.Color("command:N").legend(None)
    highlight = altair.condition(picked, command_color, altair.value("lightgray"))
    fade = altair.condition(picked, altair.value(1.0), altair.value(0.25))

    return picked, highlight, fade

def make_plot(
    data: pandas.DataFrame, y, y_title, selection, color, opacity
) -> "tuple[altair.Chart, altair.Chart]":
    """Build a line chart of ``y`` against ``threads`` plus a clickable command legend.

    Parameters
    ----------
    data
        Frame providing the ``threads`` and ``command`` columns and the column
        named by ``y``.
    y
        Name of the column plotted on the y axis.
    y_title
        Title displayed on the y axis.
    selection
        Selection parameter; it filters the line chart and is attached to the
        legend chart.
    color, opacity
        Encodings applied to the marks (``opacity`` only on the line chart).

    Returns
    -------
    tuple
        ``(line, legend)``: the 800x800 line chart and the legend chart that
        carries ``selection``.
    """
    line = (
        altair.Chart(data)
        .mark_line()
        .encode(
            x=altair.X("threads", title="# threads"),
            y=altair.Y(y, title=y_title),
            color=color,
            opacity=opacity,
        )
        .properties(
            width=800,
            height=800,
        )
        # Restrict the plotted rows to those matching the selection.
        .transform_filter(selection)
    )

    # Stand-in legend: one point per command; the selection parameter is
    # registered on this chart.
    legend = (
        altair.Chart(data)
        .mark_point()
        .encode(
            y=altair.Y("command:N").axis(orient="right"),
            color=color,
        )
        .add_params(selection)
    )

    return line, legend

# One selection shared by every per-dataset panel and the legend.
selection, color, opacity = make_selection()

# One panel per input dataset. Grouping on the bare column name (rather than a
# one-element list) makes each group key a plain string, so the chart title is
# the dataset name itself and not a 1-tuple.
plots = []
for input_name, group in df.groupby("input"):
    plot_thread, legend = make_plot(group, "time", "time (s)", selection, color, opacity)
    plots.append(plot_thread.properties(title=input_name))

# Combine the panels, alternating vertical (&) and horizontal (|) concatenation.
plot = plots[0]
for i, p in enumerate(plots[1:]):
    if i % 2:
        plot |= p
    else:
        plot &= p

# The legend shown is the one built for the last group of the loop above.
plot | legend

## Affine effect

In [None]:
# Domain values for which an affine-effect report may exist.
domains = [2, 8, 32, 128, 512, 2048, 8192]

# Read every available ``<dataset>_<domain>.csv`` report (header-less, two
# columns) and tag its rows with the dataset and domain it came from.
all_datas = []
for dataset in datasets:
    for domain in domains:
        path = workdir / "affine_effect" / f"{dataset}_{domain}.csv"
        if not path.is_file():
            continue

        data = pandas.read_csv(path, names=["data", "label"])
        data["dataset"] = dataset
        data["domain"] = domain
        all_datas.append(data)

data = pandas.concat(all_datas)

# Split by label, then drop the combined frame.
df_correction = data[data["label"] == "guess_correction"]
df_level = data[data["label"] == "guess_level"]
del data

# Count how many rows share each (domain, dataset, value) triple; the counted
# ``label`` column is renamed to ``count``.
df_correction = (
    df_correction
    .groupby(by=["domain", "dataset", "data"])
    .count()
    .reset_index()
    .rename(columns={"label": "count"})
)

df_level = (
    df_level
    .groupby(by=["domain", "dataset", "data"])
    .count()
    .reset_index()
    .rename(columns={"label": "count"})
)

In [None]:
# Point selection on ``domain`` with the colour/opacity encodings driven by it:
# selected marks get a per-domain colour at full opacity, the others are light
# gray at 0.25 opacity.
selection = altair.selection_point(fields=['domain'])
color = altair.condition(
    selection,
    altair.Color('domain:N').legend(None),
    altair.value('lightgray')
)
opacity = altair.condition(selection, altair.value(1.0), altair.value(0.25))

# Stand-in legend: one point per domain; the selection parameter is registered here.
legend = altair.Chart(df_level).mark_point().encode(
    y=altair.Y('domain:N').axis(orient='right'),
    color=color
).add_params(
    selection
)

# One row per dataset: correction counts on the left, guess-level counts on the right.
plots = []
for dataset in datasets:
    df_corr = df_correction[df_correction["dataset"] == dataset]
    plot_correction = altair.Chart(df_corr).mark_line().encode(
        x=altair.X("data").bin(maxbins=30),
        y=altair.Y("count"),
        color=color,
        opacity=opacity,
    ).properties(
        title="Correction of guess",
    ).transform_filter(
        selection
    )

    df_lvl = df_level[df_level["dataset"] == dataset]
    plot_level = altair.Chart(df_lvl).mark_line().encode(
        x=altair.X("data").bin(maxbins=30),
        y=altair.Y("count").stack(None),
        color=color,
        opacity=opacity,
    ).properties(
        title="Value of guess"
    ).transform_filter(
        selection
    )

    plots.append((plot_correction | plot_level).properties(title=f"Dataset {dataset}"))

# Stack every dataset row. ``plots`` holds one entry per dataset, so iterate
# over it directly; bounding the loop by the number of domains left the last
# dataset row out of the figure.
final_plot = plots[0]
for dataset_plot in plots[1:]:
    final_plot &= dataset_plot

final_plot | legend