In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
pd.set_option('display.max_rows', 1000)
%matplotlib inline

In [356]:
# Algorithm and experiment definitions.
#
# `algorithms` maps an algorithm key to its display metadata (`show_name`,
# `color`) and to the `alg_id` fragment of its result file name.
# `experiments` maps an experiment key to the folder (`base_folder`) and the
# `exp_id` fragment of its result file name.
algorithms = {
    'AverageKDTree': dict(
        name='average_kd_tree',
        color='#E95C50',
        show_name='AvgKD',
        alg_id='average_kd_tree-0.0-1024',
    ),
    'MedianKDTree': dict(
        name='median_kd_tree',
        color='#62818D',
        show_name='MedKD',
        alg_id='median_kd_tree-0.0-1024',
    ),
    'CrackingKDTree': dict(
        name='cracking_kd_tree',
        color='#8ACF7D',
        show_name='AdaptKD',
        alg_id='cracking_kd_tree-0.0-1024',
    ),
    'Quasii': dict(
        name='quasii',
        color='#F0F2E5',
        show_name='Quasii',
        alg_id='quasii-0.0-1024',
    ),
    'FullScan': dict(
        name='full_scan_cl',
        color='#D86140',
        show_name='FullScan',
        partition='0',
        alg_id='full_scan_cl-0.0-0',
    ),
    'ProgressiveIndexCostModel': dict(
        name='progressive_index_cm',
        color='#8F8F8F',
        show_name='ProgKD',
        delta='0.1',
        alg_id='progressive_index_cm-0.1-1024',
    ),
    'ProgressiveIndex': dict(
        name='progressive_index',
        color='#89D1D1',
        show_name='AdapProgKD',
        delta='0.1',
        alg_id='progressive_index-0.1-1024',
    ),
}

experiments = {
    'Power': dict(
        name='power',
        n_rows='10000000',
        n_queries='3000',
        n_cols='',
        sel='0.0',
        base_folder='real-data-workload/results',
        exp_id='power-10000000-3000-0.0',
    ),
}

# Register the nine genomics workloads (Genomics0 .. Genomics8); they differ
# from each other only in the query-set number embedded in name and exp_id.
for i in range(9):
    experiments[f'Genomics{i}'] = dict(
        name=f'genomics{i}',
        n_rows='10000000',
        n_queries='3000',
        n_cols='',
        sel='0.0',
        base_folder='real-data-workload/results',
        exp_id=f'genomics_query_{i}-10000000-3000-0.0',
    )

In [373]:
# Input/Output
def read(alg, exp):
    """Load the result CSV of one algorithm on one experiment, averaged over repetitions.

    The file is located at ``<base_folder>/<alg_id>-<exp_id>.csv`` using the
    entries of the module-level ``experiments`` and ``algorithms`` dicts.

    The CSV is split into ``max(repetition) + 1`` consecutive slices of equal
    length, which are added row by row and divided by the number of
    repetitions. This assumes the rows are stored repetition after repetition
    with the same number of queries in each -- TODO confirm against the
    benchmark writer.

    Returns a DataFrame with the averaged columns plus:
      * ``index_search_time`` (filled with 0.0 when the CSV has no such column),
      * ``query_time``: initialization + index search + scan + adaptation time,
      * ``query_time_cumsum``: running sum of ``query_time``.
    """
    exp_cfg = experiments[exp]
    csv_path = f"{exp_cfg['base_folder']}/{algorithms[alg]['alg_id']}-{exp_cfg['exp_id']}.csv"
    raw = pd.read_csv(csv_path)

    repetitions = raw['repetition'].max() + 1
    step = int(len(raw.index) / repetitions)

    def repetition_slice(rep):
        # Rows of one repetition, re-indexed from 0 so slices align when added.
        return raw[step * rep: step * (rep + 1)].copy().reset_index()

    df_final = repetition_slice(0)
    for rep in range(1, repetitions):
        df_final += repetition_slice(rep)
    df_final = df_final / repetitions

    # Algorithms without an index phase do not report a search time.
    if 'index_search_time' not in df_final:
        df_final['index_search_time'] = 0.0

    query_time = df_final['initialization_time']
    for part in ('index_search_time', 'scan_time', 'adaptation_time'):
        query_time = query_time + df_final[part]
    df_final['query_time'] = query_time
    df_final['query_time_cumsum'] = df_final['query_time'].cumsum()
    return df_final

                     
def read_multiple(algs, exp):
    """Read several algorithms of one experiment.

    Returns three parallel lists, in the order of ``algs``:
    ``(dfs, colors, names)`` -- the averaged result frames produced by
    ``read``, the plot colors and the display names of the algorithms.
    """
    loaded = []
    for alg in algs:
        frame = read(alg, exp)
        meta = algorithms[alg]
        loaded.append((frame, meta['show_name'], meta['color']))

    dfs = [frame for frame, _, _ in loaded]
    names = [show_name for _, show_name, _ in loaded]
    colors = [color for _, _, color in loaded]
    return dfs, colors, names

                     
def save_figure(fig, fig_name):
    """Write ``fig`` to ``figures/<fig_name>`` via its ``write_image`` method.

    The target directory is created first if it does not exist, so the
    notebook also runs on a fresh checkout without a ``figures`` folder.
    """
    import os  # local import: keeps this cell independent of the import cell

    path = f"figures/{fig_name}"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    fig.write_image(path)
                     
def save_table(table, table_name):
    """Write the string ``table`` to ``tables/<table_name>``.

    The target directory is created first if it does not exist, and the file
    is written as UTF-8 so the result does not depend on the platform's
    default encoding. An existing file of the same name is overwritten.
    """
    import os  # local import: keeps this cell independent of the import cell

    path = f"tables/{table_name}"
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding='utf-8') as f:
        f.write(table)

In [358]:
# Helper methods
def get_first_query(df):
    """Return the ``query_time`` of the first row (the first query) of ``df``."""
    query_times = df['query_time']
    return query_times.iloc[0]

def get_payoff(df, baseline):
    """Return the position of the first query at which ``df`` has paid off.

    Pay-off is the first row where the cumulative query time of ``df`` is
    strictly below that of ``baseline``. If that never happens, ``len(df)``
    is returned.
    """
    cumulative_gap = df['query_time_cumsum'] - baseline['query_time_cumsum']
    first_ahead = (position for position, gap in enumerate(cumulative_gap) if gap < 0)
    return next(first_ahead, len(df))

def get_convergence(df, threshold=0.003):
    """Return the query position at which adaptation is considered finished.

    The ``adaptation_time`` column is smoothed with a 3-query moving average.
    The result is the position of the first query of the first window
    (queries ``i``, ``i + 1``, ``i + 2``) whose average is at or below
    ``threshold``. If no window qualifies, including when ``df`` has fewer
    than three rows, ``len(df)`` is returned.

    Parameters
    ----------
    df : DataFrame with an ``adaptation_time`` column.
    threshold : float, default 0.003
        Largest average adaptation time that still counts as converged, in
        the same unit as ``adaptation_time``.

    Rows are addressed by position, so the result does not depend on the
    index labels of ``df``.
    """
    adapt = np.asarray(df['adaptation_time'], dtype=float)
    if len(adapt) < 3:
        return len(df)

    # Same summation order as (a[i-1] + a[i] + a[i+1]) / 3.0, done for every
    # window at once.
    window_mean = (adapt[:-2] + adapt[1:-1] + adapt[2:]) / 3.0
    converged = np.flatnonzero(window_mean <= threshold)
    if converged.size == 0:
        return len(df)
    return int(converged[0])

def get_robustness(df):
    """Return the variance of ``query_time`` over the first 30 queries of ``df``."""
    query_times = df['query_time']
    return np.var(query_times[:30])

def get_total_time(df):
    """Return the sum of ``query_time`` over all queries in ``df``."""
    total = df['query_time'].sum()
    return total

In [360]:
# Figures

def create_figure():
    """Return an empty plotly figure carrying the layout shared by the plots.

    The layout is 1500 px wide with a transparent plot background, font size
    22, a 2 px black line on both axes, no gap between bars, and the legend
    hidden.
    """
    def axis_line():
        # Identical axis styling for x and y; a fresh dict per axis.
        return dict(showline=True, linewidth=2, linecolor='black')

    layout = go.Layout(
        width=1500,
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(size=22),
        yaxis=axis_line(),
        xaxis=axis_line(),
        bargap=0,
        showlegend=False,
        # Alternative legend placement, currently disabled:
        #   legend_orientation="h",
        #   legend=dict(x=0.08, y=-.05)
    )
    return go.Figure(layout=layout)

def first_query(exp, algs):
    """Bar chart of the first-query time of every algorithm in ``algs``.

    One bar per algorithm, colored with the algorithm's color and labeled
    with the value rounded to two decimals. Returns the plotly figure.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()
    first_query_time = np.array([get_first_query(df) for df in dfs])

    bars = [
        go.Bar(
            name=name,
            x=[''],
            y=[seconds],
            text=['{:.2f}'.format(seconds)],
            textposition='auto',
            marker_color=color,
        )
        for name, color, seconds in zip(names, colors, first_query_time)
    ]

    fig.add_traces(data=bars)
    fig.update_layout(yaxis_title='Time (seconds)')
    return fig

def payoff(exp, algs):
    """Bar chart of the pay-off query number of every algorithm in ``algs``.

    The pay-off of each algorithm is computed with ``get_payoff`` against the
    'FullScan' results of the same experiment. Returns the plotly figure.
    """
    baseline = read('FullScan', exp)
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()

    payoffs = [get_payoff(df, baseline) for df in dfs]

    bars = [
        go.Bar(
            name=name,
            x=[''],
            y=[query_number],
            text=['{:.2f}'.format(query_number)],
            textposition='auto',
            marker_color=color,
        )
        for name, color, query_number in zip(names, colors, payoffs)
    ]

    fig.add_traces(data=bars)
    fig.update_layout(yaxis_title='Query Number')
    return fig

def convergence(exp, algs):
    """Bar chart of the convergence query number of every algorithm in ``algs``.

    Each bar shows the value returned by ``get_convergence`` for that
    algorithm, labeled with the raw number. Returns the plotly figure.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()

    convergences = [get_convergence(df) for df in dfs]

    bars = [
        go.Bar(
            name=name,
            x=[''],
            y=[query_number],
            text=[query_number],
            textposition='auto',
            marker_color=color,
        )
        for name, color, query_number in zip(names, colors, convergences)
    ]

    fig.add_traces(data=bars)
    fig.update_layout(yaxis_title='Query number')
    return fig

def robustness(exp, algs):
    """Bar chart of the robustness metric of every algorithm in ``algs``.

    Each bar shows the value returned by ``get_robustness``; the bars carry
    no text labels. Returns the plotly figure.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()

    variances = [get_robustness(df) for df in dfs]

    bars = [
        go.Bar(
            name=name,
            x=[''],
            y=[variance],
            # text=['{:.8f}'.format(variance)],  (value labels disabled)
            textposition='auto',
            marker_color=color,
        )
        for name, color, variance in zip(names, colors, variances)
    ]

    fig.add_traces(data=bars)
    fig.update_layout(yaxis_title='Time (seconds)')
    return fig

def power_response_time(exp, algs):
    """Bar chart of the total response time of every algorithm in ``algs``.

    Each bar shows the value returned by ``get_total_time``, labeled with the
    value rounded to two decimals. Returns the plotly figure.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()
    resp_time = np.array([get_total_time(df) for df in dfs])

    bars = [
        go.Bar(
            name=name,
            x=[''],
            y=[seconds],
            text=['{:.2f}'.format(seconds)],
            textposition='auto',
            marker_color=color,
        )
        for name, color, seconds in zip(names, colors, resp_time)
    ]

    fig.add_traces(data=bars)
    fig.update_layout(yaxis_title='Time (seconds)')
    return fig

def per_query(exp, algs):
    """Line chart of the per-query time of the first 100 queries.

    ``query_time`` is multiplied by 1000 and plotted on a logarithmic y axis
    labeled in milliseconds, one line per algorithm in ``algs``. Tick marks
    are placed at the powers of ten (starting at 1) that lie below the
    largest plotted value. Returns the plotly figure with the legend enabled.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()

    lines = []
    biggest = 0

    for name, color, df in zip(names, colors, dfs):
        per_query_times = np.array(df['query_time'][:100]) * 1000
        biggest = max(biggest, np.max(per_query_times))
        lines.append(
            go.Scatter(
                name=name,
                x=list(range(len(per_query_times))),
                y=per_query_times,
                marker_color=color,
                mode='lines+markers',
                line=dict(width=4),
            )
        )

    # Powers of ten strictly below the largest value: 1, 10, 100, ...
    ticks = []
    tick = 1
    while tick < biggest:
        ticks.append(tick)
        tick *= 10

    fig.add_traces(data=lines)
    fig.update_layout(showlegend=True, yaxis_title='Time (milliseconds)')
    fig.update_yaxes(type="log", tickvals=ticks)
    return fig

def cummulative(exp, algs):
    """Line chart of the cumulative query time of every algorithm in ``algs``.

    Plots the ``query_time_cumsum`` column against the query position, one
    line per algorithm in the algorithm's color. Returns the plotly figure
    with the legend enabled.
    """
    dfs, colors, names = read_multiple(algs, exp)
    fig = create_figure()

    lines = []
    for name, color, df in zip(names, colors, dfs):
        cumulative_times = np.array(df['query_time_cumsum'])
        lines.append(
            go.Scatter(
                name=name,
                x=list(range(len(cumulative_times))),
                y=cumulative_times,
                marker_color=color,
                mode='lines',
                line=dict(width=4),
            )
        )

    fig.add_traces(data=lines)
    fig.update_layout(showlegend=True, yaxis_title='Time (Seconds)')
    return fig

def break_down(exp, algs):
    """Stacked bar chart splitting each algorithm's total time into phases.

    For every algorithm in ``algs`` the columns ``initialization_time``,
    ``adaptation_time``, ``index_search_time`` and ``scan_time`` are summed
    over all queries and stacked in that order, one bar per algorithm. The
    figure uses plotly's default layout and colors, not the shared layout or
    the per-algorithm colors. Returns the plotly figure.
    """
    dfs, _, names = read_multiple(algs, exp)

    def phase_totals(column):
        # Total time of one phase, one entry per algorithm.
        return np.array([df[column].sum() for df in dfs])

    initializations = phase_totals('initialization_time')
    adaptation = phase_totals('adaptation_time')
    search = phase_totals('index_search_time')
    scan = phase_totals('scan_time')

    fig = go.Figure(data=[
        go.Bar(name='Initialization', x=names, y=initializations),
        go.Bar(name='Adaptation', x=names, y=adaptation),
        go.Bar(name='Index Search', x=names, y=search),
        go.Bar(name='Scan', x=names, y=scan),
    ])

    # Stack the phase bars on top of each other instead of grouping them.
    fig.update_layout(barmode='stack', yaxis_title='Time (seconds)')
    return fig

def plot(exp):
    """Create and save the figures of one experiment.

    Currently three figures are produced and written through ``save_figure``:
    ``<exp>-per_query.pdf``, ``<exp>-cummulative.pdf`` and
    ``<exp>-break_down.pdf``. The per-query figure also includes 'FullScan';
    the other two cover the six indexing algorithms only.

    The bar charts (``first_query``, ``payoff``, ``convergence``,
    ``robustness``, ``power_response_time``) are not generated at the moment.
    """
    index_algs = [
        'AverageKDTree',
        'MedianKDTree',
        'CrackingKDTree',
        'Quasii',
        'ProgressiveIndex',
        'ProgressiveIndexCostModel',
    ]

    # Build every figure first, then write them out.
    pq = per_query(exp, index_algs + ['FullScan'])
    cumm = cummulative(exp, list(index_algs))
    bd = break_down(exp, list(index_algs))

    save_figure(pq, f'{exp}-per_query.pdf')
    save_figure(cumm, f'{exp}-cummulative.pdf')
    save_figure(bd, f'{exp}-break_down.pdf')

In [361]:
# Generate and save the figures. Only the 'Power' experiment is plotted;
# uncomment the next line (and drop the list below) to plot every experiment.
# exps = experiments.keys()

exps = ['Power']
for exp in exps:
    plot(exp)

In [374]:
# Latex Tables

def metrics(exps, algs):
    """Build a LaTeX table of summary metrics per experiment and algorithm.

    Rows are indexed by (experiment, algorithm display name); columns are:
      FQ   -- first query time, two decimals
      PO   -- pay-off query number relative to 'FullScan'
      CV   -- convergence query number
      RB   -- robustness (variance), scientific notation
      Time -- total response time, two decimals

    Every occurrence of an experiment name in the generated LaTeX is wrapped
    in a 90-degree ``\\rotatebox``. Returns the LaTeX source as a string.
    """
    # Column name -> how to compute its cell from a result frame and the
    # FullScan baseline; the dict order is the column order of the table.
    column_cells = {
        'FQ': lambda df, baseline: '%.2f' % get_first_query(df),
        'PO': lambda df, baseline: get_payoff(df, baseline),
        'CV': lambda df, baseline: get_convergence(df),
        'RB': lambda df, baseline: '%.2E' % get_robustness(df),
        'Time': lambda df, baseline: '%.2f' % get_total_time(df),
    }
    data = {column: [] for column in column_cells}

    index_exp = []
    index_name = []

    for exp in exps:
        dfs, _, names = read_multiple(algs, exp)
        baseline = read('FullScan', exp)

        for df, name in zip(dfs, names):
            index_exp.append(exp)
            index_name.append(name)
            for column, make_cell in column_cells.items():
                data[column].append(make_cell(df, baseline))

    table = pd.DataFrame(data, index=[index_exp, index_name])
    latex = table.to_latex(multicolumn=True, multirow=True)

    # Rotate the experiment labels of the multirow cells.
    for exp in exps:
        rotated = "\\rotatebox[origin=c]{90}{%s}" % exp
        latex = latex.replace(exp, rotated)
    return latex

In [376]:
# Build the LaTeX metrics table for every defined experiment; 'FullScan' is
# listed as a row in addition to being the pay-off baseline inside metrics().
exps = experiments.keys()
m = metrics(exps, [
        'AverageKDTree',
        'MedianKDTree',
        'CrackingKDTree',
        'Quasii',
        'ProgressiveIndex',
        'ProgressiveIndexCostModel',
        'FullScan',
])

print(m)
# Writing the table to disk is currently disabled:
# save_table(m, 'power')

\begin{tabular}{lllrrll}
\toprule
          &          &    FQ &    PO &   CV &        RB &    Time \\
\midrule
\multirow{7}{*}{\rotatebox[origin=c]{90}{Power}} & AvgKD &  0.83 &    16 &    0 &  2.19E-02 &   21.97 \\
          & MedKD &  1.52 &    28 &    0 &  7.36E-02 &   24.49 \\
          & AdaptKD &  0.16 &     7 &   28 &  8.86E-04 &   20.05 \\
          & Quasii &  0.19 &    11 &   22 &  1.76E-03 &   29.28 \\
          & AdapProgKD &  0.06 &     6 &  111 &  3.44E-04 &   23.47 \\
          & ProgKD &  0.06 &     9 &   32 &  1.07E-04 &   22.99 \\
          & FullScan &  0.06 &  3000 &    0 &  5.43E-04 &  174.59 \\
\cline{1-7}
\multirow{7}{*}{\rotatebox[origin=c]{90}{Genomics0}} & AvgKD &  2.78 &    73 &    0 &  2.10E-01 &   24.15 \\
          & MedKD &  2.73 &    48 &    0 &  2.04E-01 &   22.38 \\
          & AdaptKD &  0.97 &     4 &    1 &  2.57E-02 &    8.23 \\
          & Quasii &  5.58 &   100 &  100 &  7.59E-01 &   87.67 \\
          & AdapProgKD &  0.32 &    60 &  100 &  2.02