In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
pd.set_option('display.max_rows', 1000)
%matplotlib inline

In [17]:
# Maps each algorithm key used throughout this notebook to the short name
# passed to read_df as `alg_name`, i.e. the first component of the results
# CSV filename.
files = {
    'AverageKDTree': 'average_kd_tree',
    'MedianKDTree': 'median_kd_tree',
    'CrackingKDTree': 'cracking_kd_tree',
    'CrackingKDTreePerDimension': 'cracking_kd_tree_pd',
    'Quasii': 'quasii',
    'FullScanBitVector':'full_scan_bv',
    'FullScanCandidateList': 'full_scan_cl',
    'ProgressiveIndex': 'progressive_index',
    'ProgressiveIndexCostModel': 'progressive_index_cm'
}
def read_df(alg_name, delta, partition, exp_name, n_rows, n_queries, sel):
    """Load one results CSV and average it element-wise over its repetitions.

    The file read is
    ``results/{alg_name}-{delta}-{partition}-{exp_name}-{n_rows}-{n_queries}-{sel}.csv``.
    It must contain a zero-based ``repetition`` column, and the rows of each
    repetition are taken to be stored contiguously, repetition after
    repetition, with the same number of rows each.

    Returns:
        A DataFrame with one row per query position. Every CSV column holds
        the mean over all repetitions. Three columns are added or ensured:
        ``index_search_time`` (0.0 when the CSV lacks it), ``query_time``
        (initialization + index search + scan + adaptation time) and
        ``query_time_cumsum`` (running total of ``query_time``).
    """
    df = pd.read_csv(f"results/{alg_name}-{delta}-{partition}-{exp_name}-{n_rows}-{n_queries}-{sel}.csv")
    repetitions = int(df['repetition'].max()) + 1
    step = len(df.index) // repetitions

    # drop=True is essential: it re-labels every repetition 0..step-1 so the
    # chunks align for the addition below WITHOUT turning the old row labels
    # into an 'index' column that would be summed and averaged as if it were
    # a measurement.
    df_final = df.iloc[:step].reset_index(drop=True)
    for rep in range(1, repetitions):
        df_final = df_final + df.iloc[step * rep: step * (rep + 1)].reset_index(drop=True)

    df_final = df_final / repetitions

    # Some algorithms do not report an index search time; treat it as zero.
    if 'index_search_time' not in df_final:
        df_final['index_search_time'] = 0.0
    df_final['query_time'] = (
        df_final['initialization_time']
        + df_final['index_search_time']
        + df_final['scan_time']
        + df_final['adaptation_time']
    )
    df_final['query_time_cumsum'] = df_final['query_time'].cumsum()
    return df_final

# Cumulative Response Time
Cumulative response time using 2 attributes, 10M rows, and 0.1 selectivity

In [20]:
# Parameters identifying which results files to load; read_df interpolates
# them verbatim into the CSV filename.
experiment = 'genomics_query_8'
n_rows = '10000000'
n_queries = '3000'
sel='0.0'
# Figure that the plot_time calls below add one trace per algorithm to.
fig = go.Figure()

def plot_time(fig, df, name):
    """Add the ``query_time_cumsum`` column of ``df`` to ``fig`` as a line trace labelled ``name``."""
    cumulative_trace = go.Scatter(
        y=df['query_time_cumsum'],
        mode='lines',
        name=name,
    )
    fig.add_trace(cumulative_trace)

# One entry per curve, in plotting order:
# (key into `files`, delta, partition, legend label).
_cumulative_curves = [
    ('AverageKDTree', '0.0', '1024', 'Average KDTree'),
    ('MedianKDTree', '0.0', '1024', 'MedianKDTree'),
    ('CrackingKDTree', '0.0', '1024', 'Adaptive KDTree'),
    ('CrackingKDTreePerDimension', '0.0', '1024', 'Adaptive KDTree Per Predicate'),
    ('Quasii', '0.0', '1024', 'Quasii'),
    ('ProgressiveIndex', '0.1', '1024', 'ProgressiveIndex (Delta=0.1)'),
    ('ProgressiveIndexCostModel', '0.1', '1024', 'ProgressiveIndexCostModel (Delta=0.1)'),
    ('FullScanCandidateList', '0.0', '0', 'Full Scan Candidate List'),
    ('FullScanBitVector', '0.0', '0', 'Full Scan Bitvector'),
]

for _alg_key, _alg_delta, _alg_partition, _curve_label in _cumulative_curves:
    _curve_df = read_df(files[_alg_key], _alg_delta, _alg_partition, experiment, n_rows, n_queries, sel)
    plot_time(fig, _curve_df, _curve_label)

fig.update_layout(
    title=f'Cumulative response time ({experiment})',
    xaxis_title='Query',
    yaxis_title='Time (seconds)',
)
fig.show()

# GMRQ benchmark response time

In [23]:
def gmrq_response_times(algorithms, names):
    """Show a bar chart of total response time, one bar series per algorithm.

    Args:
        algorithms: mapping from legend label to a sequence of result frames,
            each providing a ``query_time`` column.
        names: x-axis labels, one per position in each sequence of frames.
    """
    bars = [
        go.Bar(
            name=alg,
            x=names,
            # Total time of each experiment = sum of its per-query times.
            y=np.array([frame['query_time'].sum() for frame in frames]),
        )
        for alg, frames in algorithms.items()
    ]

    fig = go.Figure(data=bars)
    fig.update_layout(
        title='GMRQ Response time',
        yaxis_title='Time (seconds)',
        width=2000,
        legend=dict(x=.3, y=1.2, orientation="h"),
    )
    fig.show()

# Benchmark parameters; read_df interpolates them into the results filenames.
n_rows = 10000000
sel = 0.0
n_queries = 3000

# The four lists below are parallel: position i describes one algorithm as
# (key into `files`, legend label, delta, partition).
algorithms = ['AverageKDTree', 'MedianKDTree', 'CrackingKDTree', 'Quasii', 'ProgressiveIndex', 'ProgressiveIndexCostModel', 'FullScanCandidateList']
alg_names = ['Average KD-Tree', 'Median KD-Tree', 'Adaptive KD-Tree', 'Quasii', ' Progressive Index', 'Progressive Index w/ Cost Model', 'Full Scan']
deltas = ['0.0', '0.0', '0.0', '0.0', '0.1', '0.1', '0.0']
partitions = ['1024', '1024', '1024', '1024', '1024', '1024', '0']
# Nine experiment names and the nine matching x-axis labels (the last
# experiment is labelled 'All Patterns').
experiments = [f'genomics_query_{i}' for i in range(9)]
names = [f'Query Pattern {i}' for i in range(8)] + ['All Patterns']

# Legend label -> list of averaged result frames, one per experiment.
# NOTE(review): the loop variables (`experiment`, `alg`, ...) remain defined
# at notebook level after this loop; later cells read a global `experiment`.
response_times = {}
for alg, name, delta, partition in zip(algorithms, alg_names, deltas, partitions):
    response_times_alg = []
    for experiment in experiments:
        response_times_alg.append(
            read_df(files[alg], delta, partition, experiment, n_rows, n_queries, sel)
        )
    response_times[name] = response_times_alg
    
#     v = {}
#     for experiment, name in zip(experiments, names):
#         v[name] = [
#             [read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
#             [read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
#             [read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
#             [read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
#             [read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
#             [read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
#             [read_df(files['ProgressiveIndexAdaptive'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexAdaptive (Delta=0.1)'],
#             [read_df(files['FullScanCandidateList'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan Candidate List'],
#         ]

gmrq_response_times(response_times, names)

# Power benchmark response time

In [22]:
def power_response_time(values):
    """Show a bar chart of the total response time of each algorithm.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides a
            ``query_time`` column and ``name`` is the bar's x-axis label.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    total_query_time = np.array([x['query_time'].sum() for x in dfs])
    fig = go.Figure(data=[
        go.Bar(name='', x=names, y=total_query_time)
    ])
    fig.update_layout(
        title='Power response time ',
        yaxis_title='Time (seconds)',
        width=600
    )
    fig.show()

# Parameters of the 'power' benchmark run; read_df interpolates them into
# the results filenames.
experiment = 'power'
n_rows = '10000000'
n_queries = '3000'
sel='0.0'
    
# Each entry pairs an averaged result frame with its bar label. The full
# scan results are stored with partition '0'; the others with '1024'.
power_response_time([
[read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
[read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Median KDTree'],
[read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
[read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
[read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'Progressive Index'],
[read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'Progressive Index w/ Cost Model'],
[read_df(files['FullScanCandidateList'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan'],
])

# First Query Response Time

In [12]:
# experiment = 'power'
# n_rows = '10000000.0'
# n_queries = '3000'
# sel='0.0'

def first_query(values):
    """Show a bar chart of the first query's response time for each algorithm.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides a
            ``query_time`` column and ``name`` is the bar's x-axis label.

    The chart title embeds the notebook-level ``experiment`` variable.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    # .iloc[0] selects the first row by position, independent of row labels.
    first_query_time = np.array([x['query_time'].iloc[0] for x in dfs])
    fig = go.Figure(data=[
        go.Bar(name='First Query Time', x=names, y=first_query_time)
    ])
    fig.update_layout(title=f'First Query Response time ({experiment})', yaxis_title='Time (seconds)')
    fig.show()

# The cost-model variant is looked up under 'ProgressiveIndexCostModel', the
# key that the `files` mapping defines; `files` has no
# 'ProgressiveIndexAdaptive' entry, so that lookup raised KeyError.
first_query([
[read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
[read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
[read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
[read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
[read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
[read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
[read_df(files['ProgressiveIndex'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.2)'],
[read_df(files['ProgressiveIndex'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.5)'],
[read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.1)'],
[read_df(files['ProgressiveIndexCostModel'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.2)'],
[read_df(files['ProgressiveIndexCostModel'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.5)'],
[read_df(files['FullScanCandidateList'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan Candidate List'],
])

In [13]:
# experiment = 'power'
# n_rows = '10000000.0'
# n_queries = '3000'
# sel='0.0'

def break_down(values):
    """Show a stacked bar chart splitting each algorithm's total time by phase.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides the
            columns ``initialization_time``, ``adaptation_time``,
            ``index_search_time`` and ``scan_time``, and ``name`` is the
            bar's x-axis label.

    The chart title embeds the notebook-level ``experiment`` variable.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    initializations = np.array([x['initialization_time'].sum() for x in dfs])
    adaptation = np.array([x['adaptation_time'].sum() for x in dfs])
    search = np.array([x['index_search_time'].sum() for x in dfs])
    scan = np.array([x['scan_time'].sum() for x in dfs])

    fig = go.Figure(data=[
        go.Bar(name='Initialization', x=names, y=initializations),
        go.Bar(name='Adaptation', x=names, y=adaptation),
        go.Bar(name='Index Search', x=names, y=search),
        go.Bar(name='Scan', x=names, y=scan),
    ])

    # Stack the four phases on top of each other instead of grouping them.
    fig.update_layout(barmode='stack')
    fig.update_layout(title=f'Time Breakdown ({experiment})',
                   yaxis_title='Time (seconds)')
    fig.show()

# The cost-model variant is looked up under 'ProgressiveIndexCostModel', the
# key that the `files` mapping defines; `files` has no
# 'ProgressiveIndexAdaptive' entry, so that lookup raised KeyError.
break_down([
[read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
[read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
[read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
[read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
[read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
[read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
[read_df(files['ProgressiveIndex'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.2)'],
[read_df(files['ProgressiveIndex'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.5)'],
[read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.1)'],
[read_df(files['ProgressiveIndexCostModel'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.2)'],
[read_df(files['ProgressiveIndexCostModel'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.5)'],
# [read_df(files['FullScan'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan'],
])

In [21]:
# experiment = 'power'
# n_rows = '10000000.0'
# n_queries = '3000'
# sel='0.0'

def convergence(values):
    """Show, per algorithm, the first query whose adaptation time is below 0.001.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides an
            ``adaptation_time`` column and ``name`` is the bar's x-axis label.

    An algorithm that never drops below the threshold is plotted at the
    notebook-level ``n_queries``. The chart title embeds the notebook-level
    ``experiment`` variable.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    convergences = []
    for df in dfs:
        first_converged = next(
            (i for i, x in enumerate(df['adaptation_time']) if x < 0.001),
            None,
        )
        if first_converged is None:
            # n_queries is a string in some cells of this notebook; coerce it
            # so the bar heights are all numeric.
            convergences.append(int(n_queries))
        else:
            convergences.append(first_converged)

    fig = go.Figure(data=[
        go.Bar(name='Query Number', x=names, y=convergences),
    ])
    fig.update_layout(title=f'Convergence ({experiment})',
                   yaxis_title='Query number')
    fig.show()

# The cost-model variant is looked up under 'ProgressiveIndexCostModel', the
# key that the `files` mapping defines; `files` has no
# 'ProgressiveIndexAdaptive' entry, so that lookup raised KeyError.
convergence([
# [read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
# [read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
[read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
[read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
[read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
[read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
[read_df(files['ProgressiveIndex'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.2)'],
[read_df(files['ProgressiveIndex'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.5)'],
[read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.1)'],
[read_df(files['ProgressiveIndexCostModel'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.2)'],
[read_df(files['ProgressiveIndexCostModel'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.5)'],
# [read_df(files['FullScan'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan'],
])

In [22]:
# experiment = 'power'
# n_rows = '10000000.0'
# n_queries = '3000'
# sel='0.0'

def robustness(values):
    """Show, per algorithm, the variance of the query time over the first 30 queries.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides a
            ``query_time`` column and ``name`` is the bar's x-axis label.

    The chart title embeds the notebook-level ``experiment`` variable.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    variances = [np.var(df['query_time'][:30]) for df in dfs]
    fig = go.Figure(data=[
        go.Bar(name='Query Number', x=names, y=variances),
    ])
    fig.update_layout(title=f'Robustness ({experiment})',
                   yaxis_title='Variance of first 30 queries')
    fig.show()

# The cost-model variant is looked up under 'ProgressiveIndexCostModel', the
# key that the `files` mapping defines; `files` has no
# 'ProgressiveIndexAdaptive' entry, so that lookup raised KeyError.
robustness([
# [read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
# [read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
[read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
[read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
[read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
[read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
[read_df(files['ProgressiveIndex'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.2)'],
[read_df(files['ProgressiveIndex'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.5)'],
[read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.1)'],
[read_df(files['ProgressiveIndexCostModel'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.2)'],
[read_df(files['ProgressiveIndexCostModel'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.5)'],
# [read_df(files['FullScan'], '0.0', '0', experiment, n_rows, n_queries, sel), 'Full Scan'],
])

In [23]:
# experiment = 'power'
# n_rows = '10000000.0'
# n_queries = '3000'
# sel='0.0'

def payoff(values, baseline):
    """Show, per algorithm, the first query at which it beats the baseline cumulatively.

    Args:
        values: sequence of ``[df, name]`` pairs, where ``df`` provides a
            ``query_time_cumsum`` column and ``name`` is the bar's x-axis label.
        baseline: result frame whose ``query_time_cumsum`` column is the
            reference to beat.

    An algorithm whose cumulative time never drops below the baseline's is
    plotted at the notebook-level ``n_queries``. The chart title embeds the
    notebook-level ``experiment`` variable.
    """
    # Unpack with plain Python instead of np.array(values)[:, i]: a list of
    # [DataFrame, str] pairs is a ragged nested sequence, which NumPy >= 1.24
    # refuses to turn into an array (ValueError).
    dfs = [pair[0] for pair in values]
    names = [pair[1] for pair in values]

    payoffs = []
    for df in dfs:
        lead = df['query_time_cumsum'] - baseline['query_time_cumsum']
        first_payoff = next((i for i, x in enumerate(lead) if x < 0), None)
        if first_payoff is None:
            # n_queries is a string in some cells of this notebook; coerce it
            # so the bar heights are all numeric.
            payoffs.append(int(n_queries))
        else:
            payoffs.append(first_payoff)

    fig = go.Figure(data=[
        go.Bar(name='Query Number', x=names, y=payoffs),
    ])
    fig.update_layout(title=f'Payoff ({experiment})',
                   yaxis_title='Query Number')
    fig.show()

# The cost-model variant is looked up under 'ProgressiveIndexCostModel', the
# key that the `files` mapping defines; `files` has no
# 'ProgressiveIndexAdaptive' entry, so that lookup raised KeyError.
# The second argument (full scan with candidate list) is the baseline.
payoff([
    # [read_df(files['AverageKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Average KDTree'],
    # [read_df(files['MedianKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'MedianKDTree'],
    [read_df(files['CrackingKDTree'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree'],
    [read_df(files['CrackingKDTreePerDimension'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Adaptive KDTree Per Predicate'],
    [read_df(files['Quasii'], '0.0', '1024', experiment, n_rows, n_queries, sel), 'Quasii'],
    [read_df(files['ProgressiveIndex'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.1)'],
    [read_df(files['ProgressiveIndex'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.2)'],
    [read_df(files['ProgressiveIndex'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndex (Delta=0.5)'],
    [read_df(files['ProgressiveIndexCostModel'], '0.1', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.1)'],
    [read_df(files['ProgressiveIndexCostModel'], '0.2', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.2)'],
    [read_df(files['ProgressiveIndexCostModel'], '0.5', '1024', experiment, n_rows, n_queries, sel), 'ProgressiveIndexCostModel (Delta=0.5)'],
],
    read_df(files['FullScanCandidateList'], '0.0', '0', experiment, n_rows, n_queries, sel)
)

In [13]:
# Compute the selectivity of every query, in percent, for the averaged
# CrackingKDTree results of the 'genomics_query_0' experiment.
df = read_df(files['CrackingKDTree'], '0.0', '1024', 'genomics_query_0', 10000000, 3000, 0.0)
# NOTE(review): assumes the results CSV provides the columns
# scan_overhead_after_adapt and tuples_scanned — confirm.
df['sel'] = df['scan_overhead_after_adapt']/df['tuples_scanned'] * 100

      adaptation_time  index_search_time  initialization_time  max_height  \
0            0.279070           0.000033             0.500216          21   
1            0.000867           0.000017             0.000000          23   
2            0.000392           0.000008             0.000000          24   
3            0.000004           0.000007             0.000000          24   
4            0.011601           0.000021             0.000000          25   
...               ...                ...                  ...         ...   
2555         0.000003           0.000009             0.000000          54   
2556         0.000003           0.000009             0.000000          54   
2557         0.000004           0.000009             0.000000          54   
2558         0.000003           0.000009             0.000000          54   
2559         0.000003           0.000009             0.000000          54   

      memory_footprint  min_height  number_of_nodes  partitions_scanned  \


In [18]:
def latex_table_metrics(exp, n_rows, n_queries, sel, algs):
    """Build a table of summary metrics, one column per algorithm.

    Args:
        exp, n_rows, n_queries, sel: experiment parameters forwarded to
            read_df to locate the results files.
        algs: sequence of ``(files key, column abbreviation, delta)`` triples.

    Returns:
        A DataFrame with a ``Metric`` column followed by one column per
        abbreviation. Its rows are, in order:
        - First Query Time: ``query_time`` of the first query.
        - Pay-Off: first query index at which the cumulative time drops below
          the full-scan (candidate list) baseline, or ``n_queries`` if never.
        - Convergence: first query index whose ``adaptation_time`` is below
          0.001, or ``'-'`` if never.
        - Robustness: variance of ``query_time`` over the first 30 queries.
    """
    algs = np.array(algs)
    algorithms = algs[:, 0]
    abbreviations = algs[:, 1]
    deltas = algs[:, 2]

    # The baseline does not depend on the algorithm, so load and average it
    # once here rather than once per loop iteration.
    baseline = read_df(files['FullScanCandidateList'], '0.0', '0', exp, n_rows, n_queries, sel)

    data = {'Metric': ['First Query Time', 'Pay-Off', 'Convergence', 'Robustness']}
    for alg, abbr, delta in zip(algorithms, abbreviations, deltas):
        if abbr not in data:
            data[abbr] = []

        df = read_df(files[alg], delta, '1024', exp, n_rows, n_queries, sel)

        # First Query Time
        data[abbr].append(df['query_time'].iloc[0])

        # Pay-Off
        lead = df['query_time_cumsum'] - baseline['query_time_cumsum']
        first_payoff = next((i for i, x in enumerate(lead) if x < 0), None)
        data[abbr].append(n_queries if first_payoff is None else first_payoff)

        # Convergence
        first_converged = next(
            (i for i, x in enumerate(df['adaptation_time']) if x < 0.001),
            None,
        )
        data[abbr].append('-' if first_converged is None else first_converged)

        # Robustness
        data[abbr].append(np.var(df['query_time'][:30]))

    return pd.DataFrame(data)

In [20]:
# Parameters of the run to summarise; latex_table_metrics forwards them to
# read_df to locate the results files.
exp = 'power'
n_rows = 10000000
n_queries = 3000
# NOTE(review): n_col is not used anywhere in the visible notebook — confirm
# whether it is still needed.
n_col = 3
sel = 0.0

# One triple per table column: (key into `files`, column abbreviation, delta).
algs = [
    ('AverageKDTree', 'AvgKD', '0.0'),
    ('CrackingKDTree', 'AdaptKD', '0.0'),
    ('Quasii', 'Q', '0.0'),
    ('ProgressiveIndex', 'PI', '0.1'),
    ('ProgressiveIndexCostModel', 'PI_CM', '0.1')
]
latex_table_metrics(exp, n_rows, n_queries, sel, algs)

Unnamed: 0,Metric,AvgKD,AdaptKD,Q,PI,PI_CM
0,First Query Time,0.830934,0.161131,0.196037,0.062774,0.063589
1,Pay-Off,16.0,7.0,12.0,6.0,9.0
2,Convergence,0.0,30.0,2.0,111.0,32.0
3,Robustness,0.021921,0.000887,0.001955,0.000361,0.000101
