In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
pd.set_option('display.max_rows', 1000)
%matplotlib inline

In [20]:
# Short algorithm tokens, keyed by index implementation name. The token is the
# `name` part of a result file path: results/{experiment}-{name}-{delta}-{partition}.csv
files = dict(
    AverageKDTree='average_kd_tree',
    MedianKDTree='median_kd_tree',
    CrackingKDTree='cracking_kd_tree',
    CrackingKDTreePerDimension='cracking_kd_tree_pd',
    Quasii='quasii',
    FullScan='full_scan',
    FullScanCandidate='full_scan_cl',
    ProgressiveIndex='progressive_index',
)
def read_df(experiment, name, delta='0.0', partition='0'):
    """Load one result CSV and average its measurements over all repetitions.

    The file read is ``results/{experiment}-{name}-{delta}-{partition}.csv``.
    It must contain the columns ``repetition``, ``initialization_time``,
    ``scan_time`` and ``adaptation_time``; ``index_search_time`` is optional
    and treated as 0.0 when absent.

    Rows are combined by position: the file is cut into ``max(repetition) + 1``
    consecutive chunks of equal length and the i-th rows of every chunk are
    averaged. This assumes the rows are stored one repetition after another,
    each with the same number of queries. Rows left over after the last full
    chunk are ignored.

    Args:
        experiment: experiment token of the file name (e.g. ``'2cols'``).
        name: algorithm token of the file name.
        delta: delta token of the file name.
        partition: partition-size token of the file name.

    Returns:
        A DataFrame with one row per query holding the per-column mean over
        the repetitions, plus ``query_time`` (sum of the four time components)
        and ``query_time_cumsum`` (running total of ``query_time``).
    """
    df = pd.read_csv(f"results/{experiment}-{name}-{delta}-{partition}.csv")
    repetitions = int(df['repetition'].max()) + 1
    # Number of rows (queries) that belong to a single repetition.
    step = len(df.index) // repetitions

    # drop=True discards the old row labels. Without it they would become an
    # 'index' column that is summed and averaged together with the measurements.
    df_final = df[:step].reset_index(drop=True)
    for rep in range(1, repetitions):
        df_final = df_final + df[step * rep:step * (rep + 1)].reset_index(drop=True)

    df_final = df_final / repetitions

    if 'index_search_time' not in df_final:
        df_final['index_search_time'] = 0.0
    df_final['query_time'] = (
        df_final['initialization_time']
        + df_final['index_search_time']
        + df_final['scan_time']
        + df_final['adaptation_time']
    )
    df_final['query_time_cumsum'] = df_final['query_time'].cumsum()
    return df_final

# Cumulative Response Time
Cumulative response time using 2 attributes, 10M rows, and 0.01% selectivity.

In [98]:
experiment = '2cols'
fig = go.Figure()


def plot_time(fig, df, name):
    """Add the cumulative query time of one run to ``fig`` as a line.

    Args:
        fig: figure that receives the new trace through ``add_trace``.
        df: frame with a ``query_time_cumsum`` column (``read_df`` adds it).
        name: legend entry of the trace.
    """
    trace = go.Scatter(y=df['query_time_cumsum'], mode='lines', name=name)
    fig.add_trace(trace)


# Partition sizes to draw. Append '1048576' to also draw the larger partition
# size for every index below.
partitions = ('1024',)

# Indexes read with the default delta: (key in `files`, legend prefix).
for algorithm, legend in [
    ('AverageKDTree', 'Average KDTree'),
    ('MedianKDTree', 'Median KDTree'),
    ('CrackingKDTree', 'Cracking KDTree'),
    ('CrackingKDTreePerDimension', 'Cracking KDTree Per Dimension'),
    ('Quasii', 'Quasii'),
]:
    for partition in partitions:
        plot_time(
            fig,
            read_df(experiment, files[algorithm], partition=partition),
            f'{legend} (Partition={partition})',
        )

# The Progressive Index gets one line per delta.
for partition in partitions:
    for delta in ('0.2', '0.3', '0.5'):
        plot_time(
            fig,
            read_df(experiment, files['ProgressiveIndex'], delta=delta, partition=partition),
            f'ProgressiveIndex (Partition={partition}, delta={delta})',
        )

# Full-scan baselines, currently left out of this figure:
# plot_time(fig, read_df('2cols', files['FullScan']), 'Full Scan')
# plot_time(fig, read_df('2cols', files['FullScanCandidate']), 'Full Scan Candidate List')

fig.update_layout(
    title='Cumulative response time (2 columns, 10M rows, 0.01% selectivity)',
    xaxis_title='Query',
    yaxis_title='Time (seconds)',
)
fig.show()

# Response Time
Per-query (non-cumulative) response time using 2 attributes, 10M rows, and 0.01% selectivity.

In [41]:
# NOTE(review): the index runs below are read for `experiment` ('16cols'), but
# the full-scan baselines are read from '2cols' and the title says "2 columns".
# Confirm which experiment this figure is meant to show.
experiment = '16cols'
fig = go.Figure()

def plot_time(fig, df, name):
    """Add the per-query time of one run to ``fig`` as a line.

    Args:
        fig: figure that receives the new trace through ``add_trace``.
        df: frame with a ``query_time`` column (``read_df`` adds it).
        name: legend entry of the trace.
    """
    fig.add_trace(go.Scatter(y=df['query_time'], mode='lines', name=name))

plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1024'), 'Average KDTree (Partition=1024)')
plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1048576'), 'Average KDTree (Partition=1048576)')

# For the Median KDTree the first query's time is zeroed before plotting
# (presumably so the first query does not dominate the y-axis; TODO confirm).
# `.loc` writes into `df` itself. The chained form `df['query_time'][0] = 0`
# triggers pandas' chained-assignment warning and does not modify `df` when
# Copy-on-Write is enabled.
df = read_df(experiment, files['MedianKDTree'], partition='1024')
df.loc[0, 'query_time'] = 0
plot_time(fig, df, 'Median KDTree (Partition=1024)')

df = read_df(experiment, files['MedianKDTree'], partition='1048576')
df.loc[0, 'query_time'] = 0
plot_time(fig, df, 'Median KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1024'), 'Cracking KDTree (Partition=1024)')
plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1048576'), 'Cracking KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1024'), 'Cracking KDTree Per Dimension (Partition=1024)')
plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1048576'), 'Cracking KDTree Per Dimension (Partition=1048576)')

plot_time(fig, read_df(experiment, files['Quasii'], partition='1024'), 'Quasii (Partition=1024)')
plot_time(fig, read_df(experiment, files['Quasii'], partition='1048576'), 'Quasii (Partition=1048576)')

plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.2', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.2)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.3', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.3)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.5', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.5)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.2', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.2)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.3', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.3)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.5', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.5)')

# Full-scan baselines (read from the '2cols' experiment; see the note at the top of this cell).
plot_time(fig, read_df('2cols', files['FullScan']), 'Full Scan')
plot_time(fig, read_df('2cols', files['FullScanCandidate']), 'Full Scan Candidate List')
fig.update_layout(title='Response time (2 columns, 10M rows, 0.01% selectivity)',
                   xaxis_title='Query',
                   yaxis_title='Time (seconds)')
fig.show()

# Time Breakdown

In [97]:
experiment = '2cols'

def stack_bar(dfs, names, title='Time Breakdown (2 columns, 10M rows, 0.01% selectivity)'):
    """Show a stacked bar chart of the total time spent in each query phase.

    One bar is drawn per entry of ``dfs``. Each bar stacks the column sums of
    ``initialization_time``, ``adaptation_time``, ``index_search_time`` and
    ``scan_time``, in that order.

    Args:
        dfs: sequence of DataFrames that contain the four time columns above.
        names: x-axis label for each DataFrame, in the same order as ``dfs``.
        title: figure title; defaults to the title of the 2-column experiment.
    """
    # (legend label, source column), in stacking order.
    components = [
        ('Initialization', 'initialization_time'),
        ('Adaptation', 'adaptation_time'),
        ('Index Search', 'index_search_time'),
        ('Scan', 'scan_time'),
    ]

    fig = go.Figure(data=[
        go.Bar(name=label, x=names, y=np.array([frame[column].sum() for frame in dfs]))
        for label, column in components
    ])
    fig.update_layout(barmode='stack', title=title, yaxis_title='Time (seconds)')
    fig.show()

# Deltas of the Progressive Index runs compared in the breakdown (Partition=1024).
breakdown_deltas = ('0.2', '0.3', '0.5')

# Other runs that can be added (frame to the first list, label to the second):
#   read_df(experiment, files['AverageKDTree'], partition='1024')               "Average KDTree"
#   read_df(experiment, files['MedianKDTree'], partition='1024')                "Median KDTree"
#   read_df(experiment, files['CrackingKDTree'], partition='1024')              "Cracking KDTree"
#   read_df(experiment, files['CrackingKDTreePerDimension'], partition='1024')  "Cracking KDTree PD"
#   read_df(experiment, files['Quasii'], partition='1024')                      "Quasii"
#   read_df(experiment, files['FullScan'])                                      "Full Scan"
#   read_df(experiment, files['FullScanCandidate'])                             "Full Scan Candidate List"
stack_bar(
    [
        read_df(experiment, files['ProgressiveIndex'], delta=delta, partition='1024')
        for delta in breakdown_deltas
    ],
    [f"Progressive Index (delta={delta})" for delta in breakdown_deltas],
)

In [90]:
# Selectivity of every query, in percent, for the Cracking KDTree run of the
# '2cols' experiment (Partition=1024): scan_overhead_after_adapt relative to
# tuples_scanned, scaled by 100.
df = read_df('2cols', files['CrackingKDTree'], partition='1024')
df['scan_overhead_after_adapt'].div(df['tuples_scanned']).mul(100)

0      0.010077
1      0.010127
2      0.009971
3      0.009943
4      0.009908
5      0.009873
6      0.010047
7      0.010138
8      0.010002
9      0.009895
10     0.010053
11     0.010003
12     0.009890
13     0.010052
14     0.010256
15     0.010115
16     0.009990
17     0.009907
18     0.009930
19     0.010077
20     0.010036
21     0.009841
22     0.009917
23     0.009892
24     0.010009
25     0.009883
26     0.009910
27     0.010039
28     0.010016
29     0.010076
30     0.010101
31     0.010131
32     0.010102
33     0.009841
34     0.009994
35     0.010094
36     0.009893
37     0.010013
38     0.009881
39     0.010005
40     0.010088
41     0.010029
42     0.010008
43     0.010056
44     0.010008
45     0.010120
46     0.009888
47     0.009896
48     0.010113
49     0.009938
50     0.009775
51     0.010019
52     0.010091
53     0.010031
54     0.010024
55     0.009902
56     0.010174
57     0.009883
58     0.009861
59     0.009963
60     0.010117
61     0.010186
62     0