In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import numpy as np
pd.set_option('display.max_rows', 1000)
%matplotlib inline

In [8]:
# Algorithm key -> token that identifies the algorithm inside a result CSV file name
# (this is the value passed as `name` to read_df).
files = dict(
    AverageKDTree='average_kd_tree',
    MedianKDTree='median_kd_tree',
    CrackingKDTree='cracking_kd_tree',
    CrackingKDTreePerDimension='cracking_kd_tree_pd',
    Quasii='quasii',
    FullScan='full_scan',
    FullScanCandidate='full_scan_cl',
    ProgressiveIndex='progressive_index',
    ProgressiveIndexAdaptive='progressive_index_adaptive',
)
def read_df(experiment, name, delta='0.0', partition='0', results_dir='results'):
    """Load one benchmark result file and average it over its repetitions.

    Reads ``{results_dir}/{experiment}-{name}-{delta}-{partition}.csv``. The
    number of repetitions is taken as ``max(repetition) + 1`` (so repetition ids
    are assumed to start at 0), and the file is treated as that many equally
    sized, consecutive groups of rows which are averaged row by row.

    Parameters
    ----------
    experiment : str
        First component of the file name.
    name : str
        Algorithm token of the file name (a value of ``files``).
    delta : str
        Delta component of the file name.
    partition : str
        Partition component of the file name.
    results_dir : str
        Directory that contains the CSV files.

    Returns
    -------
    pandas.DataFrame
        One row per position within a repetition. Every column of the file is
        averaged; ``index_search_time`` is added as 0.0 when the file lacks it,
        ``query_time`` is the sum of the initialization, index search, scan and
        adaptation times, and ``query_time_cumsum`` is its running total.

    Raises
    ------
    ValueError
        If the file contains no data rows.
    """
    path = f"{results_dir}/{experiment}-{name}-{delta}-{partition}.csv"
    df = pd.read_csv(path)
    if df.empty:
        # Without this, the failure surfaces later as an opaque
        # "cannot convert float NaN to integer".
        raise ValueError(f"no result rows in {path}")

    repetitions = int(df['repetition'].max()) + 1
    step = len(df.index) // repetitions

    # reset_index() relabels every slice 0..step-1 so the slices add up row by
    # row. It also keeps the previous labels as an 'index' column, which is
    # averaged like any other column; that column is kept so the returned
    # schema stays the same.
    df_final = df[:step].copy().reset_index()
    for rep in range(1, repetitions):
        df_final += df[step * rep : step * (rep + 1)].copy().reset_index()

    df_final = df_final / repetitions

    # Some result files have no index search column; treat it as zero cost.
    if 'index_search_time' not in df_final:
        df_final['index_search_time'] = 0.0
    df_final['query_time'] = (
        df_final['initialization_time']
        + df_final['index_search_time']
        + df_final['scan_time']
        + df_final['adaptation_time']
    )
    df_final['query_time_cumsum'] = df_final['query_time'].cumsum()
    return df_final

# Cumulative Response Time
Cumulative response time using 2 attributes, 10M rows, and 0.1 selectivity

In [9]:
# Experiment whose result files are plotted in this cell (first component of each CSV name).
experiment = 'query0'
# Figure that collects one line per algorithm.
fig = go.Figure()

def plot_time(fig, df, name):
    """Add the ``query_time_cumsum`` column of ``df`` to ``fig`` as a line trace labelled ``name``."""
    cumulative_trace = go.Scatter(
        y=df['query_time_cumsum'],
        mode='lines',
        name=name,
    )
    fig.add_trace(cumulative_trace)

# One cumulative-time line per algorithm. Every legend label repeats the
# partition/delta that is passed to read_df; the two must be changed together
# when switching to one of the commented-out alternatives.
plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1024'), 'Average KDTree (Partition=1024)')
# plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1048576'), 'Average KDTree (Partition=1048576)')

# plot_time(fig, read_df(experiment, files['MedianKDTree'], partition='1024'), 'Median KDTree (Partition=1024)')
# plot_time(fig, read_df(experiment, files['MedianKDTree'], partition='1048576'), 'Median KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1024'), 'Cracking KDTree (Partition=1024)')
# plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1048576'), 'Cracking KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1024'), 'Cracking KDTree Per Dimension (Partition=1024)')
# plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1048576'), 'Cracking KDTree Per Dimension (Partition=1048576)')

plot_time(fig, read_df(experiment, files['Quasii'], partition='1024'), 'Quasii (Partition=1024)')
# plot_time(fig, read_df(experiment, files['Quasii'], partition='1048576'), 'Quasii (Partition=1048576)')

# The labels previously said delta=0.2 (and 0.3 for the delta='0.2' alternative)
# although the data is read from the delta='0.1' (resp. '0.2') result files.
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.1', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.1)')
# plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.2', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.2)')
# plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.5', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.5)')

plot_time(fig, read_df(experiment, files['ProgressiveIndexAdaptive'], delta='0.1', partition='1024'), 'ProgressiveIndexAdaptive (Partition=1024, delta=0.1)')

# Full-scan baseline, read with read_df's default delta and partition.
plot_time(fig, read_df(experiment, files['FullScan']), 'Full Scan')
# plot_time(fig, read_df(experiment, files['FullScanCandidate']), 'Full Scan Candidate List')

fig.update_layout(
    title='Cumulative response time',
    xaxis_title='Query',
    yaxis_title='Time (seconds)',
)
fig.show()

# Response Time
Per-query response time using 2 attributes, 10M rows, and 0.1 selectivity

In [41]:
# Experiment whose result files are plotted in this cell (first component of each CSV name).
experiment = '16cols'
# Fresh figure for the per-query plot; this rebinds the `fig` used by the previous plot cell.
fig = go.Figure()

def plot_time(fig, df, name):
    """Add the per-query ``query_time`` column of ``df`` to ``fig`` as a line trace labelled ``name``."""
    per_query_trace = go.Scatter(
        y=df['query_time'],
        mode='lines',
        name=name,
    )
    fig.add_trace(per_query_trace)

# One per-query line per algorithm and configuration.
plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1024'), 'Average KDTree (Partition=1024)')
plot_time(fig, read_df(experiment, files['AverageKDTree'], partition='1048576'), 'Average KDTree (Partition=1048576)')

# The first query of the Median KDTree runs is zeroed before plotting
# (presumably so the up-front index build does not dominate the y-axis -- confirm).
# .loc writes into the frame itself; the former chained form
# df['query_time'][0] = 0 raises SettingWithCopyWarning and has no effect
# under pandas Copy-on-Write.
df = read_df(experiment, files['MedianKDTree'], partition='1024')
df.loc[0, 'query_time'] = 0
plot_time(fig, df, 'Median KDTree (Partition=1024)')

df = read_df(experiment, files['MedianKDTree'], partition='1048576')
df.loc[0, 'query_time'] = 0
plot_time(fig, df, 'Median KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1024'), 'Cracking KDTree (Partition=1024)')
plot_time(fig, read_df(experiment, files['CrackingKDTree'], partition='1048576'), 'Cracking KDTree (Partition=1048576)')

plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1024'), 'Cracking KDTree Per Dimension (Partition=1024)')
plot_time(fig, read_df(experiment, files['CrackingKDTreePerDimension'], partition='1048576'), 'Cracking KDTree Per Dimension (Partition=1048576)')

plot_time(fig, read_df(experiment, files['Quasii'], partition='1024'), 'Quasii (Partition=1024)')
plot_time(fig, read_df(experiment, files['Quasii'], partition='1048576'), 'Quasii (Partition=1048576)')

plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.2', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.2)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.3', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.3)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.5', partition='1024'), 'ProgressiveIndex (Partition=1024, delta=0.5)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.2', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.2)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.3', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.3)')
plot_time(fig, read_df(experiment, files['ProgressiveIndex'], delta='0.5', partition='1048576'), 'ProgressiveIndex (Partition=1048576, delta=0.5)')

# NOTE(review): the full-scan baselines are read from the '2cols' experiment and
# the title below says "2 columns", while every other series in this cell uses
# `experiment` ('16cols' at the time of writing) -- confirm which is intended.
plot_time(fig, read_df('2cols', files['FullScan']), 'Full Scan')
plot_time(fig, read_df('2cols', files['FullScanCandidate']), 'Full Scan Candidate List')

fig.update_layout(
    title='Response time (2 columns, 10M rows, 0.01% selectivity)',
    xaxis_title='Query',
    yaxis_title='Time (seconds)',
)
fig.show()

# Time Breakdown

In [17]:
# Experiment whose result files feed the time-breakdown chart in this cell.
experiment = 'query8'

def stack_bar(dfs, names, title='Time Breakdown (2 columns, 10M rows, 0.01% selectivity)'):
    """Show a stacked bar chart of the total time spent per phase for each result frame.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames providing the columns ``initialization_time``, ``adaptation_time``,
        ``index_search_time`` and ``scan_time``; each column is summed per frame.
    names : sequence of str
        Bar labels for the x axis, in the same order as ``dfs``.
    title : str
        Chart title.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    def phase_totals(column):
        # Total of one phase column for every frame, in the order of `dfs`.
        return np.array([frame[column].sum() for frame in dfs])

    # Each entry is (legend name, source column); the stacking order follows this list.
    phases = [
        ('Initialization', 'initialization_time'),
        ('Adaptation', 'adaptation_time'),
        ('Index Search', 'index_search_time'),
        ('Scan', 'scan_time'),
    ]
    fig = go.Figure(data=[
        go.Bar(name=label, x=names, y=phase_totals(column))
        for label, column in phases
    ])
    # barmode='stack' piles the four phase bars of a frame on top of each other.
    fig.update_layout(barmode='stack', title=title, yaxis_title='Time (seconds)')
    fig.show()

# Draw the breakdown for the selected algorithms. The i-th frame in `dfs` is
# labelled by the i-th entry of `names`, so (un)comment entries in both lists together.
stack_bar(
    dfs=[
        read_df(experiment, files[algorithm], **read_args)
        for algorithm, read_args in [
            # ('AverageKDTree', {'partition': '1024'}),
            # ('MedianKDTree', {'partition': '1024'}),
            ('CrackingKDTree', {'partition': '1024'}),
            ('CrackingKDTreePerDimension', {'partition': '1024'}),
            # ('Quasii', {'partition': '1024'}),
            # ('FullScan', {}),
            # ('FullScanCandidate', {}),
            # ('ProgressiveIndex', {'delta': '0.2', 'partition': '1024'}),
            # ('ProgressiveIndex', {'delta': '0.3', 'partition': '1024'}),
            # ('ProgressiveIndex', {'delta': '0.5', 'partition': '1024'}),
        ]
    ],
    names=[
        # "Average KDTree",
        # "Median KDTree",
        "Cracking KDTree",
        "Cracking KDTree PD",
        # "Quasii",
        # "Full Scan",
        # "Full Scan Candidate List",
        # "Progressive Index (delta=0.2)",
        # "Progressive Index (delta=0.3)",
        # "Progressive Index (delta=0.5)",
    ],
)

In [9]:
# Selectivity in percent: scan_overhead_after_adapt relative to tuples_scanned, per query.
df = read_df('query0', files['CrackingKDTree'], partition='1024')
df['scan_overhead_after_adapt'].div(df['tuples_scanned']).mul(100)

0       0.000273
1       0.000296
2       0.000320
3       0.000320
4       0.000292
          ...   
1275         NaN
1276         NaN
1277         NaN
1278         NaN
1279         NaN
Length: 1280, dtype: float64

In [28]:
# Show the averaged Full Scan rows of 'query0' (at most 1000, the display.max_rows limit set in the first cell).
read_df('query0', files['FullScan']).head(1000)

Unnamed: 0,index,adaptation_time,initialization_time,scan_overhead,scan_time,tuples_scanned,repetition,index_search_time,query_time,query_time_cumsum
0,0.0,0.0,0.0,27.265638,0.278447,10000000.0,0.0,0.0,0.278447,0.278447
1,1.0,0.0,0.0,29.642981,0.271215,10000000.0,0.0,0.0,0.271215,0.549662
2,2.0,0.0,0.0,31.984749,0.268979,10000000.0,0.0,0.0,0.268979,0.818641
3,3.0,0.0,0.0,31.984749,0.267985,10000000.0,0.0,0.0,0.267985,1.086626
4,4.0,0.0,0.0,29.206886,0.291298,10000000.0,0.0,0.0,0.291298,1.377924
5,5.0,0.0,0.0,29.081942,0.268629,10000000.0,0.0,0.0,0.268629,1.646553
6,6.0,0.0,0.0,31.47683,0.268234,10000000.0,0.0,0.0,0.268234,1.914787
7,7.0,0.0,0.0,22.598104,0.273128,10000000.0,0.0,0.0,0.273128,2.187915
8,8.0,0.0,0.0,9.8422,0.287846,10000000.0,0.0,0.0,0.287846,2.475761
9,9.0,0.0,0.0,7.493218,0.290225,10000000.0,0.0,0.0,0.290225,2.765986
