In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator

import seaborn as sns
import os
import itertools

In [2]:
FIG_PREFIX = 'synthetic4'

In [3]:
precentages = [10, 30, 50]
exprs = [1, 2, 3, 4]

In [4]:
root_dir_fmt = '../result/synthetic_expr4_{}2' #/synthetic_expr4_rob0{}_{}_{}_result
root_free_dir = root_dir_fmt.format('free')
root_best_dir = root_dir_fmt.format('best')
root_lpms_dir = root_dir_fmt.format('lpms')
id_cols = ['name', 'num_threads', 'gram_size', 'selectivity', 'num_keys']

In [5]:
def build_stats_filename(row, reduced):
    name = row['name']
    if 'parallel' in name:
    # if '-' in name:
        name = name.split('-')[0]
    fields = [name, str(int(row['num_threads'])), str(int(row['gram_size'])), str(row['selectivity'] if row['selectivity'] > 0 else -1), 'stats.csv']

    if reduced is not None:
        fields.insert(-1, str(reduced))
    elif 'BEST' in name:
        fields.insert(-1, '-1')
    space_file = f"time_report_"
    if 'LPMS' in name:
        space_file += f"lpms_t{row['num_threads']}"
        space_file += '_determ' if 'DETER' in name else '_random'
        # max_num = -1
    else:
        space_file += f"{name.lower()}_t{row['num_threads']}"
        if reduced is not None:
            space_file += f"_red{reduced/1000:g}"
        space_file += f"_c{row['selectivity']}"
        if 'FREE' in name:
            space_file += f"_n{row['gram_size']}"
    if 'key_upper_bound' in row:
            space_file += f"_{row['key_upper_bound']}"
            fields.insert(-1, str(int(row['key_upper_bound'])))
    space_file += '.txt'
    return '_'.join(fields), space_file   

In [6]:
def get_compute_space(space_f, root_dir):
    fullname = os.path.join(root_dir, space_f)
    try:
        with open(fullname, 'r') as file:
            for line in file:
                if 'Maximum resident set size (kbytes)' in line:
                    return int(line.strip().split(': ')[-1].strip())
    except:
        # print(f'Error reading file {fullname}')
        pass
    return -1

In [7]:
def get_stats_df(row, root_dir, reduced=None):
    fname, space_f = build_stats_filename(row, reduced)
    return pd.read_csv(os.path.join(root_dir, fname), sep='\t'), get_compute_space(space_f, root_dir)

In [8]:
def add_num_after_filter(df, root_dir, reduced_list=None):
    new_cols = {
        'num_after_filter': [],
        'per_query_num_after_filter': [],
        'match_count': [],
        'compute_size': []
    }
    idx = 0
    reduced = None
    if reduced_list is not None:
        assert(reduced_list is None or len(reduced_list) == df.shape[0])
    for _, row in df.iterrows():
        if reduced_list is not None:
            reduced = reduced_list[idx]
            idx += 1
        detail_df, compute_size = get_stats_df(row, root_dir, reduced=reduced)
        new_cols['num_after_filter'].append(detail_df['num_after_filter'].sum())
        new_cols['per_query_num_after_filter'].append(detail_df['num_after_filter'].mean())
        new_cols['match_count'].append(detail_df['count'].sum())
        new_cols['compute_size'].append(compute_size)
    return df.assign(**new_cols)

In [9]:
def build_result_df(root_dir, reduced=None):
    summary_path = os.path.join(root_dir, 'summary.csv')
    summary_df = pd.read_csv(summary_path)
    summary_df.ffill(inplace=True)
    summary_df['overall_match_time'] = summary_df['compile_time'] + summary_df['match_time']
    summary_df.rename(columns={'overall_time': 'overall_index_time', '    name': 'name'}, inplace=True)
    summary_df['overall_time'] = summary_df['overall_index_time'] + summary_df['overall_match_time']
    # code for getting end-to-end trimmed mean
    summary_df = add_num_after_filter(summary_df, root_dir, reduced_list=reduced)
    res = summary_df.sort_values('overall_time')
    gb = res.groupby(by=id_cols)
    if gb.count()['selection_time'].min() > 2:
        res = res.drop(gb.tail(1).index)
        res = res.drop(gb.head(1).index)
    res_df = res.groupby(by=id_cols, as_index=False).min(numeric_only=True).apply(lambda x: x)
    return res_df

In [10]:
dfs = []
for e, p in itertools.product(exprs, precentages):
    root_dir = root_free_dir+f'/synthetic_expr4_rob0{e}_{p}_free_result2S'
    curr_df = build_result_df(root_dir)
    curr_df['expr'] = [e]*curr_df.shape[0]
    curr_df['prec_data_seen'] = [p]*curr_df.shape[0]
    dfs.append(curr_df)
res_free_df = pd.concat(dfs, ignore_index=True)
res_free_df.describe()

Unnamed: 0,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,index_size,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
count,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0
mean,16.0,4.0,0.23,598.729167,-1.0,0.0,0.023936,0.006777,0.030713,574115.6,0.002936,0.082359,0.085295,0.116008,181582.833333,1815.828333,85571.833333,13679.708333,2.5,30.0
std,0.0,1.635836,0.225836,1150.029496,0.0,0.0,0.018462,0.006422,0.02464,467314.2,0.001142,0.04541,0.045567,0.055678,120102.416424,1201.024164,54436.450026,2812.061522,1.11998,16.358356
min,16.0,2.0,0.02,0.0,-1.0,0.0,0.005813,0.001261,0.007572,8280.0,0.001271,0.007244,0.008534,0.016221,0.0,0.0,0.0,8972.0,1.0,10.0
25%,16.0,2.0,0.0875,12.75,-1.0,0.0,0.011768,0.002237,0.014623,206584.0,0.002188,0.060201,0.063686,0.093747,97275.0,972.75,62767.0,11066.0,1.75,10.0
50%,16.0,4.0,0.135,127.0,-1.0,0.0,0.019781,0.004143,0.023891,490384.0,0.003187,0.082646,0.087147,0.100041,180850.5,1808.505,84080.0,13496.0,2.5,30.0
75%,16.0,6.0,0.275,512.0,-1.0,0.0,0.024452,0.008596,0.032801,1032104.0,0.003511,0.102102,0.104497,0.155559,242964.75,2429.6475,106896.0,15259.0,3.25,50.0
max,16.0,6.0,0.7,4096.0,-1.0,0.0,0.078118,0.02855,0.104397,1577888.0,0.005641,0.258119,0.261488,0.287082,486470.0,4864.7,173411.0,21160.0,4.0,50.0


In [11]:
res_free_df = res_free_df[(res_free_df['match_count'] > 0) & (res_free_df['name'].str.contains('parallel'))]#.reset_index()
new_free = res_free_df.loc[res_free_df.groupby(['expr', 'prec_data_seen']).match_time.idxmin()].sort_values(by=['expr', 'num_keys'])
# new_groups = new_free.groupby(['method', 'gram_size'])

In [12]:
new_free

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
6,FREE-parallel,16,2,0.5,16,-1,0,0.006294,0.002212,0.008505,...,0.001436,0.081419,0.082855,0.09136,240545,2405.45,173411,11232,1,10
30,FREE-parallel,16,2,0.5,16,-1,0,0.005861,0.002004,0.007865,...,0.001376,0.081158,0.082534,0.090399,240545,2405.45,173411,10216,1,30
54,FREE-parallel,16,2,0.5,16,-1,0,0.006384,0.002235,0.008619,...,0.001431,0.080884,0.082315,0.090934,240545,2405.45,173411,11360,1,50
94,FREE-parallel,16,6,0.5,64,-1,0,0.012495,0.003671,0.016165,...,0.002368,0.081147,0.083515,0.09968,163355,1633.55,106896,12632,2,10
110,FREE-parallel,16,4,0.5,64,-1,0,0.011719,0.003842,0.015561,...,0.002439,0.081408,0.083847,0.099408,163355,1633.55,106896,12996,2,30
126,FREE-parallel,16,2,0.5,64,-1,0,0.010712,0.003093,0.013806,...,0.002409,0.081583,0.083992,0.097798,163355,1633.55,106896,12544,2,50
156,FREE-parallel,16,4,0.15,166,-1,0,0.02174,0.006449,0.028189,...,0.00342,0.065161,0.068581,0.09677,129390,1293.9,84080,13976,3,10
180,FREE-parallel,16,4,0.15,166,-1,0,0.017899,0.005976,0.023874,...,0.003528,0.065198,0.068726,0.0926,129390,1293.9,84080,13572,3,30
204,FREE-parallel,16,4,0.15,166,-1,0,0.020407,0.006006,0.026413,...,0.003428,0.065541,0.068969,0.095382,129390,1293.9,84080,13468,3,50
236,FREE-parallel,16,6,0.15,256,-1,0,0.02132,0.009233,0.030553,...,0.003499,0.059813,0.063312,0.093865,97275,972.75,62767,15028,4,10


In [13]:
dfs = []
for e, p in itertools.product(exprs, precentages):
    root_dir = root_best_dir+f'/synthetic_expr4_rob0{e}_{p}_best_result2'
    curr_df = build_result_df(root_dir)
    curr_df['expr'] = [e]*curr_df.shape[0]
    curr_df['prec_data_seen'] = [p]*curr_df.shape[0]
    dfs.append(curr_df)
e = 4
p = 10
root_dir = root_best_dir+f'/synthetic_expr4_rob0{e}_{p}_best_result3'
curr_df = build_result_df(root_dir)
curr_df['expr'] = [e]*curr_df.shape[0]
curr_df['prec_data_seen'] = [p]*curr_df.shape[0]
dfs.append(curr_df)
res_best_df = pd.concat(dfs, ignore_index=True)
res_best_df.describe()

Unnamed: 0,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,index_size,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
count,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0,90.0
mean,16.0,-1.0,0.275333,254.211111,30.6,1166.666667,225.476319,0.000652,225.476943,440524.8,0.005806,0.264085,0.269891,225.746834,567770.6,2661.964956,174988.7,47553.511111,2.833333,23.333333
std,0.0,0.0,0.256113,281.085396,77.401158,793.22978,488.554786,0.00057,488.555102,357834.5,0.005925,0.320008,0.32519,488.483486,608944.8,1214.195578,114517.000897,29256.65186,1.192203,15.864596
min,16.0,-1.0,0.02,16.0,-1.0,500.0,0.598805,2.8e-05,0.598954,9096.0,0.001255,0.057815,0.060034,0.722478,97275.0,972.75,62767.0,12300.0,1.0,10.0
25%,16.0,-1.0,0.05,64.0,-1.0,500.0,8.026525,0.000226,8.026685,200168.0,0.002011,0.063213,0.066135,8.149082,163355.0,1479.984,84080.0,31229.0,2.0,10.0
50%,16.0,-1.0,0.2,144.0,-1.0,500.0,33.69775,0.000574,33.69825,293336.0,0.002946,0.106888,0.108945,33.790323,326434.0,2549.86,106896.0,39716.0,3.0,10.0
75%,16.0,-1.0,0.5,293.0,20.0,1500.0,145.08625,0.000912,145.08725,600184.0,0.01542,0.324874,0.340413,145.153491,559985.0,3619.885,354198.0,64473.0,4.0,30.0
max,16.0,-1.0,0.7,1207.0,300.0,2500.0,2607.75,0.002698,2607.75,1147936.0,0.015662,1.187794,1.203285,2607.874085,2286665.0,4950.18,354198.0,137224.0,4.0,50.0


In [14]:
# res_best_df = res_best_df[(res_best_df['match_count'] > 0) & (res_best_df['name'].str.contains('parallel'))]#.reset_index()
new_best = res_best_df.loc[res_best_df.groupby(['expr', 'prec_data_seen']).match_time.idxmin()].sort_values(by=['expr', 'num_keys'])
# new_groups = new_best.groupby(['method', 'gram_size'])

In [15]:
res_best_df

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
0,BEST-parallel,16,-1,0.02,82,-1,500,0.598805,0.000149,0.598954,...,0.001272,0.122252,0.123524,0.722478,475325,4753.250,173411,12300,1,10
1,BEST-parallel,16,-1,0.05,131,-1,500,3.412670,0.000263,3.412930,...,0.001284,0.112366,0.113650,3.526580,445938,4459.380,173411,15620,1,10
2,BEST-parallel,16,-1,0.10,75,-1,500,2.541830,0.000187,2.542010,...,0.001257,0.088197,0.089454,2.631464,362119,3621.190,173411,18116,1,10
3,BEST-parallel,16,-1,0.20,64,-1,500,3.801640,0.000226,3.801860,...,0.001280,0.089145,0.090425,3.892285,346605,3466.050,173411,18040,1,10
4,BEST-parallel,16,-1,0.50,16,-1,500,1.106330,0.000157,1.106480,...,0.001289,0.059144,0.060433,1.166913,240545,2405.450,173411,21144,1,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,BEST-parallel,16,-1,0.50,251,300,500,60.195800,0.000942,60.196700,...,0.015541,0.325076,0.340617,60.537317,559985,1119.970,354198,41716,4,10
86,BEST-parallel,16,-1,0.70,20,20,500,3.569180,0.000096,3.569280,...,0.015501,0.864442,0.879943,4.449223,1707087,3414.174,354198,43372,4,10
87,BEST-parallel,16,-1,0.70,50,50,500,10.555700,0.000197,10.555900,...,0.015389,0.626484,0.641873,11.197773,1116714,2233.428,354198,43864,4,10
88,BEST-parallel,16,-1,0.70,100,100,500,43.037400,0.000358,43.037800,...,0.015632,0.427435,0.443067,43.480867,740622,1481.244,354198,44052,4,10


In [16]:
new_best

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
4,BEST-parallel,16,-1,0.5,16,-1,500,1.10633,0.000157,1.10648,...,0.001289,0.059144,0.060433,1.166913,240545,2405.45,173411,21144,1,10
10,BEST-parallel,16,-1,0.5,16,-1,1500,3.13858,0.000155,3.13873,...,0.001361,0.058673,0.060034,3.198764,240545,2405.45,173411,41216,1,30
17,BEST-parallel,16,-1,0.7,16,-1,2500,7.75624,0.000129,7.75637,...,0.001314,0.058889,0.060203,7.816573,240545,2405.45,173411,69436,1,50
23,BEST-parallel,16,-1,0.7,64,-1,500,6.71607,0.000394,6.71646,...,0.002045,0.071907,0.073952,6.790412,163355,1633.55,106896,31212,2,10
28,BEST-parallel,16,-1,0.5,64,-1,1500,23.6028,0.000363,23.6032,...,0.002028,0.072005,0.074033,23.677233,163355,1633.55,106896,61016,2,30
35,BEST-parallel,16,-1,0.7,64,-1,2500,57.4597,0.000406,57.4601,...,0.002033,0.072107,0.07414,57.53424,163355,1633.55,106896,91160,2,50
39,BEST-parallel,16,-1,0.2,144,-1,500,26.2065,0.000648,26.2071,...,0.002972,0.062655,0.065627,26.272727,129299,1292.99,84080,39136,3,10
47,BEST-parallel,16,-1,0.7,144,-1,1500,128.335,0.000546,128.336,...,0.003048,0.062697,0.065745,128.401745,129299,1292.99,84080,70380,3,30
49,BEST-parallel,16,-1,0.5,144,-1,2500,237.076,0.000607,237.077,...,0.002937,0.062678,0.065615,237.142615,129299,1292.99,84080,117372,3,50
55,BEST-parallel,16,-1,0.5,251,-1,500,58.2135,0.000804,58.2143,...,0.00303,0.057876,0.060906,58.275206,97301,973.01,62767,43824,4,10


In [17]:
dfs = []
for e, p in itertools.product(exprs, precentages):
    root_dir = root_lpms_dir+f'/synthetic_expr4_rob0{e}_{p}_lpms_result2S'
    curr_df = build_result_df(root_dir)
    curr_df['expr'] = [e]*curr_df.shape[0]
    curr_df['prec_data_seen'] = [p]*curr_df.shape[0]
    dfs.append(curr_df)
e = 4
p = 10
root_dir = root_lpms_dir+f'/synthetic_expr4_rob0{e}_{p}_lpms_result3'
curr_df = build_result_df(root_dir)
curr_df['expr'] = [e]*curr_df.shape[0]
curr_df['prec_data_seen'] = [p]*curr_df.shape[0]
dfs.append(curr_df)
res_lpms_df = pd.concat(dfs, ignore_index=True)
res_lpms_df.describe()

Unnamed: 0,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,index_size,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
count,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0
mean,16.0,-1.0,-1.0,53.266667,10.533333,1300.0,1.781053,-1.0,1.781053,116637.066667,0.004966,0.359057,0.364027,2.14508,749903.3,3949.342667,156270.4,30953.866667,2.8,26.0
std,0.0,0.0,0.0,77.855605,28.34448,861.891607,2.473466,0.0,2.473466,144039.693267,0.005517,0.40318,0.408663,2.699818,771308.0,1130.637889,109408.774181,15932.726275,1.207122,17.237832
min,16.0,-1.0,-1.0,0.0,-1.0,500.0,0.012103,-1.0,0.012103,8280.0,0.001231,0.087703,0.088934,0.101037,234904.0,2349.04,62767.0,16452.0,1.0,10.0
25%,16.0,-1.0,-1.0,8.0,-1.0,500.0,0.048738,-1.0,0.048738,11286.0,0.002001,0.135467,0.137413,0.212436,276812.0,2768.12,84080.0,19024.0,2.0,10.0
50%,16.0,-1.0,-1.0,16.0,-1.0,1500.0,0.399268,-1.0,0.399268,17232.0,0.002942,0.181387,0.184428,0.53075,500000.0,4490.412,106896.0,21400.0,3.0,30.0
75%,16.0,-1.0,-1.0,90.0,-1.0,2000.0,3.577995,-1.0,3.577995,229680.0,0.003099,0.258112,0.261211,4.27532,500000.0,5000.0,173411.0,50902.0,4.0,40.0
max,16.0,-1.0,-1.0,298.0,100.0,2500.0,8.68438,-1.0,8.68438,445720.0,0.015618,1.195813,1.211306,8.945441,2395106.0,5000.0,354198.0,54048.0,4.0,50.0


In [18]:
new_lpms = res_lpms_df.loc[res_lpms_df.groupby(['expr', 'prec_data_seen']).match_time.idxmin()].sort_values(by=['expr', 'num_keys'])

In [19]:
new_lpms

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,compile_time,match_time,overall_match_time,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen
0,LPMS-DETERM,16,-1,-1,0,-1,500,0.399268,-1,0.399268,...,0.00125,0.130232,0.131482,0.53075,500000,5000.0,173411,18272,1,10
1,LPMS-DETERM,16,-1,-1,0,-1,1500,0.095691,-1,0.095691,...,0.001261,0.129137,0.130398,0.226089,500000,5000.0,173411,16452,1,30
2,LPMS-DETERM,16,-1,-1,4,-1,2500,0.012103,-1,0.012103,...,0.001231,0.087703,0.088934,0.101037,308782,3087.82,173411,16656,1,50
4,LPMS-DETERM,16,-1,-1,8,-1,1500,0.035053,-1,0.035053,...,0.00199,0.141355,0.143345,0.178398,256655,2566.55,106896,19512,2,30
5,LPMS-DETERM,16,-1,-1,8,-1,2500,0.055251,-1,0.055251,...,0.002041,0.141491,0.143532,0.198783,256655,2566.55,106896,19492,2,50
3,LPMS-DETERM,16,-1,-1,9,-1,500,0.300079,-1,0.300079,...,0.002011,0.202794,0.204805,0.504884,500000,5000.0,106896,20136,2,10
8,LPMS-DETERM,16,-1,-1,12,-1,2500,0.034318,-1,0.034318,...,0.002928,0.127988,0.130916,0.165234,243879,2438.79,84080,18556,3,50
6,LPMS-DETERM,16,-1,-1,84,-1,500,1.19165,-1,1.19165,...,0.002971,0.194808,0.197779,1.389429,500000,5000.0,84080,31532,3,10
7,LPMS-DETERM,16,-1,-1,98,-1,1500,1.0422,-1,1.0422,...,0.002942,0.140703,0.143645,1.185845,296969,2969.69,84080,21424,3,30
11,LPMS-DETERM,16,-1,-1,16,-1,2500,0.042224,-1,0.042224,...,0.003041,0.181387,0.184428,0.226652,234904,2349.04,62767,21400,4,50


In [20]:
res_df = pd.concat([res_free_df, res_best_df, res_lpms_df], ignore_index=True)

In [21]:
# explore the distribution of number of keys and number
res_df['prop_error'] = (res_df['num_after_filter'] - res_df['match_count']) / res_df['num_after_filter']
res_df['precision'] = ( res_df['match_count']) / res_df['num_after_filter']
res_df['method'] = [name.split('-')[0] for name in res_df['name']]
new_res_df_temp = res_df[res_df['expr'] == 4]
# new_res_df_temp = new_res_df_temp.loc[new_res_df_temp.groupby(['expr', 'prec_data_seen', 'num_after_filter', 'precision', 'num_keys']).gram_size.idxmin()].sort_values(by=['prec_data_seen','num_keys'])
# new_groups = new_res_df.groupby(['expr', 'prec_data_seen', 'method', 'gram_size'])

In [22]:
new_res_df_temp.sort_values(by='precision')

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen,prop_error,precision,method
336,LPMS-DETERM,16,-1,-1.00,96,-1,500,3.436120,-1.000000,3.436120,...,3.697482,500000,5000.00,62767,49452,4,10,0.874466,0.125534,LPMS
337,LPMS-DETERM,16,-1,-1.00,298,-1,1500,8.684380,-1.000000,8.684380,...,8.945441,495036,4950.36,62767,54048,4,30,0.873207,0.126793,LPMS
171,FREE-parallel,16,2,0.10,4,-1,0,0.023834,0.001760,0.025594,...,0.287082,486470,4864.70,62767,14524,4,10,0.870975,0.129025,FREE
215,FREE-parallel,16,2,0.10,4,-1,0,0.022313,0.001973,0.024286,...,0.285685,486470,4864.70,62767,14016,4,50,0.870975,0.129025,FREE
193,FREE-parallel,16,2,0.10,4,-1,0,0.022449,0.001784,0.024233,...,0.284401,486470,4864.70,62767,13968,4,30,0.870975,0.129025,FREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,FREE-parallel,16,4,0.20,256,-1,0,0.023033,0.008936,0.031969,...,0.095657,97275,972.75,62767,15324,4,10,0.354747,0.645253,FREE
181,FREE-parallel,16,4,0.15,256,-1,0,0.022624,0.008362,0.030986,...,0.094808,97275,972.75,62767,15284,4,10,0.354747,0.645253,FREE
297,BEST-parallel,16,-1,0.20,256,-1,1500,291.564000,0.001052,291.565000,...,291.625888,97275,972.75,62767,82740,4,30,0.354747,0.645253,BEST
299,BEST-parallel,16,-1,0.70,256,-1,1500,282.602000,0.000922,282.603000,...,282.664179,97275,972.75,62767,87372,4,30,0.354747,0.645253,BEST


In [23]:
new_res_df_temp

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen,prop_error,precision,method
171,FREE-parallel,16,2,0.10,4,-1,0,0.023834,0.001760,0.025594,...,0.287082,486470,4864.700,62767,14524,4,10,0.870975,0.129025,FREE
172,FREE-parallel,16,2,0.12,253,-1,0,0.023121,0.008593,0.031714,...,0.095541,97275,972.750,62767,15120,4,10,0.354747,0.645253,FREE
173,FREE-parallel,16,2,0.15,256,-1,0,0.023137,0.008821,0.031958,...,0.095731,97275,972.750,62767,15060,4,10,0.354747,0.645253,FREE
174,FREE-parallel,16,2,0.20,256,-1,0,0.022899,0.009043,0.031942,...,0.099349,97275,972.750,62767,15032,4,10,0.354747,0.645253,FREE
175,FREE-parallel,16,2,0.50,256,-1,0,0.023565,0.008387,0.031952,...,0.095419,97275,972.750,62767,15136,4,10,0.354747,0.645253,FREE
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337,LPMS-DETERM,16,-1,-1.00,298,-1,1500,8.684380,-1.000000,8.684380,...,8.945441,495036,4950.360,62767,54048,4,30,0.873207,0.126793,LPMS
338,LPMS-DETERM,16,-1,-1.00,16,-1,2500,0.042224,-1.000000,0.042224,...,0.226652,234904,2349.040,62767,21400,4,50,0.732797,0.267203,LPMS
339,LPMS-DETERM,16,-1,-1.00,20,20,500,3.719870,-1.000000,3.719870,...,4.931176,2395106,4790.212,354198,52548,4,10,0.852116,0.147884,LPMS
340,LPMS-DETERM,16,-1,-1.00,50,50,500,3.877080,-1.000000,3.877080,...,5.041836,2245206,4490.412,354198,52476,4,10,0.842243,0.157757,LPMS


In [24]:
new_res_df = res_df[(res_df['expr'] == 4) & (res_df['prec_data_seen'] == 10)]
new_res_df = new_res_df.loc[new_res_df.groupby(['expr', 'prec_data_seen', 'num_after_filter', 'precision', 'num_keys']).gram_size.idxmin()].sort_values(by=['prec_data_seen','num_keys'])
new_groups = new_res_df.groupby(['expr', 'prec_data_seen', 'method', 'gram_size'])

In [29]:
new_res_df[new_res_df.precision == new_res_df.precision.max()]

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen,prop_error,precision,method
172,FREE-parallel,16,2,0.12,253,-1,0,0.023121,0.008593,0.031714,...,0.095541,97275,972.75,62767,15120,4,10,0.354747,0.645253,FREE
173,FREE-parallel,16,2,0.15,256,-1,0,0.023137,0.008821,0.031958,...,0.095731,97275,972.75,62767,15060,4,10,0.354747,0.645253,FREE
180,FREE-parallel,16,4,0.12,301,-1,0,0.024137,0.009268,0.033405,...,0.097375,97275,972.75,62767,15428,4,10,0.354747,0.645253,FREE


In [30]:
name_map = {
    'BEST' : '\\best ',
    'FREE' : '\\free ',
    'FAST' : '\\lpms ',
    'LPMS' : '\\lpms '
}

def check_min_cell(row, col, format_out, curr_group):
    curr_str = ''
    if row[col] == curr_group[col].min():
        curr_str += '& \cellcolor{green!50}{'
        curr_str += format_out
        curr_str += '} '
    else:
        curr_str += f'& {format_out} '
    return curr_str

def check_max_cell(row, col, format_out, curr_group):
    curr_str = ''
    if row[col] == curr_group[col].max():
        curr_str += '& \cellcolor{yellow!50}{'
        curr_str += format_out
        curr_str += '} '
    else:
        curr_str += f'& {format_out} '
    return curr_str
summary_df = None
lims = [20, 50, 100, 300]
for up_lim in lims:
    curr_df = new_res_df[new_res_df['num_keys'] <= up_lim]
    # print(curr_df)
    curr_group = curr_df.sort_values(by=['precision', 'overall_time'], ascending=[False, True]).groupby(['method']).first()
    # curr_group = curr_df.loc[curr_df.groupby(['method']).precision.idxmax()].sort_values(by='precision', ascending=False)
    curr_group['key_upper_bound'] = [up_lim]*curr_group.shape[0]
    curr_group = curr_group.sort_values(by='name')
    count = 0
    curr_row_count = curr_group.shape[0]
    curr_group['max_key'] = [up_lim]*curr_row_count    
    for _, row in curr_group.iterrows():
        curr_str = ''
        count += 1;
        if count == 1:
            curr_str += '\multirow{'+str(curr_row_count)+'}{*}{\\bf{'+str(up_lim)+'}}'
        curr_str += f'& {name_map[row["name"][:4]]} '

        curr_str += check_min_cell(row, 'overall_match_time', f'{row["overall_match_time"]:.3f}', curr_group)
        curr_str += check_min_cell(row, 'overall_index_time', f'{row["overall_index_time"]:.3f}', curr_group)
        curr_str += check_min_cell(row, 'compute_size', f'{row["compute_size"]/1000000:.3f}', curr_group)
        curr_str += check_min_cell(row, 'index_size', f'{row["index_size"]/1000000:.3f}', curr_group)
        curr_str += check_max_cell(row, 'precision', f'{row["precision"]:.4g}', curr_group)
        print(curr_str, end='')
        if count < curr_row_count:
            print("\\\\ ")
        else:
            print("\\\\ \midrule")
            
    app = []
    for m in ['BEST', 'FREE', 'LPMS']:
        if m not in curr_group.index:
            app.append({'name': m, 
                        'overall_index_time': 0,
                        'index_size': 0,
                        'compute_size': 0,
                        'precision': 0})
    for a in app:
        curr_group = curr_group.append(a, ignore_index=True)

    if summary_df is None:
        summary_df = curr_group
    else:
        summary_df = pd.concat([summary_df, curr_group], ignore_index=True)
summary_df['name'] = [ nm.split('-')[0] for nm in summary_df['name'].to_list()]

\multirow{3}{*}{\bf{20}}& \best  & 0.829 & 3.398 & 0.042 & 0.091 & 0.2084 \\ 
& \free  & \cellcolor{green!50}{0.190} & \cellcolor{green!50}{0.027} & \cellcolor{green!50}{0.016} & 0.446 & \cellcolor{yellow!50}{0.2672} \\ 
& \lpms  & 1.211 & 3.720 & 0.053 & \cellcolor{green!50}{0.010} & 0.1479 \\ \midrule
\multirow{3}{*}{\bf{50}}& \best  & 0.642 & 10.556 & 0.044 & 0.226 & \cellcolor{yellow!50}{0.3172} \\ 
& \free  & \cellcolor{green!50}{0.190} & \cellcolor{green!50}{0.027} & \cellcolor{green!50}{0.016} & 0.446 & 0.2672 \\ 
& \lpms  & 1.165 & 3.877 & 0.052 & \cellcolor{green!50}{0.013} & 0.1578 \\ \midrule
\multirow{3}{*}{\bf{100}}& \best  & 0.444 & 31.659 & 0.043 & 0.452 & \cellcolor{yellow!50}{0.4793} \\ 
& \free  & \cellcolor{green!50}{0.190} & \cellcolor{green!50}{0.027} & \cellcolor{green!50}{0.016} & 0.446 & 0.2672 \\ 
& \lpms  & 1.063 & 3.791 & 0.052 & \cellcolor{green!50}{0.016} & 0.1757 \\ \midrule
\multirow{3}{*}{\bf{300}}& \best  & \cellcolor{green!50}{0.061} & 90.661 & 0.043 &

In [27]:
summary_df

Unnamed: 0,name,num_threads,gram_size,selectivity,num_keys,key_upper_bound,num_queries,selection_time,build_time,overall_index_time,...,overall_time,num_after_filter,per_query_num_after_filter,match_count,compute_size,expr,prec_data_seen,prop_error,precision,max_key
0,BEST,16,-1,0.2,20,20,500,3.39795,8.6e-05,3.39804,...,4.227165,1700004,3400.008,354198,42496,4,10,0.791649,0.208351,20
1,FREE,16,2,0.7,16,20,0,0.02287,0.003815,0.026685,...,0.216252,234904,2349.04,62767,16460,4,10,0.732797,0.267203,20
2,LPMS,16,-1,-1.0,20,20,500,3.71987,-1.0,3.71987,...,4.931176,2395106,4790.212,354198,52548,4,10,0.852116,0.147884,20
3,BEST,16,-1,0.7,50,50,500,10.5557,0.000197,10.5559,...,11.197773,1116714,2233.428,354198,43864,4,10,0.682821,0.317179,50
4,FREE,16,2,0.7,16,50,0,0.02287,0.003815,0.026685,...,0.216252,234904,2349.04,62767,16460,4,10,0.732797,0.267203,50
5,LPMS,16,-1,-1.0,50,50,500,3.87708,-1.0,3.87708,...,5.041836,2245206,4490.412,354198,52476,4,10,0.842243,0.157757,50
6,BEST,16,-1,0.5,100,100,500,31.6587,0.000404,31.6591,...,32.103296,738973,1477.946,354198,43416,4,10,0.520689,0.479311,100
7,FREE,16,2,0.7,16,100,0,0.02287,0.003815,0.026685,...,0.216252,234904,2349.04,62767,16460,4,10,0.732797,0.267203,100
8,LPMS,16,-1,-1.0,96,100,500,3.79051,-1.0,3.79051,...,4.853159,2015358,4030.716,354198,52352,4,10,0.824251,0.175749,100
9,BEST,16,-1,0.2,251,300,500,90.6599,0.000841,90.6607,...,90.722148,97301,973.01,62767,42932,4,10,0.354919,0.645081,300


In [28]:
SMALL_SIZe = 45
MEDIUM_SIZE = 20
BIGGER_SIZE = 22
SMALL_SMALL_SIZe = 43

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SMALL_SIZE)    # fontsize of the tick labels
# plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize

plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

NameError: name 'SMALL_SIZE' is not defined

In [None]:
colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f','#e5c494']
for col in ['precision', 'overall_index_time', 'index_size', 'compute_size']:
    fig, ax = plt.subplots(figsize=(4,2.8))
        
    x = np.array(lims)-150  # the label locations
    width = 0.15 *1000 # the width of the bars
    multiplier = 0
    for method, color in zip(['BEST', 'FREE', 'LPMS'], colors):
        curr = overall_df.loc[overall_df["name"] == method]#.sort_values('name')
        
        X = curr['key_upper_bound']
        
        # fig, ax = plt.subplots(layout='constrained')
        prev = 0
        offset = width * multiplier
        Y = curr[col].to_numpy()
        if col == 'compute_size':
            rects = ax.bar(x + offset, Y, width, alpha=0.7, label=method, color=color)
        else:
            rects = ax.bar(x + offset, Y, width, alpha=0.7, color=color)

        multiplier += 1

        curr_all_df = res_df[(res_df["method"] == method) & (res_df["num_keys"] <= lims[-1])]
        ax.scatter(curr_all_df['num_keys'], curr_all_df[col], c=color)
        
    ylb = ' '.join([w.capitalize() for w in col.split('_')[-2:]])
    if col == 'overall_index_time':
        ax.set_yscale('log')
        ax.ticklabel_format(axis='x', scilimits=[-3,3])
        ylb += ' (log)'
    else:
        ax.ticklabel_format(scilimits=[-3,3])
    ax.set_ylabel(ylb)
    ax.set_xlabel('Key Upper Bound')

    ax.set_xticks(ticks=lims)#, labels=[f'{int(v/1000)}k' for v in lims[1:]])
    
    # fig.text(0.5, 0.04, 'Key Upper Bound', ha='center', va='center')
    if col == 'compute_size':
        plt.legend(loc='lower left', bbox_to_anchor=(1, 0))
    plt.grid()
    plt.savefig(f'figs/{FIG_PREFIX}_{col}.pdf', bbox_inches='tight')
    plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(20,4))

colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f','#e5c494']
for (name, new_group), color in zip(new_groups, colors):
    print(name)
    new_group.loc[new_group["selectivity"] == -1, "selectivity"] = 0

    X = new_group['index_size']
    Y = new_group['precision']
    curr_lab = new_group['name'].to_list()[0].split('-')[0]
    if name[1] != -1:
        curr_lab += f' n={name[1]}'
    ax.plot(X, Y, c=color, label=curr_lab)
    ax.scatter(X, Y, marker='o', #label=curr_lab, 
               c=color,
               edgecolors='black',# alpha=new_group['selectivity'].to_list(), 
               #s=1000*new_group['selectivity'].to_numpy()
              )
    # for g, s, x, y in zip(new_group['gram_size'], new_group['selectivity'], X, Y):
    #     ax.annotate('(%s, %s)' % (g, s), xy=(x,y), textcoords='data') # <--
plt.gca().set_xscale("log")
ax.set_xlabel('Index Size')
ax.set_ylabel('Precision')

# ax.set_xlim([0, 5000])
plt.legend()
plt.savefig(f'figs/{FIG_PREFIX}_constrSize_prec.pdf', bbox_inches='tight')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(20,4))

colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f','#e5c494']
for (name, new_group), color in zip(new_groups, colors):
    print(name)
    new_group.loc[new_group["selectivity"] == -1, "selectivity"] = 0

    X = new_group['overall_index_time']
    Y = new_group['precision']
    curr_lab = new_group['name'].to_list()[0].split('-')[0]
    if name[1] != -1:
        curr_lab += f' n={name[1]}'
    ax.plot(X, Y, c=color, label=curr_lab)
    ax.scatter(X, Y, marker='o', #label=curr_lab, 
               c=color,
               edgecolors='black',# alpha=new_group['selectivity'].to_list(), 
               #s=1000*new_group['selectivity'].to_numpy()
              )
    # for g, s, x, y in zip(new_group['gram_size'], new_group['selectivity'], X, Y):
    #     ax.annotate('(%s, %s)' % (g, s), xy=(x,y), textcoords='data') # <--
plt.gca().set_xscale("log")
ax.set_xlabel('Index Construction Time')
ax.set_ylabel('Precision')

# ax.set_xlim([0, 5000])
plt.legend()
plt.savefig(f'figs/{FIG_PREFIX}_constrTime_prec.pdf', bbox_inches='tight')
plt.show()

In [None]:
# explore the distribution of number of keys and number

groups = res_df.groupby('name')
colors = ['#7fc97f','#beaed4','#fdc086']
ax = plt.subplot()
for (name, group), color in zip(groups, colors):
    ax.scatter(group['num_keys'], group['num_after_filter'], c=color, label=name)
ax.set_xlabel('num_keys')
ax.set_ylabel('num_after_filter')

# inset Axes....
x1, x2, y1, y2 = 0,5000, 0, 30000000  # subregion of the original image
axins = ax.inset_axes(
    [0.3, 0.4, 0.67, 0.47],
    xlim=(x1, x2), ylim=(y1, y2), xticklabels=[], yticklabels=[])
for (name, group), color in zip(groups, colors):
    axins.scatter(group['num_keys'], group['num_after_filter'], c=color, label=name)
# axins.scatter(res_df['num_keys'], res_df['num_after_filter'])
plt.legend()
ax.indicate_inset_zoom(axins, edgecolor="black")

plt.show()

In [None]:
# get their configurations
curr_examine = res_df[(res_df['num_keys'] <= x2) & (res_df['num_after_filter'] <= y2)]
trimed = curr_examine.loc[curr_examine.groupby('num_after_filter').gram_size.idxmin()]
trimed = trimed.sort_values(by='num_after_filter')

In [None]:
trimed

In [None]:
groups = res_df.groupby('name')
colors = ['#7fc97f','#beaed4','#fdc086']
ax = plt.subplot()
ax.set_ylim([-0.01, 0.7])
for (name, group), color in zip(groups, colors):
    ax.scatter(group['num_keys'], group['precision'], c=color, label=name)
ax.set_xlabel('num_keys')
ax.set_ylabel('Precision')

# print( res_df['match_count'])
# print(res_df['num_after_filter'])
# print(res_df['prop_error'])
# print((239136631-913841)/239136631)

# inset Axes....
x1, x2, y1, y2 = 0,5500, 0.17, 0.6  # subregion of the original image
axins = ax.inset_axes(
    [0.37, 0.5, 0.6, 0.47],
    xlim=(x1, x2), ylim=(y1, y2), xticklabels=[], yticklabels=[])
for (name, group), color in zip(groups, colors):
    axins.scatter(group['num_keys'], group['precision'], c=color, label=name)
# axins.scatter(res_df['num_keys'], res_df['num_after_filter'])
plt.legend(loc='center right')
ax.indicate_inset_zoom(axins, edgecolor="black")

plt.show()

In [None]:
# get their configurations
curr_examine = res_df[(res_df['num_keys'] <= x2) & (res_df['precision']>= y1)]
trimed = curr_examine.loc[curr_examine.groupby('num_after_filter').gram_size.idxmin()]
sorted_df = trimed.sort_values(by='precision')

In [None]:
trimed

In [None]:
fig, ax = plt.subplots(figsize=(10,4))
new_res_df = res_df.loc[res_df.groupby(['num_after_filter', 'num_keys']).gram_size.idxmin()].sort_values(by='num_keys')

new_groups = new_res_df.groupby(['name', 'gram_size'])
colors = ['#66c2a5','#fc8d62','#8da0cb','#e78ac3','#a6d854','#ffd92f','#e5c494']
for (name, new_group), color in zip(new_groups, colors):
    print(name)
    new_group.loc[new_group["selectivity"] == -1, "selectivity"] = 1

    X = new_group['num_keys']
    Y = new_group['precision']
    curr_lab = new_group['name'].to_list()[0].split('-')[0]
    if name[1] != -1:
        curr_lab += f' n={name[1]}'
    ax.plot(X, Y, c=color)
    ax.scatter(X, Y, marker='o', label=curr_lab, c=color,
               edgecolors='black',# alpha=new_group['selectivity'].to_list(), 
               s=1000*new_group['selectivity'].to_numpy())
    # for g, s, x, y in zip(new_group['gram_size'], new_group['selectivity'], X, Y):
    #     ax.annotate('(%s, %s)' % (g, s), xy=(x,y), textcoords='data') # <--

ax.set_xlabel('num_keys')
ax.set_ylabel('precision')

ax.set_xlim([0, 5000])
plt.legend(loc="lower right", labelspacing=2)
plt.savefig(f'figs/{FIG_PREFIX}_numkey_prec.pdf', bbox_inches='tight')
plt.show()

In [None]:
new_res_df[new_res_df['num_keys'] < 5000].sort_values(by='num_after_filter', ascending=True)

In [None]:
data = np.sort(new_res_df['num_keys'].to_numpy())
fig, ax = plt.subplots(figsize=(10, 1))
ax.scatter(data, [10] * len(data))

In [None]:
# decide the split points
def pairwise_difference(lst):
    differences = []
    prec_diffs = []
    for i in range(len(lst) - 1):
        prec_diffs.append((lst[i + 1] - lst[i])/lst[i])
        differences.append(lst[i + 1] - lst[i])
    return differences, prec_diffs
diffs, prec_diffs = pairwise_difference(data)
idxs = np.argsort(diffs)
prec_idxs = np.argsort(prec_diffs)
print(idxs)

In [None]:
fig, ax = plt.subplots(figsize=(30, 1))
ax.scatter(data, [10] * len(data))
bins = [np.min(data)-1]
for idx in prec_idxs[-9:]:
    print(f'idx[{idx}], between {data[idx]} and {data[idx+1]}')
    curr_x = (data[idx] + data[idx + 1])/2
    bins.append(curr_x)
    ax.axvline(x=curr_x, color='red')
bins = np.sort(bins)
bins = np.append(bins, int(np.max(data)*1.01))

In [None]:
fig, ax = plt.subplots(figsize=(10, 1))
ax.scatter(data, [10] * len(data))
ax.set_xlim([0, 10000])

for idx in prec_idxs[-9:]:
    if data[idx] < 10000:
        print(f'idx[{idx}], between {data[idx]} and {data[idx+1]} at {(data[idx] + data[idx + 1])/2}')
        curr_x = (data[idx] + data[idx + 1])/2
        ax.axvline(x=curr_x, color='red')

In [None]:
print(bins)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(data, bins=bins, edgecolor='black', alpha=0.5)
print(np.log(bins))
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

plt.gca().set_xscale("log")
ax.set_xticks(bins)
ax.set_xticklabels([int(b) for b in bins], rotation=45)

ax2 = ax.twinx()

for (name, new_group), color in zip(new_groups, colors):
    new_group.loc[new_group["selectivity"] == -1, "selectivity"] = 1

    X = new_group['num_keys']
    Y = new_group['precision']
    ax2.plot(X, Y, c=color)
    curr_lab = new_group['name'].to_list()[0].split('-')[0]
    if name[1] != -1:
        curr_lab += f' n={name[1]}'
    sc = ax2.scatter(X, Y, marker='o', label=curr_lab, c=color,
               edgecolors='black', #alpha=new_group['selectivity'].to_list(), 
               s=1000*new_group['selectivity'].to_numpy())
    if name[0] == 'FREE-parallel':
        labs = (new_group['selectivity'].unique())
        labs = np.sort(labs)
        handles, labels = sc.legend_elements(prop="sizes", alpha=0.4,num=4)     
        # labels = new_group['selectivity'].unique()    
        # legend = ax.legend(handles, [" " , "Selectivity\nThreshold"," " ,  " "], title="Size")
        # ax.legend(*sc.legend_elements("sizes", num=4, alpha=0.5), loc='center right')

    # for g, s, x, y in zip(new_group['gram_size'], new_group['selectivity'], X, Y):
    #     ax2.annotate('(%s, %s)' % (g, s), xy=(x,y), textcoords='data') # <--
ax2.set_ylabel('Precision')
ax.set_ylabel('Index Count')
ax.set_xlabel('num_keys')
plt.legend(labelspacing=2)
plt.savefig(f'figs/{FIG_PREFIX}_numkey_prec_hist.pdf', bbox_inches='tight')
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

ax.hist(data, bins=bins, edgecolor='black');
# plt.gca().set_xscale("log")
ax2 = ax.twinx()

for (name, new_group), color in zip(new_groups, colors):
    new_group.loc[new_group["selectivity"] == -1, "selectivity"] = 1
    if name == 'FREE-parallel':
        curr_new_group = new_group[new_group['gram_size'] == 4]
    else:
        curr_new_group = new_group
    X = curr_new_group['num_keys']
    Y = curr_new_group['precision']
    ax2.plot(X, Y, c=color)
    ax2.scatter(X, Y, marker='o', label=curr_new_group['name'].to_list()[0], c=color,
               edgecolors='black', #alpha=new_group['selectivity'].to_list(), 
               s=1000*curr_new_group['selectivity'].to_numpy())
    for g, s, x, y in zip(curr_new_group['gram_size'], curr_new_group['selectivity'], X, Y):
        ax2.annotate('(%s, %s)' % (g, s), xy=(x,y), textcoords='data') # <--
ax2.set_ylabel('precision')
ax.set_ylabel('count')
ax.set_xlabel('num_keys')
ax.set_xlim([0, 6000])
plt.legend(labelspacing=2)

In [None]:
# bin them by number of 
BIN_NUM = 10
new_res_df.hist(column='num_keys', bins=BIN_NUM)
heights, bins = np.histogram(data, bins=BIN_NUM)

In [None]:
_, bins = np.histogram(np.log10(data + 1), bins=BIN_NUM)
plt.hist(data, bins=10**bins);
plt.gca().set_xscale("log")

In [None]:
edges = 10**bins
prev = 0
for b in edges:
    print(new_res_df[(new_res_df['num_keys'] < b) & (new_res_df['num_keys'] >= prev)].sort_values(by='num_after_filter', ascending=True)[['name', 'num_keys', 'selection_time', 'num_after_filter']])
    prev = b
print([int(e) for e in edges])

In [None]:
sns.histplot(new_res_df['num_keys'], bins=BIN_NUM, log_scale=True)

In [None]:
prev = 0
for b in bins:
    print(new_res_df[(new_res_df['num_keys'] < b) & (new_res_df['num_keys'] >= prev)].sort_values(by='num_after_filter', ascending=True))
    prev = b

In [None]:
def histedges_equalN(x, nbin):
    npt = len(x)
    return np.interp(np.linspace(0, npt, nbin + 1),
                     np.arange(npt),
                     np.sort(x))
x = new_res_df['num_keys'].to_numpy()
n, bins, patches = plt.hist(x, histedges_equalN(x, 20))

In [None]:
prev = 0
for b in bins:
    print(new_res_df[(new_res_df['num_keys'] < b) & (new_res_df['num_keys'] >= prev)].sort_values(by='num_after_filter', ascending=True))
    prev = b

In [None]:
best_df = res_df[res_df['name'].str.contains('BEST')]

In [None]:
free_df = res_df[(res_df['name'] == 'FREE') | (res_df['name'] == 'FREE-parallel')]

In [None]:
# free with c = 0.1, plot false positive rate against gramsize 
# free_df = res_df[(res_df['name'] == 'FREE') | (res_df['name'] == 'FREE-parallel')]
temp_free = free_df[free_df['selectivity'] == 0.1]

In [None]:
# gram size vs accuracy
temp = temp_free[temp_free['num_threads'] == 1]
ax = plt.subplot()
x = temp['gram_size']
y = (temp['num_after_filter'] - temp['match_count']) / temp['num_after_filter']
ax.plot(x, y) 
plt.xlabel('Max Gram Size')
plt.ylabel('False Positive Rate')
plt.show()

In [None]:
# gram size vs index size; gram size vs index build time
fig, axs = plt.subplots(2,1, gridspec_kw={'height_ratios': [2, 1]}, sharex=True)

colors = ['#66c2a5','#fc8d62','#8da0cb']

temp = temp_free[temp_free['num_threads'] == 1]
x = temp['gram_size']
y_1 = temp['overall_index_time']
y_1_1 = temp['selection_time']
y_1_2 = temp['build_time']

y_2 = temp['num_keys']

axs[0].stackplot(x, y_1_1, y_1_2, labels=[
    'Gram Selection', 'Posting Lists Filling' #, 'Overall Index Construction'
],
    colors=colors[:-1])

# axs[0].plot(x, y_1, label='Overall Index Construction') 
# axs[0].plot(x, y_1_1, label='Gram Selection') 
# axs[0].plot(x, y_1_2, label='Posting Lists Filling') 
axs[0].set_ylabel('Time (s)')
axs[0].legend(loc='upper left')

axs[1].plot(x, y_2, marker='o', c=colors[-1])
axs[1].set_ylabel('Num Selected')

plt.xlabel('Max Gram Size')
plt.savefig(f'figs/{FIG_PREFIX}free_n_vs_index-time_num-keys.pdf', bbox_inches='tight')

plt.show()

In [None]:
# thread vs. build time; label by gram size
ax = plt.subplot()
seq_colors = sns.color_palette("flare")#, as_cmap=True)
for i, gram_size in enumerate(temp_free['gram_size'].unique()):
    temp = temp_free[temp_free['gram_size'] == gram_size]
    x = temp['num_threads']
    y = temp['overall_index_time']
    ax.plot(x, y, label='FREE max {}-grams'.format(int(gram_size)), marker='o', color=seq_colors[i]) 
# ax.set_facecolor('gainsboro')
plt.xlabel('Number of Threads')
plt.ylabel('Index Building Time (s)')
plt.legend() 
plt.savefig(f'figs/{FIG_PREFIX}free_thread_vs_index-time.pdf', bbox_inches='tight')
plt.show()

In [None]:
# thread vs. build time; label by gram size
# temp_best = best_df[best_df['selectivity'] == 0.05]
# print(temp_best)
ax = plt.subplot()
seq_colors = sns.color_palette("crest")#, n_colors=11)#, as_cmap=True)
# for i, sel in enumerate(best_df['selectivity'].unique()):
for i, sel in enumerate([0.01, 0.05,0.1, 0.2]):
    temp = best_df[best_df['selectivity'] == sel]
    x = temp['num_threads']
    y = temp['overall_index_time']
    ax.plot(x, y, label='BEST c <= {}'.format(sel), marker='o', color=seq_colors[i]) 
# ax.set_facecolor('gainsboro')
plt.xlabel('Number of Threads')
plt.ylabel('Index Building Time (s)')
plt.legend() 
plt.savefig(f'figs/{FIG_PREFIX}_best_thread_vs_index-time.pdf', bbox_inches='tight')
plt.show()