In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt 

def json_dump(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f)

In [2]:
import ast
from selection.index_selection_evaluation import DBMSYSTEMS
from selection.what_if_index_creation import WhatIfIndexCreation
from selection.data_preparation import read_csv, index_conversion
from selection.table_generator import TableGenerator
from selection.workload import Query, Index, Column, Table

config_file = "config.json"
with open(config_file) as f:
    config = json.load(f)

In [3]:
config_file = "config.json"
with open(config_file) as f:
    config = json.load(f)
dbms_class = DBMSYSTEMS[config["database_system"]]
generating_connector = dbms_class(None, autocommit=True)
table_generator = TableGenerator(config["benchmark_name"], config["scale_factor"], generating_connector)

In [34]:
data_table_info = read_csv("../tpcds50trow.csv")[2:-2]
table_dict = {}
for table_info in data_table_info:
    table_info_tuple = table_info[0].split('|')
    if len(table_info_tuple) < 2: continue
    table_name = table_info_tuple[0].strip()
    row_count = float(table_info_tuple[1].strip())
    table = Table(table_name)
    table.set_row_count(row_count)
    table_dict[table_name] = table

In [35]:
TPC_DS_TABLE_PREFIX = {
    "dv": "dbgen_version",
    "ca": "customer_address",
    "cd": "customer_demographics",
    "d": "date_dim",
    "w": "warehouse",
    "sm": "ship_mode",
    "t": "time_dim",
    "r": "reason",
    "ib": "income_band",
    "i": "item",
    "s": "store",
    "cc": "call_center",
    "c": "customer",
    "web": "web_site",
    "sr": "store_returns",
    "hd": "household_demographics",
    "wp": "web_page",
    "p": "promotion",
    "cp": "catalog_page",
    "inv": "inventory",
    "cr": "catalog_returns",
    "wr": "web_returns",
    "ws": "web_sales",
    "cs": "catalog_sales",
    "ss": "store_sales"
}

data_column_info = read_csv("../tpcds50stats.csv")[2:-2]
column_dict = {}
for column_info in data_column_info:
    column_info_tuple = column_info[0].split('|')
    if len(column_info_tuple) < 3: continue
    column_name = column_info_tuple[0].strip()
    n_distinct = float(column_info_tuple[2].strip())
    column = Column(column_name)
    column.set_cardinality(n_distinct)
    if (prefix := column_name.split('_')[0]) in TPC_DS_TABLE_PREFIX.keys():
        table_name = TPC_DS_TABLE_PREFIX[prefix]
        if table_name in table_dict.keys():
            column.table = table_dict[table_name]
            column.table.add_column(column)      
    column_dict[column_name] = column

In [30]:
def convert_configuration_to_obj(columns_dict, confg_string):
    configs = []
    for config_s in confg_string:
        if config_s == "[]": 
            configs.append([])
            continue
        config = []
        indexes_s = config_s.split('I')
        for index_s in indexes_s:
            if index_s == '': continue
            table_columns = index_s.split('C')
            indexed_columns = []
            for table_column in table_columns:
                table_column = table_column.strip('(), ')
                if table_column == '': continue
                column_name = table_column.split('.')[-1]
                if column_name in columns_dict:
                    indexed_columns.append(columns_dict[column_name])
            config.append(Index(indexed_columns))
        configs.append(config)
    return configs

In [44]:
# data_list[i][0]: Query ID and Query text for the i-th query
# data_list[i][1]: Index configurations for the i-th query
# data_list[i][2]: Average cost of each configuration for the i-th query
# data_list[i][3]: Query execution plan of each configuration for the i-th query
# data_list[i][4]: Details execution costs (each query is executed 4 times and the last 3 times are recorded) of each configuration for the i-th query

ds_data_list_string = read_csv("../TPC_DS_50GB.csv")
def read_data(data_list_string):
    data, queries = [], []
    for i in range(len(data_list_string)):
        data_list_string[i][0] = ast.literal_eval(data_list_string[i][0])
        query = Query(data_list_string[i][0][0], data_list_string[i][0][1])
        query.columns = [column for column_name, column in column_dict.items() if column_name in query.text]
        queries.append(query)
        indexes_string = index_conversion(data_list_string[i][1])
        index_configurations = convert_configuration_to_obj(column_dict, indexes_string)
        average_costs = ast.literal_eval(data_list_string[i][2])
        plans = ast.literal_eval(ds_data_list_string[i][3])
        queries.append(query)
        data.append([query, index_configurations, average_costs, plans])
    return data, queries

ds_data, queries = read_data(ds_data_list_string)

### Index Filter

#### Labels

In [45]:
PHYISCAL_TO_LOGICAL_OPERATOR_MAP = {
    "Seq Scan": "Scan",
    "Bitmap Index Scan": "Scan",
    "Bitmap Heap Scan": "Scan",
    "Index Scan": "Scan",
    "Index Only Scan": "Scan",
    "CTE Scan": "Scan",
    "Subquery Scan": "Scan",
    "Sort": "Sort",
    "Incremental Sort": "Sort",
    "Hash Join": "Join",
    "Merge Join": "Join",
    "Nested Loop": "Join",
    "Aggregate": "Aggregate",
    "WindowAgg": "Aggregate",
    "Group": "Aggregate",
    "Gather Merge": "",
    "Gather": "",
    "BitmapOr": "",
    "BitmapAnd": "",
    "Limit": "",
    "Hash": "",
    "Result": "",
    "SetOp": "",
    "Append": "",
    "Materialize": "",
    "Unique": "",
    "Merge Append": "",
    
}

LOGICAL_OPERATORS = ["Scan", "Join", "Aggregate", "Sort"]

In [46]:
def has_child_node(query_plan):
    return "Plans" in query_plan.keys()

def has_filtering_property(query_plan):
    if "Filter" in query_plan.keys():
        return query_plan["Filter"]
    if "Hash Cond" in query_plan.keys():
        return query_plan["Hash Cond"]
    if "Join Filter" in query_plan.keys():
        return query_plan["Join Filter"]
    return ""

def is_join_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Join"

def is_sort_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Sort"

def is_aggregate_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Aggregate"

def is_scan_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Scan"

def check_indexed_column_in_condition(index, condition):
    for column in index.columns:
        if column.name in condition:
            return True

def get_table_from_plan_node(query_plan):
    table = ""
    if "Relation Name" in query_plan.keys():
        table = query_plan["Relation Name"]
    return table

In [47]:
physical_operators = set()

def collect_physical_operators(physical_operators, query_plan): 
    physical_operators.add(query_plan["Node Type"])
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            collect_physical_operators(physical_operators, child_node)
            
# for _, query_plan in query_plans_with_index.items():
#     collect_physical_operators(physical_operators, query_plan) 

for sample in ds_data:
    query_plans = sample[3]
    for query_plan in query_plans:
        collect_physical_operators(physical_operators, query_plan)

physical_operators = list(physical_operators)  
print(physical_operators)

['Nested Loop', 'Limit', 'Materialize', 'Unique', 'Gather Merge', 'Hash Join', 'Aggregate', 'Result', 'Bitmap Index Scan', 'Seq Scan', 'Index Only Scan', 'Group', 'BitmapAnd', 'Merge Join', 'SetOp', 'Append', 'Index Scan', 'Hash', 'Merge Append', 'Gather', 'WindowAgg', 'Bitmap Heap Scan', 'Subquery Scan', 'BitmapOr', 'Sort', 'CTE Scan', 'Incremental Sort']


#### Feautures

In [48]:
# get cardinatlity statistics
def generate_cardinality_statistics(db_connector, tables):
    for table in tables:
        row_count = db_connector.table_row_count(table.name)
        table.set_row_count(row_count)
        for column in table.columns:
            card = db_connector.get_column_cardinality(column)
            column.set_cardinality(card)

In [49]:
# signal 1
def estimate_index_utility(index, original_query_plan, indexed_query_plan):
    total_cost = 0
    if has_child_node(original_query_plan):
        for original_child_node, indexed_child_node in zip(indexed_query_plan["Plans"], indexed_query_plan["Plans"]):
            total_cost += estimate_index_utility(index, original_child_node, indexed_child_node)
    current_operator = indexed_query_plan["Node Type"]
    current_cost = original_query_plan["Total Cost"]
    if (condition := has_filtering_property(indexed_query_plan)) != "":
        if is_join_operator(current_operator):
            join_output_rows = indexed_query_plan["Plan Rows"]
            left_input_rows = indexed_query_plan["Plans"][0]["Plan Rows"]
            right_input_rows = indexed_query_plan["Plans"][1]["Plan Rows"]
            if check_indexed_column_in_condition(index, condition):    
                current_cost = (1-np.sqrt(join_output_rows/(left_input_rows*right_input_rows)))*original_query_plan["Total Cost"]
        else:
            selectivities = [indexed_query_plan["Plan Rows"]/column.table.row_count for column in index.columns if column.name in condition]
            average_selectivity = sum(selectivities)/len(selectivities) if len(selectivities) > 0 else 0
            current_cost = (1-average_selectivity)*original_query_plan["Total Cost"]
    elif is_sort_operator(current_operator) and "Sort Key" in indexed_query_plan.keys():
        sort_conditions = indexed_query_plan["Sort Key"]
        for sort_condition in sort_conditions:
            if check_indexed_column_in_condition(index, sort_condition):
                current_cost = indexed_query_plan["Total Cost"]
    elif is_aggregate_operator(current_operator) and "Group Key" in indexed_query_plan.keys():
        group_conditions = indexed_query_plan["Group Key"]
        for group_condition in group_conditions:
            if check_indexed_column_in_condition(index, group_condition):
                current_cost = indexed_query_plan["Total Cost"]
    return total_cost+current_cost

In [60]:
# signal 2
def extract_shape_of_query_and_index(index, original_query_plan, indexed_query_plan):
    query_shape, index_shape = {}, []
    _extract_query_shape(query_shape, original_query_plan)
    visited = set()
    _extract_index_shape(index_shape, index, indexed_query_plan, visited)
    return query_shape, index_shape

def _extract_query_shape(query_shape, query_plan):
    current_operator = query_plan["Node Type"]
    logical_operator = PHYISCAL_TO_LOGICAL_OPERATOR_MAP[current_operator]
    if is_scan_operator(current_operator):
        table = get_table_from_plan_node(query_plan)
        if table in query_shape.keys():
            query_shape[table].append(logical_operator)
        else:
            query_shape[table] = [logical_operator]
        return table
    
    tables = []    
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            table = _extract_query_shape(query_shape, child_node)
            if table and logical_operator:
                tables.append(table)
                query_shape[table].append(logical_operator)
    return tables[0] if 0<len(tables)<2 else ""

def _extract_index_shape(index_shape, index, query_plan, visited):
    current_operator = query_plan["Node Type"]
    logical_operator = PHYISCAL_TO_LOGICAL_OPERATOR_MAP[current_operator]
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            _extract_index_shape(index_shape, index, child_node, visited)
            
    if (condition := has_filtering_property(query_plan)) != "":
        for column in index.columns:
            if column in visited: continue
            elif column.name in condition:
                index_shape.append(logical_operator)
                visited.add(column)
    elif is_sort_operator(current_operator) and "Sort Key" in query_plan.keys():
        sort_conditions = query_plan["Sort Key"]
        for sort_condition in sort_conditions:
            for column in index.columns:
                if column in visited: continue
                elif column.name in sort_condition:
                    index_shape.append(logical_operator)
                    visited.add(column)
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        aggregate_conditions = query_plan["Group Key"]
        for aggregate_condition in aggregate_conditions:
            for column in index.columns:
                if column in visited: continue
                elif column.name in aggregate_condition:
                    index_shape.append(logical_operator)
                    visited.add(column)

In [62]:
# signal 3
def evaluate_operator_relevance(index, query_plan):
    result = {}
    _evaluate_operator_relevance(result, index, query_plan)
    return result

def _evaluate_operator_relevance(operator_relevance, index, query_plan):
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            _evaluate_operator_relevance(operator_relevance, index, child_node)
            
    current_operator = query_plan["Node Type"]
    relevance = 0
    if (condition := has_filtering_property(query_plan)) != "":
        selectivities = [query_plan["Plan Rows"]/column.table.row_count for column in index.columns if column.name in condition]
        relevance = sum(selectivities)/len(selectivities) if len(selectivities) > 0 else 0
    elif is_sort_operator(current_operator) and "Sort Key" in query_plan.keys():
        densities = []
        conditions = query_plan["Sort Key"]
        for condition in conditions:
            for column in index.columns:
                if column.name in condition:
                    densities.append(column.cardinality/column.table.row_count)
        relevance = sum(densities)/len(densities) if len(densities) > 0 else 0
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        densities = []
        conditions = query_plan["Group Key"]
        for condition in conditions:
            for column in index.columns:
                if column.name in condition:
                    densities.append(column.cardinality/column.table.row_count)
        relevance = sum(densities)/len(densities) if len(densities) > 0 else 0
    if current_operator not in operator_relevance: 
        operator_relevance[current_operator] = []
    operator_relevance[current_operator].append(relevance)

In [57]:
# signal 4
def get_number_of_pages(query_plan):
    return query_plan["Shared Hit Blocks"] + query_plan["Shared Read Blocks"] + query_plan["Local Hit Blocks"] + query_plan["Local Read Blocks"]

### seems not supported by postgres: https://stackoverflow.com/questions/20410444/postgres-ignoring-clustered-index-on-date-query
# check primary key
def count_clustered_index(db_connector, table_name):
    count = db_connector.count_clustered_indexes(table_name)
    return count

def check_bitmap(query_plan):
    current_operator = query_plan["Node Type"]
    use = False
    if is_scan_operator(current_operator):
        use = "Bitmap" in current_operator
    
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            use |= check_bitmap(child_node)
    return use

In [53]:
## testing
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder
# # create an example dataframe to work with
# df = pd.DataFrame([
#     ["Scan", "Join"],
#     ["Scan", "Aggregate", "Join"],
#     ["Aggregate", "Sort"],
#     ["Sort", "Join"],
#     ["Aggregate", "Scan"]
# ], columns=["operator1", "operator2", "operator3"])

# # create a OneHotEncoder that ignores (0 encodes) unseen categories
# # and encode the categorical features for the example dataframe
# encoder = OneHotEncoder(sparse_output=False)
# X_encoded = encoder.fit_transform(df)
# print(X_encoded)
# print(encoder.categories_)

In [65]:
labels = []
feature_columns = ["utility", "num_pages", "use_bitmap"]
operator_relevance_columns = [f"relevance_{operator}" for operator in LOGICAL_OPERATORS]
feature_columns.extend(operator_relevance_columns)
query_shape_columns = [f"query_shape_operator{i}_on_{table.name}" for table in table_dict.values() for i in range(5)]
feature_columns.extend(query_shape_columns)
index_shape_columns = [f"index_shape_operator{i}" for i in range(5)]
feature_columns.extend(index_shape_columns)

single_index_query_ids = [(i,j) for i, entry in enumerate(ds_data) for j, config in enumerate(entry[1]) if len(config) == 1]
features = pd.DataFrame(columns=feature_columns, index=range(len(single_index_query_ids)))


for k, (i,j) in enumerate(single_index_query_ids):
    query = ds_data[i][0]
    index_configs = ds_data[i][1]
    costs = ds_data[i][2]
    plans = ds_data[i][3]
    
    # only consider single index config
    index = index_configs[j][0]
    labels.append(costs[j])
    original_query_plan = plans[0] # no indexed query plan
    original_query_cost = costs[0] # no index query cost
    indexed_query_plan = plans[j]
    utility = estimate_index_utility(index, original_query_plan, indexed_query_plan)/original_query_cost
    query_shape, index_shape = extract_shape_of_query_and_index(index, original_query_plan, indexed_query_plan)
    for table, operator_seq in query_shape.items():
        for j, operator in enumerate(operator_seq):
            features.iloc[k][f"query_shape_operator{j}_on_{table}"] = operator
    for j, operator in enumerate(index_shape):
        features.iloc[k][f"index_shape_operator{i}"] = operator
    relevance = evaluate_operator_relevance(index, original_query_plan)
    for operator in LOGICAL_OPERATORS:
        if operator in relevance: features.iloc[k][f"relevance_{operator}"] = sum(relevance[operator])/len(relevance[operator])
        else: features.iloc[k][f"relevance_{operator}"] = 0
    num_pages = get_number_of_pages(indexed_query_plan)
    use_bitmap = check_bitmap(indexed_query_plan)
    features.iloc[k]["utility"] = utility
    features.iloc[k]["num_pages"] = num_pages
    features.iloc[k]["use_bitmap"] = int(use_bitmap)
    
features[["utility", "num_pages"]+operator_relevance_columns] = features[["utility", "num_pages"]+operator_relevance_columns].apply(pd.to_numeric)
features["use_bitmap"] = features["use_bitmap"].astype('int')

print("labels:\n", labels)
features


labels:
 [19060.650666666665, 86390.44, 31637.523333333334, 84700.35333333333, 28964.59866666667, 29166.20133333333, 29158.77, 29151.090999999997, 29048.755666666664, 29061.021999999997, 29803.020666666667, 19833.858000000004, 19516.41233333333, 19727.17066666667, 19733.321666666667, 8468.100666666667, 8577.196333333333, 1004.2973333333333, 2530.663666666667, 10901.480666666668, 10907.659666666666, 10843.375, 10852.939333333334, 4832.593666666667, 1515.6973333333333, 7016.096333333334, 7033.080333333334, 7009.8623333333335, 6946.266333333333, 7028.550666666667, 4570.013333333333, 8352.834333333334, 1148.7563333333335, 8358.429333333333, 7814.27, 1186.5693333333331, 4319.167666666667, 2596.393, 2542.141, 566.945, 2269.0626666666667, 29221.80733333333, 29201.373000000003, 8233.919, 9156.184, 9162.527333333333, 9137.969333333334, 1134.9986666666666, 10917.825333333332, 10751.322, 11104.259333333335, 10914.589333333333, 9327.393000000002, 11589.276333333333, 11845.522333333334, 11872.51533

Unnamed: 0,utility,num_pages,use_bitmap,relevance_Scan,relevance_Join,relevance_Aggregate,relevance_Sort,query_shape_operator0_on_dbgen_version,query_shape_operator1_on_dbgen_version,query_shape_operator2_on_dbgen_version,...,query_shape_operator0_on_store_sales,query_shape_operator1_on_store_sales,query_shape_operator2_on_store_sales,query_shape_operator3_on_store_sales,query_shape_operator4_on_store_sales,index_shape_operator0,index_shape_operator1,index_shape_operator2,index_shape_operator3,index_shape_operator4
0,2993.190102,2808510,0,0,0,-0.000002,-0.000002,,,,...,,,,,,Aggregate,,,,
1,2282.666436,2589713,0,0,0,0.002738,0.002738,,,,...,Scan,Join,,,,,Sort,,,
2,1588.985611,14223598,0,0,0,0.000000,0.000000,,,,...,Scan,Join,,,,,,,,
3,2282.254151,2586357,1,0,0,0.000000,0.000000,,,,...,Scan,Join,,,,,Join,,,
4,4500.301597,6781512,0,0,0,0.000000,0.000000,,,,...,,,,,,,,Join,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,828.058530,11683592,0,0,0,0.000000,0.000000,,,,...,,,,,,,,,,
708,2999.280964,2808510,0,0,0,-0.000002,-0.000002,,,,...,,,,,,,,,,
709,2764.022450,2590435,0,0,0,0.002738,0.002738,,,,...,Scan,Join,,,,,,,,
710,2761.515419,2590424,0,0,0,0.000000,0.000000,,,,...,Scan,Join,,,,,,,,


In [66]:
encoder = OneHotEncoder(sparse_output=False)
features_cat = features.select_dtypes(include="object")
features_num = features.select_dtypes(exclude="object")
features_cat_encoded = encoder.fit_transform(features_cat)
features_encoded = np.concatenate((features_num.to_numpy(), features_cat_encoded), axis=1)


In [71]:
# split training set and test set
total_samples = len(features_encoded)
train_size = int(0.67 * total_samples)
X_train, X_test, y_train, y_test = features_encoded[:train_size], features_encoded[train_size:], labels[:train_size], labels[train_size:]

#### Modeling

##### Random Forest

In [69]:
# RF parameters are set according to the paper
regr = RandomForestRegressor(n_estimators=40, max_depth=10, random_state=0).fit(X_train, y_train)

In [72]:
y_estimated = regr.predict(X_test)
y_estimated

array([ 29265.36550894,  29218.87182388,  29265.36550894,  29621.56193087,
        17693.46203303,  17628.95316986,  17643.40675736,  17680.62701359,
         9824.87040333,   9818.63235654,    847.69397358,   3473.2294146 ,
         9649.74603036,   9649.74603036,   9649.74603036,   9649.74603036,
         6397.45989189,    973.9708349 ,   6819.33863069,   6819.33863069,
         6876.6617718 ,   6819.33863069,   6876.6617718 ,   4263.15789435,
         6819.33863069,   9112.96013229,    964.16421456,   9377.36502151,
         9112.96013229,    993.28407393,   5329.70642508,   2105.83119125,
         2127.61695422,   3903.16424331,   2431.56707639,  27849.50326056,
        27160.67869759,   9513.09457426,   9707.79274537,   9386.07649537,
         9386.07649537,   1146.80810296,   9420.98293788,   9423.63022399,
         9420.98293788,   9416.72470176,   6620.43021783,   9760.43774628,
         9810.31832584,   9830.62550775,  16164.7776532 ,  12373.46768374,
        10511.21334804,  

In [73]:
filter_threshold = 0.05

filtered_query_index_pairs = []
test_set = single_index_query_ids[train_size:]
for k, (i,j) in enumerate(test_set):
    query = ds_data[i][0]
    index_configs = ds_data[i][1]
    costs = ds_data[i][2]
    plans = ds_data[i][3]
    
    original_query_cost = costs[j]
    percentage_diff = (original_query_cost - y_estimated[k])/ original_query_cost
    if percentage_diff >= filter_threshold:
        filtered_query_index_pairs.append((query, index))

In [74]:
filtered_query_index_pairs

[(Q207, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q207, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q207, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q207, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q212, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q213, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q213, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q213, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q213, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q215, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q218, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q218, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q218, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q220, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q221, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q221, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q222, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q222, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q226, I(C item.i_manufact_id,C item.i_item_sk)),
 (Q226, I(C item.i_manufact_id,

### Index Cost Model

In [71]:
import itertools

query_configurations = {}
for (query, index) in filtered_query_index_pairs:
    if query not in query_configurations:
        query_configurations[query] = set()
    query_configurations[query].add(index)


for query, indexes in query_configurations.items():
    combi = itertools.combinations(list(indexes), 3) # not sure, hard-coded the 3 here
    query_configurations[query] = list(combi)

In [None]:
def calculate_parameter_selectivity(query_plan):
    param_selectivities = []
    _calculate_parameter_selectivity(param_selectivities, query_plan)
    return param_selectivities
    
def _calculate_parameter_selectivity(param_selectivities, query_plan):
    if has_child_node(original_query_plan):
        for child_node in query_plan["Plans"]:
            _calculate_parameter_selectivity(child_node)
    if (condition := has_filtering_property(query_plan)) != "":
        for column in index.columns:
            if column.name in condition:
                param_selectivities.append(column, query_plan["Plan Rows"]/column.table.row_count)

def evaluate_configuration(configuration_features, configuration, query_plan):
    current_operator = query_plan["Node Type"]
    if has_child_node(original_query_plan):
        for child_node in query_plan["Plans"]:
            evaluate_configuration(configuration_features, configuration, child_node)
    if (condition := has_filtering_property(query_plan)) != "":
        for index in configuration:
            for column in index.columns:
                if column in condition:
                    configuration_features[f"selectivity_{column.name}"] = query_plan["Plan Rows"]/column.table.row_count
                    if column.name not in configuration_features.keys():
                        configuration_features[column.name] = []
                    configuration_features[column.name].append(current_operator)
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        conditions = query_plan["Group Key"]
        for index in configuration:
            for column in index.columns:
                for condition in conditions:
                    if column in condition:
                        if column.name not in configuration_features.keys():
                            configuration_features[column.name] = []
                        configuration_features[column.name].append(current_operator)
    elif is_sort_operator(current_operator) and "Sort Key" in query_plan:
        conditions = query_plan["Sort Key"]
        for index in configuration:
            for column in index.columns:
                for condition in conditions:
                    if column in condition:
                        if column.name not in configuration_features.keys():
                            configuration_features[column.name] = []
                        configuration_features[column.name].append(current_operator)

In [None]:
# training 

# parameters
error_threshold = 0.05
training_sample_size_alpha = None
training_sample_size_beta = None
stopping_threshold = None

In [1]:
query_configuration_list = [(q, c) for q, c in query_configurations.items()]
feature_columns = []

features = pd.DataFrame(columns=feature_columns, index=range(len(query_configuration_list)))

for i, (query, config) in enumerate(query_configuration_list):
    features.iloc[i]["selectivity"]

    

SyntaxError: invalid syntax (4096138919.py, line 7)

#### Modeling

##### Random Forest

In [None]:
regr = RandomForestRegressor(n_estimators=5, max_depth=6, random_state=0).fit(X_train, y_train)