In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
import lightgbm as lgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt 

def json_dump(filename, data):
    with open(filename, 'w') as f:
        json.dump(data, f)

In [2]:
import logging
import pickle
from selection.index_selection_evaluation import DBMSYSTEMS
from selection.query_generator import QueryGenerator
from selection.table_generator import TableGenerator
from selection.what_if_index_creation import WhatIfIndexCreation
from selection.workload import Workload

config_file = "config.json"
with open(config_file) as f:
    config = json.load(f)

In [3]:
dbms_class = DBMSYSTEMS[config["database_system"]]
generating_connector = dbms_class(None, autocommit=True)

OperationalError: connection to server on socket "/tmp/.s.PGSQL.5432" failed: No such file or directory
	Is the server running locally and accepting connections on that socket?


In [None]:
table_generator = TableGenerator(
    config["benchmark_name"], config["scale_factor"], generating_connector
)
database_name = table_generator.database_name()

In [None]:
database_system = config["database_system"]
db_connector = DBMSYSTEMS[database_system](database_name)
what_if = WhatIfIndexCreation(db_connector)

In [4]:
query_generator = QueryGenerator(
    config["benchmark_name"],
    config["scale_factor"],
    db_connector,
    config["queries"],
    table_generator.columns,
)

NameError: name 'db_connector' is not defined

In [7]:
workload = Workload(query_generator.queries)

In [10]:
from selection.candidate_generation import syntactically_relevant_indexes
all_syntactically_relevant_indexes, query_index_pairs = set(), []
for query in workload.queries:
    indexes = syntactically_relevant_indexes(query, len(query.columns))
    all_syntactically_relevant_indexes.update(indexes)
    query_index_pairs.extend([(query, index) for index in indexes])
print(len(query_index_pairs))

16730


### Index Filter

#### Labels

In [9]:
from selection.index_selection_evaluation import IndexSelection 

workload_cost = None
index_selection = IndexSelection()
index_selection._run_algorithms(config_file)

In [None]:
csv_path= f"benchmark_results/results_no_index_tpch_19_queries.csv"
no_index_df = pd.read_csv(csv_path, sep=';')
workload_cost_no_index = 0
for _, v in no_index_df.loc[0, "q1": "q22"].to_dict().items():
    workload_cost_no_index += float(json.loads(v)["Cost"])
print(workload_cost_no_index)

46075564.57


In [8]:
from selection.cost_evaluation import CostEvaluation
cost_evaluation = CostEvaluation(db_connector)

# workload_cost_with_index = []
# for index in all_syntactically_relevant_indexes:
#     cost = cost_evaluation.calculate_cost(workload, set([index]))
#     workload_cost_with_index.append(round(cost, 2))
#     cost_evaluation._unsimulate_or_drop_index(index)

# print(workload_cost_with_index)

In [12]:
PHYISCAL_TO_LOGICAL_OPERATOR_MAP = {
    "Seq Scan": "Scan",
    "Bitmap Index Scan": "Scan",
    "Bitmap Heap Scan": "Scan",
    "Index Scan": "Scan",
    "Index Only Scan": "Scan",
    "Sort": "Sort",
    "Hash Join": "Join",
    "Merge Join": "Join",
    "Nested Loop": "Join",
    "Aggregate": "Aggregate",
    "Gather Merge": "",
    "Gather": "",
    "BitmapOr": "",
    "Limit": "",
    "Hash": "",
}

LOGICAL_OPERATORS = ["Scan", "Join", "Aggregate", "Sort"]

#### Feautures

In [11]:
query_plans_with_index, query_costs_with_index = {}, {}
query_plans_with_index_dump = []
for query in workload.queries:
    query_plan = db_connector.get_plan_with_statistics(query)
    query_plans_with_index[(query, None)] = query_plan
    query_costs_with_index[(query, None)] = db_connector.get_cost(query)
    query_plans_with_index_dump.append(query_plan)
    indexes = syntactically_relevant_indexes(query, len(query.columns))
    for index in indexes:
        what_if.simulate_index(index)
        indexed_query_plan = db_connector.get_plan_with_statistics(query)
        indexed_query_cost = db_connector.get_cost(query)
        what_if.drop_simulated_index(index)
        query_plans_with_index_dump.append(indexed_query_plan)
        query_plans_with_index[(query,index)] = indexed_query_plan
        query_costs_with_index[(query,index)] = indexed_query_cost

UndefinedFunction: function hypopg_create_index(unknown) does not exist
LINE 1: select * from hypopg_create_index( 'create index on lineitem...
                      ^
HINT:  No function matches the given name and argument types. You might need to add explicit type casts.


In [None]:
json_dump("indexed_query_plans.json", query_plans_with_index_dump)

In [14]:
# get cardinatlity statistics
for table in table_generator.tables:
    row_count = db_connector.table_row_count(table.name)
    table.set_row_count(row_count)
    for column in table.columns:
        card = db_connector.get_column_cardinality(column)
        column.set_cardinality(-card * row_count if card < 0 else card)

In [15]:
def has_filtering_property(query_plan):
    if "Filter" in query_plan.keys():
        return query_plan["Filter"]
    if "Hash Cond" in query_plan.keys():
        return query_plan["Hash Cond"]
    if "Join Filter" in query_plan.keys():
        return query_plan["Join Filter"]
    return ""

def has_child_node(query_plan):
    return "Plans" in query_plan.keys()

def is_join_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Join"

def is_sort_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Sort"

def is_aggregate_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Aggregate"

def is_scan_operator(operator):
    return PHYISCAL_TO_LOGICAL_OPERATOR_MAP[operator] == "Scan"

def check_indexed_column_in_condition(index, condition):
    for column in index.columns:
        if column.name in condition:
            return True

def get_table_from_plan_node(query_plan):
    table = ""
    if "Relation Name" in query_plan.keys():
        table = query_plan["Relation Name"]
    return table

In [16]:
# signal 1
def estimate_index_utility_recursive(index, original_query_plan, indexed_query_plan):
    total_cost = 0
    if has_child_node(original_query_plan):
        for original_child_node, indexed_child_node in zip(indexed_query_plan["Plans"], indexed_query_plan["Plans"]):
            total_cost += estimate_index_utility_recursive(index, original_child_node, indexed_child_node)
    current_operator = indexed_query_plan["Node Type"]
    current_cost = original_query_plan["Total Cost"]
    if (condition := has_filtering_property(indexed_query_plan)) != "":
        if is_join_operator(current_operator):
            join_output_rows = indexed_query_plan["Plan Rows"]
            left_input_rows = indexed_query_plan["Plans"][0]["Plan Rows"]
            right_input_rows = indexed_query_plan["Plans"][1]["Plan Rows"]
            if check_indexed_column_in_condition(index, condition):    
                current_cost = (1-np.sqrt(join_output_rows/(left_input_rows*right_input_rows)))*original_query_plan["Total Cost"]
        else:
            selectivities = [indexed_query_plan["Plan Rows"]/column.table.row_count for column in index.columns if column.name in condition]
            average_selectivity = sum(selectivities)/len(selectivities) if len(selectivities) > 0 else 0
            current_cost = (1-average_selectivity)*original_query_plan["Total Cost"]
    elif is_sort_operator(current_operator):
        sort_conditions = indexed_query_plan["Sort Key"]
        for sort_condition in sort_conditions:
            if check_indexed_column_in_condition(index, sort_condition):
                current_cost = indexed_query_plan["Total Cost"]
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        group_conditions = query_plan["Group Key"]
        for group_condition in group_conditions:
            if check_indexed_column_in_condition(index, group_condition):
                current_cost = indexed_query_plan["Total Cost"]
    return total_cost+current_cost

In [4]:
# signal 2
def extract_shape_of_query_and_index(index, original_query_plan):
    query_shape, index_shape = {}, []
    _extract_query_shape(query_shape, original_query_plan)
    _extract_index_shape(index_shape, index, original_query_plan)
    return query_shape, index_shape


def _extract_query_shape(query_shape, query_plan):
    current_operator = query_plan["Node Type"]
    logical_operator = PHYISCAL_TO_LOGICAL_OPERATOR_MAP[current_operator]
    if is_scan_operator(current_operator):
        table = get_table_from_plan_node(query_plan)
        if table in query_shape.keys():
            query_shape[table].append(logical_operator)
        else:
            query_shape[table] = [logical_operator]
        return table
    
    tables = []    
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            table = _extract_query_shape(query_shape, child_node)
            if table and logical_operator:
                tables.append(table)
                query_shape[table].append(logical_operator)
    return tables[0] if 0<len(tables)<2 else ""
    
    
def convert_query_shape_to_df(query_shape):
    data = []
    table_names = []
    for table_name, shape in query_shape.values():
        data.append(shape)
        table_names.append(table_name)
    return pd.DataFrame(data, index = table_names)

    
def _extract_index_shape(index_shape, index, query_plan):
    current_operator = query_plan["Node Type"]
    logical_operator = PHYISCAL_TO_LOGICAL_OPERATOR_MAP[current_operator]
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            _extract_index_shape(index_shape, index, child_node)
            
    if (condition := has_filtering_property(query_plan)) != "":
        if check_indexed_column_in_condition(index, condition):
            index_shape.append(logical_operator)
    elif is_sort_operator(current_operator):
        sort_conditions = query_plan["Sort Key"]
        for sort_condition in sort_conditions:
            if check_indexed_column_in_condition(index, sort_condition):
                index_shape.append(logical_operator)
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        aggregate_conditions = query_plan["Group Key"]
        for aggregate_condition in aggregate_conditions:
            if check_indexed_column_in_condition(index, aggregate_condition):
                index_shape.append(logical_operator)

SyntaxError: invalid syntax (3077099684.py, line 34)

In [18]:
# signal 3
def evaluate_operator_relevance(operator_relevance, index, query_plan):
    current_operator = query_plan["Node Type"]
    relevance = 0
    if (condition := has_filtering_property(query_plan)) != "":
        selectivities = [query_plan["Plan Rows"]/column.table.row_count for column in index.columns if column.name in condition]
        relevance = sum(selectivities)/len(selectivities)
    elif is_sort_operator(current_operator) and "Sort Key" in query_plan:
        densities = []
        conditions = query_plan["Sort Key"]
        for condition in conditions:
            for column in index.columns:
                if column.name in condition:
                    densities.append(column.cardinality/column.table.row_count)
        relevance = sum(densities)/len(densities) if len(densities) > 0 else 0
    elif is_aggregate_operator(current_operator) and "Group Key" in query_plan.keys():
        densities = []
        conditions = query_plan["Group Key"]
        for condition in conditions:
            for column in index.columns:
                if column.name in condition:
                    densities.append(column.cardinality/column.table.row_count)
        relevance = sum(densities)/len(densities) if len(densities) > 0 else 0
    if current_operator not in operator_relevance: 
        operator_relevance[current_operator] = []
    operator_relevance[current_operator].append(relevance)
    
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            evaluate_operator_relevance[child_node]

In [14]:
# signal 4
def get_number_of_pages(query_plan):
    return query_plan["Shared Hit Blocks"] + query_plan["Shared Read Blocks"] + query_plan["Local Hit Blocks"] + query_plan["Local Read Blocks"]


# seems not supported by postgres:
# https://stackoverflow.com/questions/20410444/postgres-ignoring-clustered-index-on-date-query
def count_clustered_index(table_name):
    count = db_connector.count_clustered_indexes(table_name)
    return count

def check_using_bitmap(query_plan):
    current_operator = query_plan["Node Type"]
    use = False
    if is_scan_operator(current_operator):
        use = "Bitmap" in current_operator
    
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            use |= check_using_bitmap(child_node)
    return use

In [21]:
physical_operators = set()

def collect_physical_operators(physical_operators, query_plan): 
    physical_operators.add(query_plan["Node Type"])
    if has_child_node(query_plan):
        for child_node in query_plan["Plans"]:
            collect_physical_operators(physical_operators, child_node)
            
for _, query_plan in query_plans_with_index.items():
    collect_physical_operators(physical_operators, query_plan) 

physical_operators = list(physical_operators)  
print(physical_operators)

['Limit', 'Gather', 'Nested Loop', 'BitmapOr', 'Bitmap Index Scan', 'Aggregate', 'Merge Join', 'Hash Join', 'Sort', 'Bitmap Heap Scan', 'Index Only Scan', 'Index Scan', 'Seq Scan', 'Hash', 'Gather Merge']


In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# create an example dataframe to work with
df = pd.DataFrame([
    ["Scan", "Join"],
    ["Scan", "Aggregate", "Join"],
    ["Aggregate", "Sort"],
    ["Sort", "Join"],
    ["Aggregate", "Scan"]
], columns=["operator1", "operator2", "operator3"])

# create a OneHotEncoder that ignores (0 encodes) unseen categories
# and encode the categorical features for the example dataframe
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(df)
print(X_encoded)
print(encoder.categories_)

[[0. 1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 1. 0. 1.]
 [0. 0. 1. 0. 1. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 1. 0. 0. 1.]]
[array(['Aggregate', 'Scan', 'Sort'], dtype=object), array(['Aggregate', 'Join', 'Scan', 'Sort'], dtype=object), array(['Join', None], dtype=object)]


In [23]:
labels = []
feature_columns = ["utility", "relevance", "num_pages", "use_itmap"]
features = pd.DataFrame(columns=feature_columns, index=range(len(query_index_pairs)))

i = 0
for (query, index), indexed_query_plan in query_plans_with_index.items():
    if index == None: continue
    labels.append(query_costs_with_index[(query, index)])
    original_query_plan = query_plans_with_index[(query, None)]
    original_query_cost = query_costs_with_index[(query, None)]
    utility = estimate_index_utility_recursive(index, original_query_plan, indexed_query_plan)/original_query_cost
    query_shape, index_shape = extract_shape_of_query_and_index(index, original_query_plan)
    query_shape
    operator_relevance = {}
    evaluate_operator_relevance(operator_relevance, index, original_query_plan)
    num_pages = get_number_of_pages(indexed_query_plan)
    use_bitmap = check_using_bitmap(indexed_query_plan)
    features.iloc[i]["utility"] = utility
    features.iloc[i]["num_pages"] = num_pages
    features.iloc[i]["use_bitmap"] = use_bitmap
    i+=1

# print(features)
# print(labels)

1124556
1124556
1124556
1124556
1124556
1124556


#### Modeling

##### Logistic Regression

In [None]:
lr = LogisticRegression(random_state=0).fit(features, labels)

##### LGBM

In [None]:
num_round = 10
bst = lgb.train(features, num_round)

##### Random Forest

In [None]:
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(features, labels)

##### MLP

In [None]:
mlp = MLPRegressor(random_state=1, max_iter=500).fit(features, labels)

### Index Cost Model

In [None]:
filtered_query_index_pairs = []
configurations = {}
for (query, index) in filtered_query_index_pairs:
    if query not in configurations:
        configurations[query] = set()
    configurations[query].add(index)

In [None]:
def calculate_parameter_selectivity(query_plan):
    param_selectivities = []
    calculate_parameter_selectivity(param_selectivities, query_plan)
    return param_selectivities
    
    
def _calculate_parameter_selectivity(param_selectivities, query_plan):
    if has_child_node(original_query_plan):
        for child_node in query_plan["Plans"]:
            _calculate_parameter_selectivity(child_node)
    if (condition := has_filtering_property(query_plan)) != "":
        for column in index.columns:
            if column.name in condition:
                param_selectivities.append(column, query_plan["Plan Rows"]/column.table.row_count)
    

columns_features = {}
def evaluate_query(query_plan):
    operator = query_plan["Node Type"]
    

In [None]:
# training 

# parameters
error_threshold = 0.05
training_sample_size_alpha = None
training_sample_size_beta = None
stopping_threshold = None