In [1]:
import json
import csv
import logging
import itertools

from selection.index_selection_evaluation import DBMSYSTEMS
from selection.query_generator import QueryGenerator
from selection.table_generator import TableGenerator
from selection.what_if_index_creation import WhatIfIndexCreation
from selection.cost_evaluation import CostEvaluation
from selection.workload import Workload
from selection.index import Index, index_merge
from selection.candidate_generation import syntactically_relevant_indexes, candidates_per_query
from selection.utils import get_utilized_indexes, indexes_by_table

        
config_file = "config.json"
with open(config_file) as f:
    config = json.load(f)
dbms_class = DBMSYSTEMS[config["database_system"]]
generating_connector = dbms_class(None, autocommit=True)
table_generator = TableGenerator(config["benchmark_name"], config["scale_factor"], generating_connector)
database_name = table_generator.database_name()
database_system = config["database_system"]
db_connector = DBMSYSTEMS[database_system](database_name)
query_generator = QueryGenerator(
    config["benchmark_name"],
    config["scale_factor"],
    db_connector,
    config["queries"],
    table_generator.columns,
)
workload = Workload(query_generator.queries)
cost_evaluation = CostEvaluation(db_connector, cost_estimation="actual_runtimes")

Prepare to load table
Loading table done
Reading column names


In [3]:
# table_generator._generate()

Generating dsb data
Files generated: ['customer_address.dat', 'income_band.dat', 'date_dim.dat', 'warehouse.dat', 'call_center.dat', 'dbgen_version.dat', 'catalog_sales.dat', 'web_returns.dat', 'promotion.dat', 'web_site.dat', 'store_returns.dat', 'web_sales.dat', 'store_sales.dat', 'customer_demographics.dat', 'store.dat', 'customer.dat', 'catalog_page.dat', 'time_dim.dat', 'inventory.dat', 'household_demographics.dat', 'catalog_returns.dat', 'web_page.dat', 'ship_mode.dat', 'item.dat', 'reason.dat']


In [5]:
# table_files = ['catalog_page.dat']
# import os
# from io import StringIO  

# print("Loading data into the tables")
# for filename in table_files:
#     print("    Loading file {}".format(filename))
#     table = filename.replace(".tbl", "").replace(".dat", "")
#     path = table_generator.directory + "/" + filename
#     print(f"    Import data of path {path}")
#     with open(path, 'r') as file:
#         lines = file.readlines()
#         for line in lines:
#             to_write = line.rstrip('\n')[:-1]
#             f = StringIO(to_write+'\n')
#             db_connector._cursor.copy_from(f, table, sep="|", null="")
#     db_connector.commit()

In [2]:
from selection.index_selection_evaluation import IndexSelection
import time

# index_selection = IndexSelection()
# with open(config_file) as f:
#     config = json.load(f)
# index_selection._setup_config(config)
db_connector.drop_indexes()

# Set the random seed to obtain deterministic statistics (and cost estimations)
# because ANALYZE (and alike) use sampling for large tables
# index_selection.db_connector.create_statistics()
# index_selection.db_connector.commit()
candidates = candidates_per_query(workload,2,candidate_generator=syntactically_relevant_indexes)

Generate candidates
Potential indexes: 55
Potential indexes: 55
Potential indexes: 55
Potential indexes: 120
Potential indexes: 120
Potential indexes: 120
Potential indexes: 175
Potential indexes: 175
Potential indexes: 175
Potential indexes: 110
Potential indexes: 110
Potential indexes: 110
Potential indexes: 62
Potential indexes: 62
Potential indexes: 62
Potential indexes: 111
Potential indexes: 111
Potential indexes: 111
Potential indexes: 97
Potential indexes: 97
Potential indexes: 97
Potential indexes: 67
Potential indexes: 67
Potential indexes: 67
Potential indexes: 81
Potential indexes: 81
Potential indexes: 81
Potential indexes: 84
Potential indexes: 84
Potential indexes: 84
Potential indexes: 92
Potential indexes: 92
Potential indexes: 92
Potential indexes: 57
Potential indexes: 57
Potential indexes: 57
Potential indexes: 104
Potential indexes: 104
Potential indexes: 104
Potential indexes: 101
Potential indexes: 101
Potential indexes: 101
Potential indexes: 159
Potential index

In [5]:
len(candidates)

111

In [6]:
len(workload.queries)

111

In [3]:
import random
def sample_candidates(candidates_per_query, max_config_width):
    candidates = [[]]
    for width in range(1, max_config_width+1):
        possible_candidates = random.sample(candidates_per_query, width)
        if width == 1:
            candidates.append(possible_candidates)
            continue
        else:
            # check if a config contains same column index
            column_check = set()
            for candidate in possible_candidates:
                if column_check & set(candidate.columns): possible_candidates.remove(candidate)
                else: column_check |= set(candidate.columns)
            # keep sample until reaches the width
            while len(possible_candidates) < width:
                candidate = random.sample(candidates_per_query, 1)[0]
                if column_check & set(candidate.columns): continue
                else: 
                    column_check |= set(candidate.columns)
                    possible_candidates.append(candidate)
        candidates.append(possible_candidates)
    return candidates
        
sample_candidates(candidates[0], 4)

[[],
 [I(C customer.c_current_addr_sk,C customer.c_current_cdemo_sk)],
 [I(C income_band.ib_income_band_sk), I(C household_demographics.hd_demo_sk)],
 [I(C customer.c_last_name,C customer.c_first_name),
  I(C customer.c_customer_id,C customer.c_first_name),
  I(C customer_address.ca_city)],
 [I(C customer.c_current_cdemo_sk,C customer.c_current_hdemo_sk),
  I(C income_band.ib_upper_bound),
  I(C income_band.ib_upper_bound,C income_band.ib_lower_bound),
  I(C customer.c_customer_id,C customer.c_first_name)]]

In [4]:
number_of_actual_runs = 4

data = []
filename = "../data/DSB/dsb.csv"
iter = 48
for  query, candidates_per_query in zip(workload.queries[iter:], candidates[iter:]):
    print(f"iteration no: {iter}: ")
    entry = [[query.nr, query.text]]
    index_configs_per_query = sample_candidates(candidates_per_query, 4)
    formatted_index_configs_per_query = []
    average_execution_times_per_index_config, execution_time_list_per_config, query_plans_per_index_config = [], [], []
    for index_config in index_configs_per_query:
        if len(index_config) == 0: formatted_index_configs_per_query.append([])
        elif len(index_config) == 1: formatted_index_configs_per_query.append(index_config[0])
        else: formatted_index_configs_per_query.append(tuple(index_config))
        cost_evaluation._prepare_cost_calculation(index_config)
        execution_time_list = []
        for i in range(number_of_actual_runs):
            print(f"\tnow running the {i}th run")
            actual_execu_time, plan = db_connector.exec_query(query)
            execution_time_list.append(actual_execu_time)
        average_execution_time = sum(execution_time_list)/len(execution_time_list)
        average_execution_times_per_index_config.append(average_execution_time)
        execution_time_list_per_config.append(execution_time_list[1:])
        query_plans_per_index_config.append(plan)
        cost_evaluation.complete_cost_estimation()
    entry.append(formatted_index_configs_per_query)
    entry.append(average_execution_times_per_index_config)
    entry.append(query_plans_per_index_config)
    entry.append(execution_time_list_per_config)
    data.append(entry)
    with open(filename, "a+") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(entry)
    iter+=1

iteration no: 48: 
	now running the 0th run
	now running the 1th run
	now running the 2th run
	now running the 3th run
	now running the 0th run
	now running the 1th run
	now running the 2th run
