In [2]:
import pandas as pd
import sqlglot
import mysql.connector
from functools import partial
import json
import numpy as np
import random
from tqdm import tqdm
import copy

pd.set_option("display.max_rows", 100)

database = mysql.connector.connect(
    user='root', 
    password='password',
    host='127.0.0.1', 
    port=3307,
    database="TPCH",
)

cursor = database.cursor()

def partial_transformer(node, table_index_info):
    if isinstance(node, sqlglot.exp.Table):
        table_name = node.this.output_name
        if table_name in table_index_info:
            indexes = [ f"index_{table_name}_{column}" for column in table_index_info[table_name]["indexes"]]
            use_index_flag = table_index_info[table_name]["use_index_flag"] if len(indexes) > 0 else True
        else:
            use_index_flag = True
            indexes = list()

        table_hint = sqlglot.exp.IndexTableHint()
        table_hint.set("this", "USE" if use_index_flag and len(indexes) > 0 else "USE" if use_index_flag else "IGNORE")
        indexes_identifier = sqlglot.exp.Identifier()
        indexes_identifier.set("this", ", ".join(indexes))
        table_hint.set("expressions", table_hint.expressions + [indexes_identifier])
        node.set("hints", node.expressions + [table_hint])
        return node
    return node

def get_query_cost(query, table_index_info, default=False) -> float:
    table_index_info = copy.deepcopy(table_index_info)

    for table in table_index_info:
        if table_index_info[table]["use_index_flag"]:
            table_index_info[table]["indexes"] = [col for col in table_index_info[table]["indexes"] if col in query]
    
    if not default:
        expression_tree = sqlglot.parse_one(query)
        transformer = partial(partial_transformer, table_index_info=table_index_info)
        transformed_tree = expression_tree.transform(transformer)
        index_specified_query = transformed_tree.sql()
        # print(index_specified_query)
        cursor.execute(f"EXPLAIN FORMAT='JSON' {index_specified_query}")
    else:
        cursor.execute(f"EXPLAIN FORMAT='JSON' {query}")
    

    query_cost = json.loads(cursor.fetchall()[0][0])["query_block"]["cost_info"]["query_cost"]

    return float(query_cost)

def get_query_cost_actual(query, table_index_info, default=False) -> float:
    table_index_info = copy.deepcopy(table_index_info)

    for table in table_index_info:
        if table_index_info[table]["use_index_flag"]:
            table_index_info[table]["indexes"] = [col for col in table_index_info[table]["indexes"] if col in query]
    
    if not default:
        expression_tree = sqlglot.parse_one(query)
        transformer = partial(partial_transformer, table_index_info=table_index_info)
        transformed_tree = expression_tree.transform(transformer)
        index_specified_query = transformed_tree.sql()
        cursor.execute(f"EXPLAIN ANALYZE {index_specified_query}")
    else:
        cursor.execute(f"EXPLAIN ANALYZE {query}")
    
    return float(cursor.fetchall()[0][0].split("actual time=")[1].split("..")[1].split(" ")[0])

def get_index_cost(table_index_info) -> float:
    index_name_list = []
    for table_name in table_index_info:
        for column in table_index_info[table_name]["indexes"]:
            index_name_list.append(f"index_{table_name}_{column}")

    index_name_list_string = "('"+ "','".join(index_name_list) + "')"
        
    cursor.execute(f"SELECT ROUND(SUM(stat_value * @@innodb_page_size / 1024 / 1024), 2) size_in_mb FROM mysql.innodb_index_stats WHERE stat_name = 'size' AND index_name != 'PRIMARY' AND database_name = 'TPCH' AND index_name IN {index_name_list_string}")
    return float(cursor.fetchone()[0])

def std(array):
    return np.std(array, ddof=len(array) - 1)

def average_std(data):
    return np.sqrt(np.square(data).sum())

def get_table_index_info_extremes(database):
    tables_list = pd.read_sql("SHOW TABLES", database)["Tables_in_TPCH"].tolist()
    index_table_mapping = dict()
    index_list = list()
    for table in tables_list:
        query_result = pd.read_sql(f"SHOW indexes FROM {table} WHERE key_name LIKE 'index_%'", database)
        index_table_mapping[table] = query_result["Column_name"].tolist()
        index_list += query_result["Key_name"].tolist()

    table_names = [x.split("_")[1] for x in index_list]

    table_index_info_noopt = dict()
    for i in range(len(index_list)):
        index = index_list[i]
        table = table_names[i]
        index_col = index.replace(f"index_{table}_", "")
        if table in table_index_info_noopt:
            table_index_info_noopt[table]["indexes"].append(index_col)
        else:
            table_index_info_noopt[table] = {
                "use_index_flag": False,
                "indexes": [index_col],
            }
    
    table_index_info_allopt = copy.deepcopy(table_index_info_noopt)

    for table in table_index_info_allopt:
        table_index_info_allopt[table]["use_index_flag"] = True

    return table_index_info_noopt, table_index_info_allopt

def get_table_index_info_inverse(table_index_info, table_index_info_noopt):
    table_index_info_inverse = dict()

    for table in table_index_info_noopt:
        table_index_info_inverse[table] = copy.deepcopy(table_index_info_noopt[table])
        if table in table_index_info:
            indexes = table_index_info[table]["indexes"]
            table_index_info_inverse[table]["indexes"] = list(set(table_index_info_inverse[table]["indexes"]) - set(indexes))
    
    return table_index_info_inverse
    

In [3]:
sql_reader = open("queries/test_queries.sql")
queries = sql_reader.read().split(";")
sql_reader.close()

In [4]:
table_index_info_humopt = {
    "lineitem": {
        "use_index_flag": True,
        "indexes": ["l_shipdate", "l_partkey", "l_quantity"],
    },
    "orders": {
        "use_index_flag": True,
        "indexes": ["o_orderpriority", "o_custkey", "o_orderdate"],
    },
    "part": {
        "use_index_flag": True,
        "indexes": ["p_type"]
    }   
}

table_index_info_greedyopt =  {
    'orders': {
        'use_index_flag': True, 
        'indexes': ['o_custkey']
    }, 
    'lineitem': {
        'use_index_flag': True, 
        'indexes': ['l_suppkey']
    }, 
    'part': {
        'use_index_flag': True, 
        'indexes': ['p_container']
    }, 
    'customer': {
        'use_index_flag': True, 
        'indexes': ['c_nationkey', 'c_mktsegment']
    }
}

table_index_info_mlopt = {'orders': {'use_index_flag': True, 'indexes': ['o_custkey', 'o_orderdate']},
 'customer': {'use_index_flag': True, 'indexes': ['c_nationkey']},
 'part': {'use_index_flag': True, 'indexes': ['p_brand']},
 'lineitem': {'use_index_flag': True, 'indexes': ['l_suppkey']}}

table_index_info_mlopt_25 = {'lineitem': {'use_index_flag': True, 'indexes': ['l_shipdate']},
 'orders': {'use_index_flag': True, 'indexes': ['o_orderdate']},
 'part': {'use_index_flag': True, 'indexes': ['p_container', 'p_brand']},
 'partsupp': {'use_index_flag': True, 'indexes': ['ps_suppkey']}}

table_index_info_mlopt_50 =  {'lineitem': {'use_index_flag': True, 'indexes': ['l_shipdate']},
 'part': {'use_index_flag': True, 'indexes': ['p_size', 'p_container']},
 'partsupp': {'use_index_flag': True, 'indexes': ['ps_suppkey']},
 'orders': {'use_index_flag': True, 'indexes': ['o_custkey', 'o_orderdate']},
 'supplier': {'use_index_flag': True, 'indexes': ['s_nationkey']},
 'customer': {'use_index_flag': True,
  'indexes': ['c_nationkey', 'c_mktsegment']}}

table_index_info_mlopt_75 =   {'lineitem': {'use_index_flag': True, 'indexes': ['l_shipdate']}, 'part': {'use_index_flag': True, 'indexes': ['p_size', 'p_type', 'p_mfgr']}, 'supplier': {'use_index_flag': True, 'indexes': ['s_nationkey']}}

table_index_info_mlopt_100 =   {'lineitem': {'use_index_flag': True, 'indexes': ['l_shipdate']}, 'part': {'use_index_flag': True, 'indexes': ['p_size', 'p_container', 'p_mfgr']}, 'partsupp': {'use_index_flag': True, 'indexes': ['ps_suppkey']}}

In [5]:
table_index_info_noopt, table_index_info_allopt = get_table_index_info_extremes(database)
table_index_info_humopt_inverse = get_table_index_info_inverse(table_index_info_humopt, table_index_info_noopt)
table_index_info_greedyopt_inverse = get_table_index_info_inverse(table_index_info_greedyopt, table_index_info_noopt)
table_index_info_mlopt_inverse = get_table_index_info_inverse(table_index_info_mlopt, table_index_info_noopt)
table_index_info_mlopt_25_inverse = get_table_index_info_inverse(table_index_info_mlopt_25, table_index_info_noopt)
table_index_info_mlopt_50_inverse = get_table_index_info_inverse(table_index_info_mlopt_50, table_index_info_noopt)
table_index_info_mlopt_75_inverse = get_table_index_info_inverse(table_index_info_mlopt_75, table_index_info_noopt)
table_index_info_mlopt_100_inverse = get_table_index_info_inverse(table_index_info_mlopt_100, table_index_info_noopt)

  tables_list = pd.read_sql("SHOW TABLES", database)["Tables_in_TPCH"].tolist()
  query_result = pd.read_sql(f"SHOW indexes FROM {table} WHERE key_name LIKE 'index_%'", database)
  query_result = pd.read_sql(f"SHOW indexes FROM {table} WHERE key_name LIKE 'index_%'", database)
  query_result = pd.read_sql(f"SHOW indexes FROM {table} WHERE key_name LIKE 'index_%'", database)


In [7]:
print(f"The cost for noopt was {0}")
print(f"The cost for humopt was {get_index_cost(table_index_info_humopt)}")
print(f"The cost for greedyopt was {get_index_cost(table_index_info_greedyopt)}")
print(f"The cost for mlopt was {get_index_cost(table_index_info_mlopt)}")
print(f"The cost for allopt was {get_index_cost(table_index_info_allopt)}")
print(f"The cost for mlopt_25 was {get_index_cost(table_index_info_mlopt_25)}")
print(f"The cost for mlopt_50 was {get_index_cost(table_index_info_mlopt_50)}")
print(f"The cost for mlopt_75 was {get_index_cost(table_index_info_mlopt_75)}")
print(f"The cost for mlopt_100 was {get_index_cost(table_index_info_mlopt_100)}")


The cost for noopt was 0
The cost for humopt was 675.8
The cost for greedyopt was 245.0
The cost for mlopt was 268.03
The cost for allopt was 2758.23
The cost for mlopt_25 was 220.89
The cost for mlopt_50 was 263.69
The cost for mlopt_75 was 185.56
The cost for mlopt_100 was 201.89


In [None]:
test_query_13 = True
query_list = [13] if test_query_13 else range(len(queries))
models = ["noopt", "humopt", "mlopt", "mlopt_25", "mlopt_50", "mlopt_75", "mlopt_100"] if test_query_13 else ["noopt", "greedyopt", "allopt", "humopt"]

In [54]:
data = list()
for trial_id in range(10):
    tests = list()
    for query_id in query_list:
        for table_index_info_type in models:
            tests.append({"query_id": query_id, "index_type": table_index_info_type})
    random.shuffle(tests)
    for test in tqdm(tests, desc=f"Query Test Iteration for {trial_id}"):
        query_id = test["query_id"]
        table_index_info_type = test["index_type"]
        print(query_id, table_index_info_type)
        if table_index_info_type == "noopt":
            table_index_info = table_index_info_noopt
        elif table_index_info_type == "humopt":
            table_index_info = table_index_info_humopt_inverse
        elif table_index_info_type == "allopt":
            table_index_info = dict()
        elif table_index_info_type == "mlopt":
            table_index_info = table_index_info_mlopt_inverse
        elif table_index_info_type == "greedyopt":
            table_index_info = table_index_info_greedyopt_inverse
        elif table_index_info_type == "mlopt_25":
            table_index_info = table_index_info_mlopt_25_inverse
        elif table_index_info_type == "mlopt_50":
            table_index_info = table_index_info_mlopt_50_inverse
        elif table_index_info_type == "mlopt_75":
            table_index_info = table_index_info_mlopt_75_inverse
        elif table_index_info_type == "mlopt_100":
            table_index_info = table_index_info_mlopt_100_inverse

        
        query_time = get_query_cost_actual(queries[query_id], table_index_info, table_index_info_type == "allopt")
        # print(f"The query time for {table_index_info_type} and query {query_id} is {query_time}")
        data.append({"trial_id": trial_id, "query_id": query_id, "query_time": query_time, "index_type":  table_index_info_type})    

Query Test Iteration for 0:   0%|          | 0/7 [00:00<?, ?it/s]

13 noopt


Query Test Iteration for 0:  14%|█▍        | 1/7 [00:04<00:25,  4.19s/it]

13 mlopt_25


Query Test Iteration for 0:  29%|██▊       | 2/7 [00:05<00:11,  2.35s/it]

13 mlopt_75


Query Test Iteration for 0:  43%|████▎     | 3/7 [00:06<00:07,  1.77s/it]

13 humopt


Query Test Iteration for 0:  57%|█████▋    | 4/7 [00:07<00:04,  1.51s/it]

13 mlopt_50


Query Test Iteration for 0:  71%|███████▏  | 5/7 [00:08<00:02,  1.36s/it]

13 mlopt


Query Test Iteration for 0:  86%|████████▌ | 6/7 [00:12<00:02,  2.17s/it]

13 mlopt_100


Query Test Iteration for 0: 100%|██████████| 7/7 [00:13<00:00,  1.91s/it]
Query Test Iteration for 1:   0%|          | 0/7 [00:00<?, ?it/s]

13 mlopt_50


Query Test Iteration for 1:  14%|█▍        | 1/7 [00:01<00:07,  1.20s/it]

13 mlopt_100


Query Test Iteration for 1:  29%|██▊       | 2/7 [00:02<00:05,  1.17s/it]

13 humopt


Query Test Iteration for 1:  43%|████▎     | 3/7 [00:03<00:04,  1.18s/it]

13 noopt


Query Test Iteration for 1:  57%|█████▋    | 4/7 [00:07<00:06,  2.29s/it]

13 mlopt_75


Query Test Iteration for 1:  71%|███████▏  | 5/7 [00:08<00:03,  1.87s/it]

13 mlopt_25


Query Test Iteration for 1:  86%|████████▌ | 6/7 [00:09<00:01,  1.62s/it]

13 mlopt


Query Test Iteration for 1: 100%|██████████| 7/7 [00:13<00:00,  1.94s/it]
Query Test Iteration for 2:   0%|          | 0/7 [00:00<?, ?it/s]

13 noopt


Query Test Iteration for 2:  14%|█▍        | 1/7 [00:03<00:22,  3.77s/it]

13 mlopt_25


Query Test Iteration for 2:  29%|██▊       | 2/7 [00:04<00:10,  2.19s/it]

13 mlopt_75


Query Test Iteration for 2:  43%|████▎     | 3/7 [00:05<00:06,  1.70s/it]

13 mlopt


Query Test Iteration for 2:  57%|█████▋    | 4/7 [00:09<00:07,  2.51s/it]

13 mlopt_50


Query Test Iteration for 2:  71%|███████▏  | 5/7 [00:10<00:04,  2.00s/it]

13 humopt


Query Test Iteration for 2:  86%|████████▌ | 6/7 [00:11<00:01,  1.69s/it]

13 mlopt_100


Query Test Iteration for 2: 100%|██████████| 7/7 [00:13<00:00,  1.87s/it]
Query Test Iteration for 3:   0%|          | 0/7 [00:00<?, ?it/s]

13 noopt


Query Test Iteration for 3:  14%|█▍        | 1/7 [00:03<00:22,  3.82s/it]

13 mlopt_25


Query Test Iteration for 3:  29%|██▊       | 2/7 [00:04<00:11,  2.24s/it]

13 mlopt_100


Query Test Iteration for 3:  43%|████▎     | 3/7 [00:06<00:06,  1.75s/it]

13 mlopt


Query Test Iteration for 3:  57%|█████▋    | 4/7 [00:09<00:07,  2.55s/it]

13 mlopt_50


Query Test Iteration for 3:  71%|███████▏  | 5/7 [00:11<00:04,  2.04s/it]

13 humopt


Query Test Iteration for 3:  86%|████████▌ | 6/7 [00:12<00:01,  1.74s/it]

13 mlopt_75


Query Test Iteration for 3: 100%|██████████| 7/7 [00:13<00:00,  1.90s/it]
Query Test Iteration for 4:   0%|          | 0/7 [00:00<?, ?it/s]

13 mlopt


Query Test Iteration for 4:  14%|█▍        | 1/7 [00:03<00:22,  3.80s/it]

13 mlopt_75


Query Test Iteration for 4:  29%|██▊       | 2/7 [00:04<00:11,  2.23s/it]

13 mlopt_25


Query Test Iteration for 4:  43%|████▎     | 3/7 [00:06<00:06,  1.73s/it]

13 mlopt_100


Query Test Iteration for 4:  57%|█████▋    | 4/7 [00:07<00:04,  1.53s/it]

13 humopt


Query Test Iteration for 4:  71%|███████▏  | 5/7 [00:08<00:02,  1.40s/it]

13 mlopt_50


Query Test Iteration for 4:  86%|████████▌ | 6/7 [00:09<00:01,  1.31s/it]

13 noopt


Query Test Iteration for 4: 100%|██████████| 7/7 [00:13<00:00,  1.93s/it]
Query Test Iteration for 5:   0%|          | 0/7 [00:00<?, ?it/s]

13 mlopt_25


Query Test Iteration for 5:  14%|█▍        | 1/7 [00:01<00:06,  1.15s/it]

13 noopt


Query Test Iteration for 5:  29%|██▊       | 2/7 [00:05<00:13,  2.74s/it]

13 mlopt_75


Query Test Iteration for 5:  43%|████▎     | 3/7 [00:06<00:08,  2.02s/it]

13 mlopt_100


Query Test Iteration for 5:  57%|█████▋    | 4/7 [00:07<00:05,  1.69s/it]

13 humopt


Query Test Iteration for 5:  71%|███████▏  | 5/7 [00:08<00:03,  1.53s/it]

13 mlopt


Query Test Iteration for 5:  86%|████████▌ | 6/7 [00:12<00:02,  2.45s/it]

13 mlopt_50


Query Test Iteration for 5: 100%|██████████| 7/7 [00:14<00:00,  2.01s/it]
Query Test Iteration for 6:   0%|          | 0/7 [00:00<?, ?it/s]

13 humopt


Query Test Iteration for 6:  14%|█▍        | 1/7 [00:01<00:07,  1.33s/it]

13 mlopt_50


Query Test Iteration for 6:  29%|██▊       | 2/7 [00:02<00:06,  1.24s/it]

13 mlopt_75


Query Test Iteration for 6:  43%|████▎     | 3/7 [00:03<00:05,  1.27s/it]

13 mlopt_100


Query Test Iteration for 6:  57%|█████▋    | 4/7 [00:05<00:03,  1.26s/it]

13 mlopt


Query Test Iteration for 6:  71%|███████▏  | 5/7 [00:08<00:04,  2.19s/it]

13 mlopt_25


Query Test Iteration for 6:  86%|████████▌ | 6/7 [00:10<00:01,  1.84s/it]

13 noopt


Query Test Iteration for 6: 100%|██████████| 7/7 [00:13<00:00,  2.00s/it]
Query Test Iteration for 7:   0%|          | 0/7 [00:00<?, ?it/s]

13 humopt


Query Test Iteration for 7:  14%|█▍        | 1/7 [00:01<00:07,  1.21s/it]

13 noopt


Query Test Iteration for 7:  29%|██▊       | 2/7 [00:04<00:13,  2.71s/it]

13 mlopt_50


Query Test Iteration for 7:  43%|████▎     | 3/7 [00:06<00:08,  2.03s/it]

13 mlopt_100


Query Test Iteration for 7:  57%|█████▋    | 4/7 [00:07<00:05,  1.68s/it]

13 mlopt_75


Query Test Iteration for 7:  71%|███████▏  | 5/7 [00:08<00:02,  1.49s/it]

13 mlopt


Query Test Iteration for 7:  86%|████████▌ | 6/7 [00:12<00:02,  2.31s/it]

13 mlopt_25


Query Test Iteration for 7: 100%|██████████| 7/7 [00:13<00:00,  1.95s/it]
Query Test Iteration for 8:   0%|          | 0/7 [00:00<?, ?it/s]

13 humopt


Query Test Iteration for 8:  14%|█▍        | 1/7 [00:01<00:06,  1.17s/it]

13 mlopt_75


Query Test Iteration for 8:  29%|██▊       | 2/7 [00:02<00:05,  1.19s/it]

13 mlopt_25


Query Test Iteration for 8:  43%|████▎     | 3/7 [00:03<00:04,  1.22s/it]

13 mlopt_100


Query Test Iteration for 8:  57%|█████▋    | 4/7 [00:04<00:03,  1.25s/it]

13 noopt


Query Test Iteration for 8:  71%|███████▏  | 5/7 [00:08<00:04,  2.19s/it]

13 mlopt


Query Test Iteration for 8:  86%|████████▌ | 6/7 [00:12<00:02,  2.73s/it]

13 mlopt_50


Query Test Iteration for 8: 100%|██████████| 7/7 [00:13<00:00,  1.98s/it]
Query Test Iteration for 9:   0%|          | 0/7 [00:00<?, ?it/s]

13 mlopt_75


Query Test Iteration for 9:  14%|█▍        | 1/7 [00:01<00:07,  1.23s/it]

13 mlopt


Query Test Iteration for 9:  29%|██▊       | 2/7 [00:05<00:13,  2.78s/it]

13 mlopt_50


Query Test Iteration for 9:  43%|████▎     | 3/7 [00:06<00:08,  2.06s/it]

13 mlopt_25


Query Test Iteration for 9:  57%|█████▋    | 4/7 [00:07<00:05,  1.70s/it]

13 noopt


Query Test Iteration for 9:  71%|███████▏  | 5/7 [00:11<00:05,  2.55s/it]

13 mlopt_100


Query Test Iteration for 9:  86%|████████▌ | 6/7 [00:12<00:02,  2.09s/it]

13 humopt


Query Test Iteration for 9: 100%|██████████| 7/7 [00:13<00:00,  1.98s/it]


In [55]:
dataframe = pd.DataFrame(data)

query_performance = (
    dataframe
    .groupby(["query_id", "index_type"], as_index=False)
    .agg(
        min_time = pd.NamedAgg(column="query_time", aggfunc="min"),
        median_time = pd.NamedAgg(column= "query_time", aggfunc="median"),  
        mean_time = pd.NamedAgg(column= "query_time", aggfunc="mean"),    
        sd_time = pd.NamedAgg(column= "query_time", aggfunc=std),    
        max_time = pd.NamedAgg(column="query_time", aggfunc="max"),       
    )
)

In [56]:
index_performance = (
    query_performance
    .groupby("index_type", as_index=False)
    .agg(
        mean_time = pd.NamedAgg(column="mean_time", aggfunc="sum"),
        sd_time = pd.NamedAgg(column="sd_time", aggfunc=average_std)
    )
)

In [57]:
pd.set_option("display.max_rows", 120)

In [58]:
display(index_performance)
display(query_performance)

Unnamed: 0,index_type,mean_time,sd_time
0,humopt,1175.0,202.859557
1,mlopt,3841.7,441.84624
2,mlopt_100,1178.3,151.241859
3,mlopt_25,1143.4,192.982901
4,mlopt_50,1168.0,176.470961
5,mlopt_75,1150.8,185.799892
6,noopt,3903.0,395.026581


Unnamed: 0,query_id,index_type,min_time,median_time,mean_time,sd_time,max_time
0,13,humopt,1085.0,1161.5,1175.0,202.859557,1324.0
1,13,mlopt,3751.0,3784.0,3841.7,441.84624,4240.0
2,13,mlopt_100,1074.0,1189.0,1178.3,151.241859,1237.0
3,13,mlopt_25,1049.0,1137.0,1143.4,192.982901,1273.0
4,13,mlopt_50,1082.0,1177.0,1168.0,176.470961,1272.0
5,13,mlopt_75,1074.0,1133.0,1150.8,185.799892,1279.0
6,13,noopt,3748.0,3874.5,3903.0,395.026581,4177.0


In [33]:
pivoted_data = query_performance.pivot(index="query_id", columns="index_type", values=["mean_time", "sd_time"])

In [61]:
# dataframe.to_csv("skew_results/base_test_data.csv", index=False)
# query_performance.to_csv("skew_results/query_performance.csv", index=False)
# index_performance.to_csv("skew_results/index_performance.csv", index=False)
# pivoted_data.to_csv("results/pivoted_performance.csv", index=False)

In [59]:
query_performance.pivot(index="query_id", columns="index_type", values=["mean_time", "sd_time"])

Unnamed: 0_level_0,mean_time,mean_time,mean_time,mean_time,mean_time,mean_time,mean_time,sd_time,sd_time,sd_time,sd_time,sd_time,sd_time,sd_time
index_type,humopt,mlopt,mlopt_100,mlopt_25,mlopt_50,mlopt_75,noopt,humopt,mlopt,mlopt_100,mlopt_25,mlopt_50,mlopt_75,noopt
query_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
13,1175.0,3841.7,1178.3,1143.4,1168.0,1150.8,3903.0,202.859557,441.84624,151.241859,192.982901,176.470961,185.799892,395.026581


# Define Queries DBA Can Look At

In [18]:
queries_length = [len("".join(query.split())) for query in queries]
candidate_queries = sorted(range(len(queries_length)), key=lambda sub: queries_length[sub])[:5]


In [19]:
candidate_queries

[5, 15, 12, 13, 3]

In [20]:
for query in candidate_queries:
    print(queries)





select
	sum(l_extendedprice * l_discount) as revenue
from
	lineitem
where
	l_shipdate >= date '1997-01-01'
	and l_shipdate < date '1997-01-01' + interval '1' year
	and l_discount between 0.06 - 0.01 and 0.06 + 0.01
	and l_quantity < 25




select
	sum(l_extendedprice) / 7.0 as avg_yearly
from
	lineitem,
	part
where
	p_partkey = l_partkey
	and p_brand = 'Brand#25'
	and p_container = 'WRAP BAG'
	and l_quantity < (
		select
			0.2 * avg(l_quantity)
		from
			lineitem
		where
			l_partkey = p_partkey
	)




select
	c_count,
	count(*) as custdist
from
	(
		select
			c_custkey,
			count(o_orderkey)
		from
			customer left outer join orders on
				c_custkey = o_custkey
				and o_comment not like '%unusual%deposits%'
		group by
			c_custkey
	) as c_orders (c_custkey, c_count)
group by
	c_count
order by
	custdist desc,
	c_count desc




select
	100.00 * sum(case
		when p_type like 'PROMO%'
			then l_extendedprice * (1 - l_discount)
		else 0
	end) / sum(l_extendedprice * (1 - l_discount)) as