In [1]:
import argparse
import json
import logging
import sys

import copy
import numpy as np
import pandas as pd
import pdb

import glob
import os
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt

import loki
from loki.util import constraints, postprocessing
from loki.solver import sat
from loki.util.postprocessing import *

%load_ext autoreload
%autoreload 2

In [2]:
## TODO: add a progress bar showing the upload to Postgres
## TODO: print out some model variables while SAT solving
## TODO: remove error messages

# Setting up parameters

In [8]:

# Work from the Loki checkout so that the relative defaults below
# (e.g. 'application.conf', 'logs/loki.log') resolve.
# NOTE(review): machine-specific absolute path.
os.chdir("/flash1/pari/loki")

parser = argparse.ArgumentParser(description='Run Loki.')
parser.add_argument('--config', '-c', default='application.conf')
parser.add_argument('--workload', '-w', default='imdb')
parser.add_argument('--db', default='imdb')
parser.add_argument('--table', '-t', default='n')
# type=int so that a value supplied on the command line is an int, like the
# default, instead of a string.
parser.add_argument('--vars_per_col', '-n', type=int, default=1000)
parser.add_argument('--logfile', '-lf', default='logs/loki.log')
parser.add_argument('--verbose', '-v', default=False, action='store_true')
# Parse an explicit empty argument list so that every option takes its default
# regardless of sys.argv; when running as a script use:
#args = parser.parse_args()
args = parser.parse_args(args=[])

# Input Files (IR of Parsed Query Workload)

In [9]:
# Load the Loki configuration, then the constraints table configured for the
# selected workload.
loki.load_config(args.config)
c_df = pd.read_csv(loki.config[args.workload]['constraints_df'])

# Shuffle the rows. A fixed seed keeps the row order identical across kernel
# restarts, so later cells that consume c_df are reproducible.
SHUFFLE_SEED = 0
c_df = c_df.sample(frac=1.0, random_state=SHUFFLE_SEED)
c_df.head(5)

Unnamed: 0,exprhash,RowCount,InputCardinality,RowSql,Column,input,jobid,Op,Value,Selectivity
67105,513634991045095779917163387442763459289913020008,3395,14835720,SELECT COUNT(*) from movie_info AS mi1 WHERE m...,mi1.info,mi,2588,=,'MET:150 m',0.000229
2376,895918758229320809296116480909318578258404415055,1,12,SELECT COUNT(*) from role_type AS rt WHERE rt....,rt.role,rt,93,=,'producer',0.083333
57109,331652677167321217491811120964587578955014717773,34,2528312,SELECT COUNT(*) from title AS t WHERE t.title ...,t.title,t,2235,=,'Dansk melodi grand prix',1.3e-05
38436,304016124209722010575770180899170087677087776839,1,134170,SELECT COUNT(*) from keyword AS k WHERE k.keyw...,k.keyword,k,1490,=,'father-daughter-relationship',7e-06
69736,1270056137664121432157104510888238657306402496000,2456181,2528312,SELECT COUNT(*) from title AS t WHERE t.produc...,t.production_year,t,2699,<=,2015,0.971471


# Converting the cardinalities into SAT constraints, and solving them for a candidate database that satisfies those constraints

In [13]:
import warnings
import time

# Suppress all Python warnings from here on (process-wide filter).
warnings.filterwarnings('ignore')

start = time.time()

# logging.FileHandler does not create missing parent directories, so make sure
# the log directory exists before logging is configured.
log_dir = os.path.dirname(args.logfile)
if log_dir:
    os.makedirs(log_dir, exist_ok=True)

logging.basicConfig(
    level=logging.DEBUG if args.verbose else logging.INFO,
    format="%(asctime)s %(levelname)-8s [%(name)s]  %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    handlers=[logging.FileHandler(args.logfile), logging.StreamHandler()])

logger = logging.getLogger('stdout')

#logger.info('Loki initialized with configuration:\n%s' % json.dumps(loki.config, sort_keys=True, indent=4))

print('Loki initialized with configuration: \n{}'.format(json.dumps(loki.config, sort_keys=True, indent=4)))

# Map every table name (the 'input' column) to the set of its columns that
# appear in the workload frame.
df = pd.read_csv(loki.config[args.workload]['df'])
tables = set(df['input'])
table_columns = {table: set(df['column'].where(df['input'] == table).dropna()) for table in tables}

table = args.table
orig_table = table
# table = '"{}"'.format(table)

columns = table_columns[table]

constraints_df = constraints.get_constraints_df(c_df, table)
print("Operators used: ", set(constraints_df["Op"]))

table_cardinality = constraints.get_table_cardinality(constraints_df)
co_optimized_columns = constraints.get_co_optimized_columns(constraints_df, columns)
programs = constraints.get_programs(co_optimized_columns)

leftover_constraints = []

vars_per_col = args.vars_per_col

solutions = []

# Drop every constraint whose Value0 / Value1 / Value field is the literal
# string "None".
constraints_df = constraints_df[constraints_df["Value0"] != "None"]
constraints_df = constraints_df[constraints_df["Value1"] != "None"]
constraints_df = constraints_df[constraints_df["Value"] != "None"]

# Build and solve one model per program; each solution is collected and merged
# below.
for program in programs:
    #logger.info(f'Solving: {program}')
    print(f'Solving: {program}')
    constraints_ = constraints.parse_constraints(program, constraints_df)
    # Named `model_vars` (not `vars`) so the builtin vars() is not shadowed.
    model, model_vars, cols, col_values_ids_map = sat.build_model(
        program, constraints_, leftover_constraints, table_cardinality, vars_per_col)
    solution = sat.solve(model, model_vars, cols, col_values_ids_map, vars_per_col)
    solutions.append(solution)
    #logger.info(f'Solved: {program}')
    print(f'Solved: {program}')

# Combine all programs' solutions into a single dictionary solution
full_solution = {k: v for s in solutions for k, v in s.items()}

postprocessing.apply_leftover_constraints(full_solution, leftover_constraints)
solution_df = postprocessing.solution_to_df(full_solution)
final_solution_df = postprocessing.scale_solution_df(solution_df, table_cardinality, vars_per_col)

# to_csv does not create the output directory, so create it on first use.
os.makedirs('results', exist_ok=True)
final_solution_df.to_csv(f'results/{orig_table}.csv')

print("Generating solutions took: ", time.time()-start)

Loki initialized with configuration: 
{
    "imdb": {
        "constraints_df": "sample-data/literal_df.csv",
        "df": "sample-data/op_df.csv"
    }
}
Operators used:  {'=', '=|=', 'like'}
Solving: {'name'}
Problem scale: 155000
Stopping after 70 constraints!
Status = OPTIMAL
Solved: {'name'}
Solving: {'name_pcode_nf', 'surname_pcode', 'name_pcode_cf', 'gender'}
Problem scale: 1194000
Stopping after 1095 constraints!
Status = OPTIMAL
Solved: {'name_pcode_nf', 'surname_pcode', 'name_pcode_cf', 'gender'}
Generating solutions took:  100.27274751663208


# Exploring the solution

In [6]:
## A lot of the cells in the final solution are unspecified since we may not have enough data,
## so keep only the rows that have a value in every column.
tmp = final_solution_df.dropna()
## Preview at most 10 rows taken from a random 10% sample of those complete rows.
tmp.sample(frac=0.1).head(10)

Unnamed: 0,name,surname_pcode,name_pcode_cf,name_pcode_nf,gender
127179,%co%,%j1%,C6325,K3656,m
86102,%sta%,D16,S3652,V4321,f
178161,%co%,%p64%,L1525,M2563,f
131196,%co%,%m3%,A5262,E3631,m
174580,%co%,S5,B6156,R2416,f
122732,%co%,B452,%g4%,J2124,m
172609,%co%,S5,B6156,R2416,f
28843,%rich%,W256,C6264,A5316,m
142250,%co%,M54,R2461,B4325,f
110569,%co%,S415,B4353,S52,m


# Uploading the created table to Postgres

In [7]:
# DKIND is passed to upload_to_postgres here and is later formatted into the
# eval_data.py command line as --data_kind.
# NOTE(review): presumably it becomes the suffix of the uploaded table name
# (the evaluation queries below run against n_s1) — confirm in upload_to_postgres.
DKIND="s1"

## TO try: shuffle=False; null_strs=True
upload_to_postgres(final_solution_df, args.db, args.table, DKIND, shuffle=True, null_strs=False)

uploading to postgres took:  178.49


# Setting up execution result directories

In [23]:
import sys
import os
import shutil

# eval_data.py and the result directory are addressed relative to this
# directory by the cells below.
# NOTE(review): machine-specific absolute path.
os.chdir("/spinning/pari/WorkloadCharacterization")

# Start from an empty result directory. ignore_errors=True keeps the removal
# best-effort (e.g. when the directory does not exist yet) and exist_ok=True
# tolerates a directory that is still present, without hiding every other
# exception behind a bare `except`.
shutil.rmtree('./new_results', ignore_errors=True)
os.makedirs("./new_results", exist_ok=True)

# Passed to eval_data.py as --num_queries.
NUMQ = 100

# Executing Workload Queries on True Data

In [24]:
import shlex
import subprocess

# Evaluate the workload queries on the true data (--data_kind true_cols).
CMD = (
    "python3 eval_data.py --data_kind true_cols --num_queries {} "
    "--inp_to_eval n --port 5432 --workload ceb --db_name imdb --result_dir new_results"
).format(NUMQ)

# Run without a shell; check=True raises CalledProcessError when the script
# exits with a non-zero status, so a failed evaluation is not silently
# followed by an analysis of missing or stale logs.
out = subprocess.run(shlex.split(CMD), check=True).returncode

data/ceb-all/sqls/dfs/expr_df.csv
Number of sqls to evaluate: 783
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('C6231','E4213','F6362','F6525','J513','M6251','M6263','P3625','R1632','R1636','R2631','R2632','S2153')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6361','D1326','D1352','G6262','J5241','J5245','J5263','L2142','M6216','S3151')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B2','B6','B62','B653','C2','C5','J52','J525','L15','L52','M62','P62','R3','W425')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4236','A5356','A5361','E3631','J25','M2412','M2415','M6242','M6352','P3656','R1632','S2525','S3541','V2362')
SELECT COUNT(

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('B1626','C6426','F6521','F6524','J52','J5265','J5425','R1631','R1635','R1636','R2632','R2635')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name_pcode_cf like '%h6%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A4253','C6235','E4213','J5162','K6235')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_cf in ('A5362','B6261','D1232','H4236','O4252','P5235','P6252','R363','S3152','W5165')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('C6235')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4152','A6362','B6514','D1324','J5241','K5252','L2142','M6352','P3656','W4125')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6235','M4145','P3625','R1635','R2631','S3152')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gen

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4216','C6231','C6241','D252','D5262','D5435','D5463','G6535','H6323','J2142','J2452','M3265','T5212')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name_pcode_cf like '%r2%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_cf in ('A5362','B4525','D1232','J5252','O4252','R1632','R363')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4163','A5352','C6231','F6521','F6525','J252','J5162','J5216','J5235','J5252','M2416','R2631','S3152')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B2','B4','C2','C5','D12','G6','H63','K5','L5','W42','W452')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.surname_pcode in ('B2','B452','C2','C5','D12','H2','R5','S35','T46','W3','W42')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B624','C5','G6','J52

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.surname_pcode in ('A45','B42','B6','B63','D12','D5','H62','R','R5','S23')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%bran%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.name_pcode_nf in ('A1431','A5242','J242','J5245','J5352','K5315','M6153','M6323','N5253','R5414','R5636','S5152','Z3625')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%th%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_cf in ('H4236','P5235','P6252')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A5365','B1626','B5252','B6526','D1324','E6252','F4241','G6261','G6265','J5235','J5241','M2415','M2424','S3152','S3156')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A5242','E4213','K6235')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%val%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.name_pcode_cf like '%s4

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4163','A4253','A5362','E6523','F6521','F6525','M6263','R1632')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4253','A5352','A5362','F6521','F6525','M6263','R1632')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%kim%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A5625','B6162','C6215','C6231','D5256','J2124','J2353','J5216','K3652','P425','S5326')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4163','A4253','A5352','A5362','E6523','F6521','F6525','M6263')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6362','B6514','G6256','J25','J5243','J5263','M2453','M4245','M6241','S3151','S3153','T5252')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%lov%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in 

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6231','F6525','M6251','P3625','R1631','R1632','R2631','R2632','S2153')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A4253','A5354','C3654','C6231','G6252','J2152','J23','M2426','M6215','M6312','M6542','R2414','S1652','W5245')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%danie%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%will%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.surname_pcode in ('B65','J52','J525','R2','S53')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%fra%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A5242','C6235','E4213','F6521','I6532','M6153','M6323','R215','R5636','S3521','Z3625')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4163','A4253','A5242','A5352','D1316','F6521','F6525','J5252','M2425','P3625','R1631','S

SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_cf in ('A5362','A6143','L2352','M6352')
SELECT COUNT(*) from n_true_cols AS n WHERE n.name like '%po%'
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_cf in ('A5362','B6261','D1232','H4236','J5252','M6352','R1632','R363','W5165')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_cf in ('P6252')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.surname_pcode in ('B6','B626','C45','G62','H4','H62','J52','J525','K4','R24','R5','W5')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A5145','A5236','A5325','C6454','D525','J5161','L6325','M6324','M6453','P3624','S2516','S5365','T6252','V53','V6521')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('B6161','C6235','K3451','M6263')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode

# Executing Workload Queries on our Generated Data

In [25]:
import shlex
import subprocess

# Evaluate the same workload queries on the generated data (--data_kind DKIND).
CMD2 = (
    "python3 eval_data.py --data_kind {} --num_queries {} "
    "--inp_to_eval n --port 5432 --workload ceb --db_name imdb --result_dir new_results"
).format(DKIND, NUMQ)

# Run without a shell; check=True raises CalledProcessError when the script
# exits with a non-zero status, so a failed evaluation is not silently
# followed by an analysis of missing or stale logs.
out = subprocess.run(shlex.split(CMD2), check=True).returncode

data/ceb-all/sqls/dfs/expr_df.csv
Number of sqls to evaluate: 783
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('C6231','E4213','F6362','F6525','J513','M6251','M6263','P3625','R1632','R1636','R2631','R2632','S2153')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6361','D1326','D1352','G6262','J5241','J5245','J5263','L2142','M6216','S3151')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B2','B6','B62','B653','C2','C5','J52','J525','L15','L52','M62','P62','R3','W425')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4236','A5356','A5361','E3631','J25','M2412','M2415','M6242','M6352','P3656','R1632','S2525','S3541','V2362')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.n

SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6235','M4145','P3625','R1635','R2631','S3152')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6543','B6352','D1315','D2564','D5456','E3634','L2165','L6524','M2463','R5262','T5232','T532')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%ke%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('B3625','B3626','B6164','D5251','J2612','J4524','J5262','M2424','P436','R53','S252','S5253')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A4253','A5242','A5362','C6452','K5164','S3152')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_cf in ('D1232','H4236','O4252','P5235','R363','S3152','W5165')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.name_pcode_cf in ('G5242','L2654','U1562')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('D1316','F6521'

SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_cf in ('A5362','B6245','B6261','B6526','D1232','J5252','M6352','O4252','P6252','R2632','R363','S3152','S5362','W5165')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6421','C6423','C6424','C6426','F6524','R1631','R2632','V4356','W4362')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%we%'
SELECT COUNT(*) from n_s1 AS n WHERE n.surname_pcode like '%a4%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4163','A4253','A5352','A5362','C6235','F6521','R1631','R1632','R2632')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A2352','A6543','B5252','C6253','D1325','J256','J5','J5232','J563','M5425','P3615','R54','T5214','T525')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%brian%'
SELECT COUNT(*) from n_s1 AS n WHERE n.surname_pcode like '%l15%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name_pcode_cf in ('B4353','B6156','D

SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%cree%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A5351','C6235','C6421','C6425','F6362','F6521','J5216','J525','P3623','R1632','R1636','V4356','V5253')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('C3656','C6416','C6453','C6454','G1642','J5245','M6532','M6536','R2425','S2525','S2526','T6253')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B4','C2','C5','G6','J525','L','P62','S23','S5','T521')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6231','C6425','E3241','F6524','J5252','J5265','M2416','P3625','R1635','R2635','S3152')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.surname_pcode in ('B4','B62','B626','B63','C2','C52','F652','J52','L2','M46','M62','R2','R24')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4163','B1614','B1626','C6231','C6424','

SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4163','A4253','A5362','C6421','C6423','C6424','E6523','F6524','F6525')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_cf in ('A2365','A6252','C52','D1614','E1524','E2163','L1214','L2','P5215','Q5325','R2425','S1452','T5212','V4524','V4626')
SELECT COUNT(*) from n_s1 AS n WHERE n.surname_pcode like '%e25%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%wi%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name_pcode_cf in ('A2365','A6252','D1614','E1524','L1214','L2','M3425','M6352','P5215','Q5325','R2425','R3626','S5325','V4626')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('C6231','F6362','J513','M6251','M6263','P3625','R1631','R1632','R1636','R2631','R2632','S2153')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%kar%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.surname_pcode in ('A45','B65','C4','C65','F652','G6','H52','J52

SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A2351','A5264','C6235','D1325','E4532','G5362','G6351','H5215','J2352','J2612','J56','M6325','M6542','O4212','S6253')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%ha%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%hard%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%sa%'
SELECT COUNT(*) from n_s1 AS n WHERE n.surname_pcode in ('B624','D25','J525','L','L2','L52','L532','P62','R2','R52')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A353','A3562','D5463','E6214','G5362','G624','H6252','J6251','L5631','P3653','R252','S6216','T5256')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4163','A4253','A5242','A5352','D1316','E4213','F6521','J5252','M2425','P3625','S3151')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%ger%'
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%bri%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gende

SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4253','A5352','A5362','C6421','C6423','C6424','C6425','E6523','F6521','F6524','F6525','R1631','R1632')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A4262','D5252','G6212','J5212','M2436','M6242','M6352','P425','S3126','T5214')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('C6423','C6424','C6426','F6524','F6532','R1631','R2632','V4356','W4362')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%par%'
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A5242','A5253','A5324','C6431','D5252','F6525','F6526','J5253','J5262','K3453','M5456','S525')
SELECT COUNT(*) from n_s1 AS n WHERE n.gender in ('f') AND n.name_pcode_nf in ('A4253','A5216','A525','B6232','I2142','K3651','K3654','K6525','M2416','M2425','M6353','T6252')
SELECT COUNT(*) from n_s1 AS n WHERE n.name like '%tan%'
SELECT COUNT(*) from n_s1 AS n WHERE n.ge

# Exploring the results

In [26]:
def extract_val(d, key):
    """Return the value of a ``key=value`` field found in the text line ``d``.

    The field starts at the first occurrence of ``key`` in ``d`` and ends at
    the next ``,``; if there is no comma, at the next ``)``; if there is
    neither, it runs to the end of the line. Everything after the first ``=``
    of the field is the value. Single quotes are removed and surrounding
    whitespace is stripped.

    Args:
        d: The line to search.
        key: The field name to look for.

    Returns:
        The value as a string.

    Raises:
        ValueError: If ``key`` does not occur in ``d`` or the field has no ``=``.
    """
    val_start = d.find(key)
    if val_start < 0:
        raise ValueError("key {!r} not found in line: {!r}".format(key, d))

    rest = d[val_start:]
    end = rest.find(",")
    if end < 0:
        end = rest.find(")")
    # With no delimiter at all the value extends to the end of the line;
    # slicing with end == -1 would silently drop the last character.
    field = rest if end < 0 else rest[:end]

    # Split on the first "=" only, so a value that itself contains "=" is
    # returned whole.
    _, sep, val = field.partition("=")
    if not sep:
        raise ValueError("no '=' after key {!r} in line: {!r}".format(key, d))
    return val.replace("'", "").strip()

def load_logs(logfile):
    """Parse one evaluation log file into a DataFrame, one row per result line.

    Two kinds of lines are recognised:

    * Header lines contain ``data_kind``. Their ``key=value`` fields
      (``data_kind``, ``inp_to_eval``, ``num_queries``, ``workload``, ``port``,
      ``skip_likes``) are read with ``extract_val`` and attached to every
      result line that follows. A header line without ``port`` stops the
      parsing of this file; rows collected up to that point are still returned.
    * Result lines contain ``-->``. They are split on single spaces and read
      by position: token 5 is the expression hash; counted from the end,
      token -9 is ``rep``, -7 is ``QErr:<v>``, -5 is ``TrueC:<v>``, -3 is
      ``EstC:<v>`` and -1 is ``rt``.
      NOTE(review): this layout is taken from the indices used here — confirm
      against the code that writes the logs.

    Args:
        logfile: Path of the log file to read.

    Returns:
        A pandas DataFrame with the columns ehash, rt, rep, data_kind,
        inp_to_eval, num_queries, workload, port, skip_likes, qerr, truec and
        estc (no columns at all when the file has no result lines).

    Raises:
        ValueError: If a result line appears before any header line.
    """
    retdata = defaultdict(list)
    header = None  # fields of the most recent header line

    # Iterate over the file lazily instead of loading every line up front.
    with open(logfile, "r") as f:
        for d in f:
            if "data_kind" in d:
                data_kind = extract_val(d, "data_kind")
                inp_to_eval = extract_val(d, "inp_to_eval")
                nq = int(extract_val(d, "num_queries"))
                wk = extract_val(d, "workload")
                if "port" not in d:
                    # Header without a port field: stop reading this file.
                    break

                header = {
                    "data_kind": str(data_kind),
                    "inp_to_eval": inp_to_eval,
                    "num_queries": nq,
                    "workload": wk,
                    "port": int(extract_val(d, "port")),
                    "skip_likes": int(extract_val(d, "skip_likes")),
                }

            if "-->" in d:
                if header is None:
                    raise ValueError(
                        "result line found before any header line in {}: {!r}".format(logfile, d))

                # Parse the whole line before appending anything, so a
                # malformed line cannot leave the columns with unequal lengths.
                linedata = d.split(" ")
                ehash = linedata[5]
                rt = float(linedata[-1].replace("\n", ""))
                estc = float(linedata[-3].replace("EstC:", ""))
                truec = float(linedata[-5].replace("TrueC:", ""))
                rep = int(linedata[-9])
                qerr = float(linedata[-7].replace("QErr:", ""))

                retdata["ehash"].append(ehash)
                retdata["rt"].append(rt)
                retdata["rep"].append(rep)
                retdata["data_kind"].append(header["data_kind"])
                retdata["inp_to_eval"].append(header["inp_to_eval"])
                retdata["num_queries"].append(header["num_queries"])
                retdata["workload"].append(header["workload"])
                retdata["port"].append(header["port"])
                retdata["skip_likes"].append(header["skip_likes"])
                retdata["qerr"].append(qerr)
                retdata["truec"].append(truec)
                retdata["estc"].append(estc)

    df = pd.DataFrame(retdata)
    return df

In [27]:
# Parse every *.log file in the result directory and stack the per-file
# frames into one.
LOGDIR = "./new_results/"
fns = glob.glob(LOGDIR + "*.log")
dfs = [load_logs(fn) for fn in fns]
df = pd.concat(dfs)

In [28]:
# Raw `data_kind` value found in the logs -> label used by the analysis below.
mapping = {
    "true_cols": "TrueData",
    "gen_shuffle": "GeneratedData",
    "s1": "GeneratedData",
    "s2": "GeneratedData",
}

# Values that already are labels are kept unchanged, so re-running this cell
# on an already relabelled frame no longer raises KeyError. Any other unknown
# value still raises KeyError.
known_labels = set(mapping.values())
df["data_kind"] = df["data_kind"].map(
    lambda kind: kind if kind in known_labels else mapping[kind]
)

In [29]:
# Mean `rt` per (ehash, data_kind); `tmp` is the long form and `edf` the wide
# form with one column per data_kind.
mean_rt = df.groupby(["ehash", "data_kind"])["rt"].mean()
tmp = mean_rt.reset_index()
edf = mean_rt.unstack().reset_index()

In [30]:
import pickle

# esqls is loaded from esqls.pkl when that file exists and stays empty otherwise.
# NOTE(review): pickle.load executes arbitrary code from the file — only use
# a trusted esqls.pkl.
esqls = {}
if os.path.exists("esqls.pkl"):
    with open('esqls.pkl', 'rb') as handle:
        esqls = pickle.load(handle)

# Per-ehash lookups built from the generated-data rows; when an ehash occurs
# more than once the last row wins.
qdf = df[df["data_kind"] == "GeneratedData"]
eqerrs = dict(zip(qdf["ehash"], qdf["qerr"]))
etruecs = dict(zip(qdf["ehash"], qdf["truec"]))
estcs = dict(zip(qdf["ehash"], qdf["estc"]))

# Sum of `rt` per data kind, rounded to two decimals.
rt_totals = df.groupby(["data_kind"])["rt"].sum()
true_rt = round(rt_totals["TrueData"], 2)
gen_rt = round(rt_totals["GeneratedData"], 2)

In [31]:
# fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(6,5))
# true_rt = round(df.groupby(["data_kind"])["rt"].sum()["TrueData"], 2)
# gen_rt = round(df.groupby(["data_kind"])["rt"].sum()["GeneratedData"], 2)

# title = "Total Latency, True: {}; Generated: {}".format(true_rt, gen_rt)

# sns.scatterplot(data=edf, x = "GeneratedData", y="TrueData", alpha=1.0, ax=ax)

# ax.set_ylabel("True Data", fontsize=16)
# ax.set_xlabel("Generated Data", fontsize=16)

# plt.title(title, fontsize=16)

# #FN= "{}-{}-scatterplot.pdf".format(INP , NUMQ)
# #plt.savefig(FN, bbox_inches="tight")
# plt.show()

In [37]:
import sqlparse

# Attach the formatted SQL text and the generated-data statistics to every
# row of edf, looked up by ehash. A missing ehash raises KeyError.
ehashes = edf["ehash"]
edf["sql"] = ehashes.map(
    lambda h: sqlparse.format(esqls[h], reindent=True, keyword_case="upper")
)
edf["qerr"] = ehashes.map(lambda h: eqerrs[h])
edf["estc"] = ehashes.map(lambda h: estcs[h])
edf["truec"] = ehashes.map(lambda h: etruecs[h])

# Show one formatted query as a sanity check.
sqlparse.format(edf.sql.values[5], reindent=True, keyword_case='upper')

"SELECT COUNT(*)\nFROM name AS n\nWHERE n.gender in ('m')\n  AND n.name_pcode_nf in ('A4253',\n                          'A5352',\n                          'A5362',\n                          'C6421',\n                          'C6423',\n                          'C6424',\n                          'C6425',\n                          'E6523',\n                          'F6521',\n                          'F6524',\n                          'F6525',\n                          'R1631',\n                          'R1632')"

In [39]:
from bokeh.io import output_file, show, reset_output, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
import pandas as pd

# Render inside the notebook (switch to output_file to write an HTML file).
reset_output()
output_notebook()
#output_file("name-final-results.html")

source = ColumnDataSource(data=edf)

# Fields shown when hovering over a point: the two `rt` columns, the two
# cardinalities and the SQL text of the query.
tooltips = [
    ("Gen Data", "@GeneratedData"),
    ("True Data", "@TrueData"),
    ("Gen Cardinality", "@estc"),
    ("True Cardinality", "@truec"),
    #("Cardinality Q-Error", "@qerr"),
    ("SQL", "@sql"),
]
hover = HoverTool(tooltips=tooltips)

# Scatter plot: one point per query, generated-data `rt` on x against
# true-data `rt` on y.
plot_title = "Total Latency, True: {}; Generated: {}".format(true_rt, gen_rt)
p = figure(
    title=plot_title,
    tools="pan,wheel_zoom,save,reset",
    active_scroll='wheel_zoom',
    width=600,
    height=500,
)
p.scatter('GeneratedData', 'TrueData', source=source, size=8)
p.add_tools(hover)

p.xaxis.axis_label = "Generated Data"
p.yaxis.axis_label = "True Data"

show(p)

In [44]:
# Second data kind; formatted into the eval_data.py command line as --data_kind further down.
DKIND2="s2"

In [40]:
DKIND2 = "s2"

## TO try: shuffle=False; null_strs=True
# Upload the unshuffled variant under DKIND2. The original call passed DKIND
# ("s1") here, which left DKIND2 unused although the later evaluation runs
# with --data_kind DKIND2.
# NOTE(review): presumably that re-upload also replaced the "s1" data —
# confirm in upload_to_postgres.
upload_to_postgres(final_solution_df, args.db, args.table, DKIND2, shuffle=False, null_strs=False)

uploading to postgres took:  117.22


In [41]:
import sys
import os
import shutil

# eval_data.py and the result directory are addressed relative to this
# directory by the cells below.
# NOTE(review): machine-specific absolute path.
os.chdir("/spinning/pari/WorkloadCharacterization")

# Start from an empty result directory. ignore_errors=True keeps the removal
# best-effort (e.g. when the directory does not exist yet) and exist_ok=True
# tolerates a directory that is still present, without hiding every other
# exception behind a bare `except`.
shutil.rmtree('./new_results', ignore_errors=True)
os.makedirs("./new_results", exist_ok=True)

# Passed to eval_data.py as --num_queries.
NUMQ = 100

In [42]:
import shlex
import subprocess

# Evaluate the workload queries on the true data (--data_kind true_cols).
CMD = (
    "python3 eval_data.py --data_kind true_cols --num_queries {} "
    "--inp_to_eval n --port 5432 --workload ceb --db_name imdb --result_dir new_results"
).format(NUMQ)

# Run without a shell; check=True raises CalledProcessError when the script
# exits with a non-zero status, so a failed evaluation is not silently
# followed by an analysis of missing or stale logs.
out = subprocess.run(shlex.split(CMD), check=True).returncode

data/ceb-all/sqls/dfs/expr_df.csv
Number of sqls to evaluate: 100
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('C6231','E4213','F6362','F6525','J513','M6251','M6263','P3625','R1632','R1636','R2631','R2632','S2153')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6361','D1326','D1352','G6262','J5241','J5245','J5263','L2142','M6216','S3151')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B2','B6','B62','B653','C2','C5','J52','J525','L15','L52','M62','P62','R3','W425')
SELECT COUNT(*) from n_true_cols AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4236','A5356','A5361','E3631','J25','M2412','M2415','M6242','M6352','P3656','R1632','S2525','S3541','V2362')
SELECT COUNT(

NameError: name 'DKIND2' is not defined

In [45]:
import shlex
import subprocess

# Evaluate the same workload queries on the second generated data set
# (--data_kind DKIND2).
CMD2 = (
    "python3 eval_data.py --data_kind {} --num_queries {} "
    "--inp_to_eval n --port 5432 --workload ceb --db_name imdb --result_dir new_results"
).format(DKIND2, NUMQ)

# Run without a shell; check=True raises CalledProcessError when the script
# exits with a non-zero status, so a failed evaluation is not silently
# followed by an analysis of missing or stale logs.
out = subprocess.run(shlex.split(CMD2), check=True).returncode

data/ceb-all/sqls/dfs/expr_df.csv
Number of sqls to evaluate: 100
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('m')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('C6231','E4213','F6362','F6525','J513','M6251','M6263','P3625','R1632','R1636','R2631','R2632','S2153')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('f')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('m') AND n.name_pcode_nf in ('A6361','D1326','D1352','G6262','J5241','J5245','J5263','L2142','M6216','S3151')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('f','m')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('f') AND n.surname_pcode in ('B2','B6','B62','B653','C2','C5','J52','J525','L15','L52','M62','P62','R3','W425')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('f','m') AND n.name_pcode_nf in ('A4236','A5356','A5361','E3631','J25','M2412','M2415','M6242','M6352','P3656','R1632','S2525','S3541','V2362')
SELECT COUNT(*) from n_s2 AS n WHERE n.gender in ('m') AND n.n

In [46]:
import pickle
import sqlparse

# Raw `data_kind` value found in the logs -> label used by the analysis.
mapping = {
    "true_cols": "TrueData",
    "gen_shuffle": "GeneratedData",
    "s1": "GeneratedData",
    "s2": "GeneratedData",
}

# Parse every *.log file in the result directory, stack the frames and
# relabel the data kinds (an unknown kind raises KeyError).
LOGDIR = "./new_results/"
fns = glob.glob(LOGDIR + "*.log")
dfs = [load_logs(fn) for fn in fns]
df = pd.concat(dfs)
df["data_kind"] = df["data_kind"].map(lambda kind: mapping[kind])

# Mean `rt` per (ehash, data_kind): `tmp` is the long form, `edf` the wide
# form with one column per data_kind.
mean_rt = df.groupby(["ehash", "data_kind"])["rt"].mean()
tmp = mean_rt.reset_index()
edf = mean_rt.unstack().reset_index()

# esqls is loaded from esqls.pkl when that file exists and stays empty otherwise.
esqls = {}
if os.path.exists("esqls.pkl"):
    with open('esqls.pkl', 'rb') as handle:
        esqls = pickle.load(handle)

# Per-ehash lookups built from the generated-data rows; when an ehash occurs
# more than once the last row wins.
qdf = df[df["data_kind"] == "GeneratedData"]
eqerrs = dict(zip(qdf["ehash"], qdf["qerr"]))
etruecs = dict(zip(qdf["ehash"], qdf["truec"]))
estcs = dict(zip(qdf["ehash"], qdf["estc"]))

# Sum of `rt` per data kind, rounded to two decimals.
rt_totals = df.groupby(["data_kind"])["rt"].sum()
true_rt = round(rt_totals["TrueData"], 2)
gen_rt = round(rt_totals["GeneratedData"], 2)

# Attach the formatted SQL text and the generated-data statistics to every
# row of edf, looked up by ehash. A missing ehash raises KeyError.
ehashes = edf["ehash"]
edf["sql"] = ehashes.map(
    lambda h: sqlparse.format(esqls[h], reindent=True, keyword_case="upper")
)
edf["qerr"] = ehashes.map(lambda h: eqerrs[h])
edf["estc"] = ehashes.map(lambda h: estcs[h])
edf["truec"] = ehashes.map(lambda h: etruecs[h])

# Show one formatted query as a sanity check.
sqlparse.format(edf.sql.values[5], reindent=True, keyword_case='upper')

"SELECT COUNT(*)\nFROM name AS n\nWHERE n.gender in ('f',\n                   'm')\n  AND n.surname_pcode in ('A436',\n                          'B2',\n                          'B452',\n                          'C16',\n                          'C462',\n                          'C65',\n                          'H2',\n                          'J52',\n                          'L2',\n                          'M5',\n                          'R2',\n                          'S23',\n                          'W425')"

In [47]:
from bokeh.io import output_file, show, reset_output, output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure
import pandas as pd

# Render inside the notebook (switch to output_file to write an HTML file).
reset_output()
output_notebook()
#output_file("name-final-results.html")

source = ColumnDataSource(data=edf)

# Fields shown when hovering over a point: the two `rt` columns, the two
# cardinalities and the SQL text of the query.
tooltips = [
    ("Gen Data", "@GeneratedData"),
    ("True Data", "@TrueData"),
    ("Gen Cardinality", "@estc"),
    ("True Cardinality", "@truec"),
    #("Cardinality Q-Error", "@qerr"),
    ("SQL", "@sql"),
]
hover = HoverTool(tooltips=tooltips)

# Scatter plot: one point per query, generated-data `rt` on x against
# true-data `rt` on y.
plot_title = "Total Latency, True: {}; Generated: {}".format(true_rt, gen_rt)
p = figure(
    title=plot_title,
    tools="pan,wheel_zoom,save,reset",
    active_scroll='wheel_zoom',
    width=600,
    height=500,
)
p.scatter('GeneratedData', 'TrueData', source=source, size=8)
p.add_tools(hover)

p.xaxis.axis_label = "Generated Data"
p.yaxis.axis_label = "True Data"

show(p)