In [16]:
import os
import psycopg
import pandas as pd
from tqdm import tqdm
from tqdm import tqdm
import binascii
import psycopg
from collections import defaultdict

from tools.josie import JosieDB
from tools.utils.datalake import SimpleDataLakeHelper
from tools.utils.misc import is_valid_table, create_token_set

In [2]:
# Data lake coordinates handed to the helper below.
datalake_location = 'mongodb'
dataset = 'wikiturlsnap'
size = 'standard'

# Helper through which the following cells count and scan the tables.
dlh = SimpleDataLakeHelper(datalake_location, dataset, size)

In [5]:
# Identifiers combined below into the JOSIE tables prefix.
test_name = 'main'
dataset = 'wikiturlsnap'
mode = 'bag'

In [6]:
# Run configuration: result size, database name, table prefix and output paths.
k = 10
dbname = 'nanni'
tables_prefix = '{}_d{}_m{}'.format(test_name, dataset, mode)

results_directory = '/data4/nanni/tesi-magistrale/notebooks'
results_file = '/data4/nanni/tesi-magistrale/notebooks/results.csv'

In [20]:
# Database handle shared by the cells and functions below (they reach the
# underlying connection through `josiedb._dbconn`).
josiedb = JosieDB(dbname, tables_prefix)
josiedb.open()

1. Trovare le prime N tabelle valide che contengono almeno due token tra 'Country', 'Team' e 'Location'

In [14]:
import random  # NOTE(review): not referenced by the active code of this cell

# Collect the first N valid tables whose token set holds at least two of `names`.
N = 5
names = ['Country', 'Team', 'Location']
ntables = dlh.get_number_of_tables()

queries = []
for table_obj in tqdm(dlh.scan_tables(), total=ntables, leave=False):
    content = table_obj['content']
    numeric_columns = table_obj['numeric_columns']
    if is_valid_table(content, numeric_columns):
        tabset = set(create_token_set(content, 'set', numeric_columns))
        hits = len([token for token in names if token in tabset])
        if hits >= 2:
            queries.append(table_obj)
    # Stop scanning as soon as enough query tables have been collected.
    if len(queries) >= N:
        break

                                                           

In [15]:
# Display the content of the first selected query table as a DataFrame.
pd.DataFrame(queries[0]['content'])

Unnamed: 0,0,1,2,3,4,5
0,Alexandria Blue Anchors,"Alexandria, Minnesota",Knute Nelson Memorial Park,,,
1,Duluth Huskies,"Duluth, Minnesota",Wade Stadium,4200,,
2,Eau Claire Express,"Eau Claire, Wisconsin",Carson Park,3800,,
3,Mankato MoonDogs,"Mankato, Minnesota",Franklin Rogers Park,1400,,
4,Rochester Honkers,"Rochester, Minnesota",Mayo Field,2570,,
5,St. Cloud Rox,"St. Cloud, Minnesota",Joe Faber Field,2000,,
6,Thunder Bay Border Cats,"Thunder Bay , Ontario , Canada",Port Arthur Stadium,3031,,
7,Waterloo Bucks,"Waterloo, Iowa",Riverfront Stadium,5000,,
8,Willmar Stingers,"Willmar, Minnesota",Bill Taunton Stadium,,,
9,Division,Team,Location,Stadium,Capacity,


2. Dalle tabelle estratte, creare i bag usando solo le colonne non numeriche che contengono almeno uno tra 'Team', 'Location' e 'Country'

In [10]:
# For every selected table, build the token bag restricted to the columns
# that contain at least one of the wanted header tokens.
queries_bags = {}
wanted_tokens = ['Team', 'Location', 'Country']

for q in queries:
    rows = q['content']
    numeric_columns = q['numeric_columns']

    # Transpose into a list of columns, keeping those whose numeric flag is 0.
    table = []
    for i in range(len(rows[0])):
        if numeric_columns[i] == 0:
            table.append([row[i] for row in rows])
    size1 = len(table) * len(table[0])

    # Keep only the columns in which one of the wanted tokens appears as a cell.
    table = [column for column in table if any(token in column for token in wanted_tokens)]
    size2 = len(table) * len(table[0])

    bag = create_token_set(table, 'bag', [0] * len(table[0]))
    queries_bags[q['_id_numeric']] = bag

    # Cell counts before/after the column filter, then the bag size.
    print(size1, size2, len(bag))

120 40 38
68 34 34
68 34 34
36 12 12
35 14 14


3. Creare l'input per JOSIE, cioè gli integer set

In [21]:
# Map each query set id to the subset of its stored token ids whose raw token
# belongs to the bag built from the selected columns.
# NOTE: this rebinds `queries` (previously the list of table objects) to a
# mapping {set id -> set of token ids}, which is what the JOSIE cells consume.
queries = defaultdict(set)

# Table names cannot be bound as parameters, so they are interpolated; the
# values are passed separately and bound by the driver.
set_tokens_sql = f"SELECT tokens FROM {josiedb._SET_TABLE_NAME} WHERE id = %s"
raw_token_sql = f"SELECT raw_token FROM {josiedb._INVERTED_LISTS_TABLE_NAME} WHERE token = %s"

for qid, qbag in queries_bags.items():
    result = josiedb._dbconn.execute(set_tokens_sql, (qid,)).fetchall()[0][0]

    # `token_id` instead of `id`, so the builtin is not shadowed at notebook scope.
    for token_id in result:
        raw_token = josiedb._dbconn.execute(raw_token_sql, (token_id,)).fetchone()[0]
        # The stored raw token is hex-decoded to UTF-8 text before the membership test.
        if binascii.unhexlify(raw_token).decode('utf-8') in qbag:
            queries[qid].add(token_id)

    # Tokens stored for the set, tokens kept for the query, size of the bag.
    print(len(result), len(queries[qid]), len(qbag))

84 38 38
68 34 34
68 34 34
31 12 12
29 14 14


4. Eseguire i test con JOSIE

In [22]:
def create_query_table(queries: dict[int, set[int]], dbname, tables_prefix):
    """Fill the JOSIE queries table and run the cost sampling step if needed.

    The inserts go through the notebook-level `josiedb` handle, which is
    opened here and always closed before returning, even if an insert fails.

    Args:
        queries: mapping from set id to the token ids forming that query.
        dbname: database name passed to the sampling command (`--pg-database`).
        tables_prefix: prefix of the JOSIE tables; rows are inserted into
            `{tables_prefix}_queries`.

    Raises:
        KeyError: if the `GOPATH` environment variable is not set.
        subprocess.CalledProcessError: if the sampling command exits with a
            non-zero status.

    Side effects:
        Changes the process working directory to the JOSIE `cmd` directory.
    """
    import subprocess

    # Resolved first so that a missing GOPATH fails before the database is touched.
    GOPATH = os.environ['GOPATH']
    josie_cmd_dir = f'{GOPATH}/src/github.com/ekzhu/josie/cmd'

    josiedb.open()
    try:
        josiedb.clear_query_table()
        for table_id, tokens_ids in queries.items():
            # Values are bound as parameters; the token ids are sent as a list,
            # which also covers an empty token set (a bare `ARRAY[]` is invalid SQL).
            josiedb._dbconn.execute(
                f"INSERT INTO {tables_prefix}_queries VALUES (%s, %s);",
                (table_id, list(tokens_ids)),
            )
        josiedb._dbconn.commit()

        # if cost sampling tables already exist we assume they are correct and won't recreate them
        sample_costs_tables_exist = josiedb.cost_tables_exist()
    finally:
        josiedb.close()

    os.chdir(josie_cmd_dir)

    if not sample_costs_tables_exist:
        # Argument list (no shell) and check=True so a failed sampling run is not silently ignored.
        subprocess.run(
            [
                'go', 'run', f'{josie_cmd_dir}/sample_costs/main.go',
                f'--pg-database={dbname}',
                f'--test_tag={tables_prefix}',
                f'--pg-table-queries={tables_prefix}_queries',
            ],
            check=True,
        )

In [23]:
def query(results_file, k, results_directory, dbname, tables_prefix, token_table_on_memory=False):
    """Run the JOSIE `topk` command for the queries currently stored in the database.

    The query preparation steps are deliberately not part of this function,
    since in some cases they also include the cost sampling phase and in
    other cases they do not.

    Args:
        results_file: path passed as `--resultsFile`.
        k: value passed as `--k`.
        results_directory: path passed as `--outputDir`.
        dbname: database name passed as `--pg-database`.
        tables_prefix: value passed as `--test_tag`.
        token_table_on_memory: passed as `--useMemTokenTable` ('true'/'false');
            defaults to False, the value previously hard-coded.

    Raises:
        KeyError: if the `GOPATH` environment variable is not set.
        subprocess.CalledProcessError: if the command exits with a non-zero status.

    Side effects:
        Changes the process working directory to the JOSIE `cmd` directory.
    """
    import subprocess

    GOPATH = os.environ['GOPATH']
    josie_cmd_dir = f'{GOPATH}/src/github.com/ekzhu/josie/cmd'
    os.chdir(josie_cmd_dir)

    use_mem_token_table = 'true' if token_table_on_memory else 'false'

    # Argument list (no shell) and check=True so a failed run is not silently ignored.
    subprocess.run(
        [
            'go', 'run', f'{josie_cmd_dir}/topk/main.go',
            f'--pg-database={dbname}',
            f'--test_tag={tables_prefix}',
            f'--outputDir={results_directory}',
            f'--resultsFile={results_file}',
            f'--useMemTokenTable={use_mem_token_table}',
            f'--k={k}',
            '--verbose=false',
        ],
        check=True,
    )

In [24]:
import re

def josie_multi_query(queries: dict[int, set[int]], k, results_file, dbname, tables_prefix):
    """Store all the given queries and run JOSIE on them in one batch.

    Args:
        queries: mapping from set id to the token ids forming that query.
        k: number of results requested per query (forwarded to `query`).
        results_file: path of the results file; its parent directory is
            used as the output directory.
        dbname: database name forwarded to the JOSIE commands.
        tables_prefix: prefix of the JOSIE tables.

    Returns:
        None; the results file is not read back here.
    """
    results_directory = os.path.dirname(results_file)

    create_query_table(queries, dbname, tables_prefix)
    query(results_file, k, results_directory, dbname, tables_prefix)



def josie_single_query(set_id, token_set, k, results_file, dbname, tables_prefix):
    """Run JOSIE for a single query set and parse the first row of the results file.

    Every run of digits found in the first value of the `results` column is
    converted to int; the numbers at even positions and those at odd
    positions are returned as two separate lists rendered with `str()`.
    Judging by the original helper names these are the result ids and their
    overlaps; confirm against the results file format.

    Returns:
        A pair `(resids, resov)` of strings, each the `str()` of a list of ints.
    """
    results_directory = os.path.dirname(results_file)

    create_query_table({set_id: token_set}, dbname, tables_prefix)
    query(results_file, k, results_directory, dbname, tables_prefix)

    first_result = pd.read_csv(results_file)['results'].values[0]

    # One regex pass; even and odd positions are split afterwards.
    numbers = [int(digits) for digits in re.findall(r'\d+', first_result)]
    resids = str(numbers[::2])
    resov = str(numbers[1::2])
    return resids, resov

In [25]:
# Run all the collected queries through JOSIE in a single batch.
# NOTE(review): the literal 5 is passed as k here, while the configuration
# cell sets k = 10 — confirm which value is intended.
josie_multi_query(queries, 5, results_file, dbname, tables_prefix)

2024/08/29 18:12:59 ==== Begin experiments for k = 5
2024/08/29 18:12:59 Running algorithm [merge_probe_cost_model_greedy], output to /data4/nanni/tesi-magistrale/notebooks/results.csv


5 / 5 queries


2024/08/29 18:13:00 Finished all queries in 0 minutes


In [None]:
# Run JOSIE on the first (set id, token ids) pair only.
first_set_id, first_token_set = list(queries.items())[0]
results = josie_single_query(first_set_id, first_token_set, k, results_file, dbname, tables_prefix)