In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
#templatedf_path = f"./outputs/{EXPERIMENT}-step2/templatedf.csv"
EXPERIMENT = ""
INPUT_ZIP = ""
OUTPUT_DIR = ""
INDEX_COLS = []
OUTPUT_COLS = []
templatedf_path = ""

In [3]:
# Parameters
EXPERIMENT = "ap-duckdb-q2"
OUTPUT_DIR = "./outputs/ap-duckdb-q2-step3"
INPUT_ZIP = "./outputs/ap-duckdb-q2-step2.zip"
INDEX_COLS = ["s_name", "p_partkey", "n_name", "s_acctbal"]
OUTPUT_COLS = ["min(ps_supplycost)"]
templatedf_path = "./outputs/ap-duckdb-q2-step1/template.pkl"


In [4]:
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [5]:
# Import saved variables from the first step
templatedf: pl.DataFrame = pickle.load(open(templatedf_path, 'rb'))

In [6]:
# load the json input files from step 2
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'values': [ 1000x values ] }
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if filename.endswith('.json'):
            with zf.open(filename) as f:
                filenumber = int(pattern.parse(filename).named['n'])
                data = json.load(f)
                d: dict = data
                alldata[filenumber] = d
alldata.keys()

dict_keys([20, 61, 36, 41, 16, 57, 6, 7, 56, 17, 40, 37, 60, 21, 47, 10, 51, 0, 26, 30, 31, 27, 1, 50, 11, 46, 2, 28, 53, 12, 45, 32, 24, 49, 48, 25, 33, 44, 13, 52, 29, 3, 34, 8, 22, 59, 18, 38, 4, 55, 14, 43, 42, 15, 54, 5, 39, 19, 58, 23, 62, 9, 35])

In [7]:
# Make sure the data types of the row fields are correct
for i in range(len(alldata)):
    alldata[i]['row'] = pl.DataFrame(alldata[i]['row']).cast(templatedf.select(INDEX_COLS).schema).to_dicts()[0]

In [8]:
allgroups = templatedf.select(INDEX_COLS)
allgroups

s_name,p_partkey,n_name,s_acctbal
str,i64,str,"decimal[15,2]"
"""Supplier#000000574""",13784,"""RUSSIA""",8096.98
"""Supplier#000000949""",9430,"""UNITED KINGDOM""",91.39
"""Supplier#000000470""",6213,"""ROMANIA""",727.89
"""Supplier#000000138""",8363,"""ROMANIA""",906.07
"""Supplier#000000323""",3563,"""RUSSIA""",704.83
…,…,…,…
"""Supplier#000000574""",323,"""RUSSIA""",8096.98
"""Supplier#000000408""",18139,"""RUSSIA""",6173.87
"""Supplier#000000957""",10956,"""UNITED KINGDOM""",4324.51
"""Supplier#000000384""",13120,"""GERMANY""",1342.17


In [9]:
# Cursed data shuffling to reidentify what row goes where, what column goes where, and what the labels should be on everything
colidxes = {}
for col in OUTPUT_COLS:
    colidxes[col] = templatedf.get_column_index(col)
rowidxes = {}
for row in templatedf.select(INDEX_COLS).iter_rows(named=True):
    rowidxes[tuple(row.values())] = allgroups.with_row_index().filter(
        pl.col(k).eq(v)
        for k, v in row.items()
    ).select("index").item()
    print(row)

allinfo = [
    {"colname": adentry['col'],
    "rowid": tuple(adentry['row'].values()),
    "value": adentry['value'][0],
    "colidx": colidxes[adentry['col']],
    "rowidx": rowidxes[tuple(adentry['row'].values())]}
    for adidx, adentry in alldata.items()
]

colnames = {}
rownames = {}
for entry in allinfo:
    colnames[entry['colidx']] = entry['colname']
    rownames[entry['rowidx']] = entry['rowid']

{'s_name': 'Supplier#000000574', 'p_partkey': 13784, 'n_name': 'RUSSIA', 's_acctbal': Decimal('8096.98')}
{'s_name': 'Supplier#000000949', 'p_partkey': 9430, 'n_name': 'UNITED KINGDOM', 's_acctbal': Decimal('91.39')}
{'s_name': 'Supplier#000000470', 'p_partkey': 6213, 'n_name': 'ROMANIA', 's_acctbal': Decimal('727.89')}
{'s_name': 'Supplier#000000138', 'p_partkey': 8363, 'n_name': 'ROMANIA', 's_acctbal': Decimal('906.07')}
{'s_name': 'Supplier#000000323', 'p_partkey': 3563, 'n_name': 'RUSSIA', 's_acctbal': Decimal('704.83')}
{'s_name': 'Supplier#000000812', 'p_partkey': 10551, 'n_name': 'FRANCE', 's_acctbal': Decimal('8615.50')}
{'s_name': 'Supplier#000000290', 'p_partkey': 2037, 'n_name': 'FRANCE', 's_acctbal': Decimal('167.56')}
{'s_name': 'Supplier#000000509', 'p_partkey': 18972, 'n_name': 'FRANCE', 's_acctbal': Decimal('4315.15')}
{'s_name': 'Supplier#000000690', 'p_partkey': 9430, 'n_name': 'ROMANIA', 's_acctbal': Decimal('7448.46')}
{'s_name': 'Supplier#000000563', 'p_partkey': 5

Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
allrows = allgroups.select(INDEX_COLS).to_numpy().tolist()

allinfo2 = {
    (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']]): adentry['value'][0]
    for adidx, adentry in alldata.items()
}

df2 = []
print(allcols)
for row in allrows:
    print(row + [allinfo2.get((rowidxes[tuple(row)], colidxes[col]), None) for col in OUTPUT_COLS])
    df2.append(row + [allinfo2.get((rowidxes[tuple(row)], colidxes[col]), None) for col in OUTPUT_COLS])

['s_name', 'p_partkey', 'n_name', 's_acctbal', 'min(ps_supplycost)']
['Supplier#000000574', 13784, 'RUSSIA', Decimal('8096.98'), 398.27000000000004]
['Supplier#000000949', 9430, 'UNITED KINGDOM', Decimal('91.39'), 382.5200000000002]
['Supplier#000000470', 6213, 'ROMANIA', Decimal('727.89'), 601.78]
['Supplier#000000138', 8363, 'ROMANIA', Decimal('906.07'), 337.5100000000001]
['Supplier#000000323', 3563, 'RUSSIA', Decimal('704.83'), 651.3099999999997]
['Supplier#000000812', 10551, 'FRANCE', Decimal('8615.50'), 259.65999999999997]
['Supplier#000000290', 2037, 'FRANCE', Decimal('167.56'), 339.06]
['Supplier#000000509', 18972, 'FRANCE', Decimal('4315.15'), 794.2600000000001]
['Supplier#000000690', 9430, 'ROMANIA', Decimal('7448.46'), 602.73]
['Supplier#000000563', 5797, 'GERMANY', Decimal('-942.73'), 628.11]
['Supplier#000000954', 4191, 'FRANCE', Decimal('6721.70'), 579.6800000000002]
['Supplier#000000149', 18344, 'FRANCE', Decimal('4518.31'), 712.75]
['Supplier#000000311', 13784, 'RUSSIA'

In [11]:
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema)

s_name,p_partkey,n_name,s_acctbal,min(ps_supplycost)
str,i64,str,"decimal[15,2]","decimal[15,2]"
"""Supplier#000000574""",13784,"""RUSSIA""",8096.98,398.27
"""Supplier#000000949""",9430,"""UNITED KINGDOM""",91.39,382.52
"""Supplier#000000470""",6213,"""ROMANIA""",727.89,601.78
"""Supplier#000000138""",8363,"""ROMANIA""",906.07,337.51
"""Supplier#000000323""",3563,"""RUSSIA""",704.83,651.30
…,…,…,…,…
"""Supplier#000000574""",323,"""RUSSIA""",8096.98,574.84
"""Supplier#000000408""",18139,"""RUSSIA""",6173.87,418.71
"""Supplier#000000957""",10956,"""UNITED KINGDOM""",4324.51,893.81
"""Supplier#000000384""",13120,"""GERMANY""",1342.17,280.36


In [12]:
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema).write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))

In [13]:
# zip the OUTPUT_DIR
shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q2-step3.zip'