In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
# Placeholder values for this notebook's parameters. Every one of them is
# reassigned by the "# Parameters" cell that follows, so the empty defaults
# here only make the names exist.
# NOTE(review): this looks like a papermill-style parameters cell -- confirm.
#
# Example values for a manual run (the template is a pickle, because it is
# read back with pickle.load further down):
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
#templatedf_path = f"./outputs/{EXPERIMENT}-step1/template.pkl"
EXPERIMENT = ""
INPUT_ZIP = ""
OUTPUT_DIR = ""
INDEX_COLS = []
OUTPUT_COLS = []
templatedf_path = ""

In [3]:
# Parameters
# Concrete values for this run; they replace the placeholder defaults assigned
# in the previous cell.
EXPERIMENT = "ap-duckdb-q10-customer"
# Directory the reconstructed output is written to (later zipped next to it).
OUTPUT_DIR = "./outputs/ap-duckdb-q10-customer-step3"
# Archive of numbered JSON files read by the loading cell.
INPUT_ZIP = "./outputs/ap-duckdb-q10-customer-step2.zip"
# Columns that identify a row of the template.
INDEX_COLS = ["c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"]
# Columns whose cell values are filled in from the JSON entries.
OUTPUT_COLS = ["revenue"]
# Pickled template DataFrame (read with pickle.load further down).
templatedf_path = "./outputs/ap-duckdb-q10-customer-step1/template.pkl"


In [4]:
# If no index cols are provided, step 1 makes one up with an empty-string name,
# so fall back to that same single column here.
INDEX_COLS = INDEX_COLS or [""]

# exist_ok=True replaces the separate os.path.exists() check: there is no
# window between checking and creating, and re-running the cell is a no-op.
# A regular file already sitting at OUTPUT_DIR now fails here instead of later.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Import saved variables from the first step.
# The file is opened in a `with` block so the handle is closed as soon as the
# template has been read (a bare open() inside pickle.load leaves it dangling).
# NOTE: pickle.load can execute arbitrary code -- only point templatedf_path at
# files produced by this pipeline.
with open(templatedf_path, 'rb') as template_file:
    templatedf: pl.DataFrame = pickle.load(template_file)

In [6]:
# Load the JSON input files from step 2 into `alldata`, keyed by the integer
# in each member's file name ("<n>.json"). Each file holds one object:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': [ 1000x values ] }   (only the first element is read further down)
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if not filename.endswith('.json'):
            continue
        # pattern.parse() returns None when the name does not match, and int()
        # rejects non-numeric stems; report both with the offending member name
        # instead of an anonymous AttributeError/ValueError.
        parsed = pattern.parse(filename)
        if parsed is None:
            raise ValueError(f"unexpected JSON member name in {INPUT_ZIP!r}: {filename!r}")
        try:
            filenumber = int(parsed.named['n'])
        except ValueError as err:
            raise ValueError(
                f"unexpected JSON member name in {INPUT_ZIP!r}: {filename!r}"
            ) from err
        with zf.open(filename) as f:
            alldata[filenumber] = json.load(f)
alldata.keys()

dict_keys([20, 61, 36, 41, 16, 57, 6, 7, 56, 17, 40, 37, 60, 21, 47, 10, 51, 0, 26, 30, 31, 27, 1, 50, 11, 46, 2, 28, 53, 12, 45, 32, 24, 49, 48, 25, 33, 44, 13, 52, 29, 3, 34, 8, 22, 59, 18, 38, 4, 55, 14, 43, 42, 15, 54, 5, 39, 19, 58, 23, 62, 9, 35])

In [7]:
# Make sure the data types of the row fields are correct: each entry's 'row'
# dict (loaded from JSON) is cast to the dtypes of the template's index columns.
# The target schema does not change between entries, so compute it once.
index_schema = templatedf.select(INDEX_COLS).schema

# Walk the entries themselves rather than range(len(alldata)): the keys come
# from file names, and a gap in the numbering would otherwise raise KeyError.
for adentry in alldata.values():
    adentry['row'] = pl.DataFrame(adentry['row']).cast(index_schema).to_dicts()[0]

In [8]:
# One row per group: the template restricted to its index columns. The row
# order of this frame is what the row positions computed below refer to.
allgroups = templatedf.select(INDEX_COLS)
# Bare last expression so the notebook renders the frame.
allgroups

c_custkey,c_name,c_acctbal,c_phone,n_name,c_address,c_comment
i64,str,"decimal[15,2]",str,str,str,str
8242,"""Customer#000008242""",6322.09,"""15-792-676-1184""","""ETHIOPIA""","""cYDWDiJt06B8CYzXX2L8x2hn1VFG""",""" regular theodolites affix. ca…"
2455,"""Customer#000002455""",2070.99,"""17-946-225-9977""","""GERMANY""","""a5DZ199yfAcFhfi2uwBE PKo,Z""","""pinto beans alongside of the f…"
1966,"""Customer#000001966""",1937.72,"""10-973-269-8886""","""ALGERIA""","""IbwZr7j QVifqf9WizOIWx,UXV9Cqx…","""odolites across the unusual ac…"
1565,"""Customer#000001565""",1820.03,"""12-402-178-2007""","""BRAZIL""","""n4acVpG0Deyj5aIFAfSNg Iu9cUagw…","""deposits; unusual, bold deposi…"
14398,"""Customer#000014398""",-602.24,"""34-814-111-5424""","""UNITED STATES""","""l49oKjbjQHz6YZwjo5wPihM lyYO6G""","""es haggle fluffily blithely fl…"
…,…,…,…,…,…,…
12226,"""Customer#000012226""",1850.48,"""19-265-644-3796""","""INDONESIA""","""08fy8Pc0NmrqAiAYPZuOOD55dV3tbO…",""". quickly bold theodolites gro…"
3541,"""Customer#000003541""",7052.19,"""26-551-286-8801""","""MOZAMBIQUE""","""ye4dLVD7hS2cbIL956lC""","""ular ideas wake bold, unusual …"
11866,"""Customer#000011866""",3380.37,"""33-807-163-1247""","""UNITED KINGDOM""","""hdAdp5v,AYqoX24svKZw1UGEXmQ""","""ep at the furiously final requ…"
9986,"""Customer#000009986""",-196.72,"""26-349-647-1183""","""MOZAMBIQUE""","""yKAZSS,DElfPsdFKAqaNz0I""","""quickly furiously regular requ…"


In [9]:
# Data shuffling to re-identify what row goes where, what column goes where,
# and what the labels should be on everything.

# Output column name -> its position in templatedf.
colidxes = {col: templatedf.get_column_index(col) for col in OUTPUT_COLS}

# Group (tuple of index-column values) -> its row position.
# The rows iterated here are the rows of templatedf.select(INDEX_COLS), which
# is exactly what `allgroups` holds, so a row's position is its enumeration
# order. This replaces one full-frame filter per row (quadratic in the number
# of groups), and it also copes with null index values, which an equality
# filter never matches.
rowidxes = {}
for rowidx, row in enumerate(templatedf.select(INDEX_COLS).iter_rows(named=True)):
    rowid = tuple(row.values())
    if rowid in rowidxes:
        # The filter-based lookup also failed on duplicated groups (its
        # .item() call requires exactly one match), so keep that guard.
        raise ValueError(f"duplicate group in template: {row}")
    rowidxes[rowid] = rowidx
    print(row)

# One record per loaded entry, carrying both labels and positions.
# NOTE(review): the row-id tuple relies on adentry['row'] listing its keys in
# INDEX_COLS order -- confirm against what step 2 writes.
allinfo = [
    {"colname": adentry['col'],
     "rowid": tuple(adentry['row'].values()),
     "value": adentry['value'][0],
     "colidx": colidxes[adentry['col']],
     "rowidx": rowidxes[tuple(adentry['row'].values())]}
    for adentry in alldata.values()
]

# Reverse lookups: position -> label.
colnames = {entry['colidx']: entry['colname'] for entry in allinfo}
rownames = {entry['rowidx']: entry['rowid'] for entry in allinfo}

{'c_custkey': 8242, 'c_name': 'Customer#000008242', 'c_acctbal': Decimal('6322.09'), 'c_phone': '15-792-676-1184', 'n_name': 'ETHIOPIA', 'c_address': 'cYDWDiJt06B8CYzXX2L8x2hn1VFG', 'c_comment': ' regular theodolites affix. carefully ironic packages cajole deposits; slyly ironic packages wake quickly. regular,'}
{'c_custkey': 2455, 'c_name': 'Customer#000002455', 'c_acctbal': Decimal('2070.99'), 'c_phone': '17-946-225-9977', 'n_name': 'GERMANY', 'c_address': 'a5DZ199yfAcFhfi2uwBE PKo,Z', 'c_comment': 'pinto beans alongside of the furiously ironic asymptotes are quickly even platelets: express'}
{'c_custkey': 1966, 'c_name': 'Customer#000001966', 'c_acctbal': Decimal('1937.72'), 'c_phone': '10-973-269-8886', 'n_name': 'ALGERIA', 'c_address': 'IbwZr7j QVifqf9WizOIWx,UXV9CqxUyrwj', 'c_comment': 'odolites across the unusual accounts hang carefully furiously bold excuses. regular pi'}
{'c_custkey': 1565, 'c_name': 'Customer#000001565', 'c_acctbal': Decimal('1820.03'), 'c_phone': '12-402-178

Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS

# rows() hands back native Python values -- the same representation
# iter_rows() produced when `rowidxes` was keyed -- whereas a round-trip
# through to_numpy() may coerce dtypes and break the tuple lookup below.
allrows = [list(group) for group in allgroups.select(INDEX_COLS).rows()]

# (row position, column position) -> first value recorded for that cell.
allinfo2 = {
    (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']]): adentry['value'][0]
    for adentry in alldata.values()
}

df2 = []
print(allcols)
for row in allrows:
    rowidx = rowidxes[tuple(row)]
    # Build the full row once; cells with no entry from step 2 stay None.
    fullrow = row + [allinfo2.get((rowidx, colidxes[col]), None) for col in OUTPUT_COLS]
    print(fullrow)
    df2.append(fullrow)

['c_custkey', 'c_name', 'c_acctbal', 'c_phone', 'n_name', 'c_address', 'c_comment', 'revenue']
[8242, 'Customer#000008242', Decimal('6322.09'), '15-792-676-1184', 'ETHIOPIA', 'cYDWDiJt06B8CYzXX2L8x2hn1VFG', ' regular theodolites affix. carefully ironic packages cajole deposits; slyly ironic packages wake quickly. regular,', 622786.7297]
[2455, 'Customer#000002455', Decimal('2070.99'), '17-946-225-9977', 'GERMANY', 'a5DZ199yfAcFhfi2uwBE PKo,Z', 'pinto beans alongside of the furiously ironic asymptotes are quickly even platelets: express', 481592.4053]
[1966, 'Customer#000001966', Decimal('1937.72'), '10-973-269-8886', 'ALGERIA', 'IbwZr7j QVifqf9WizOIWx,UXV9CqxUyrwj', 'odolites across the unusual accounts hang carefully furiously bold excuses. regular pi', 444059.0382]
[1565, 'Customer#000001565', Decimal('1820.03'), '12-402-178-2007', 'BRAZIL', 'n4acVpG0Deyj5aIFAfSNg Iu9cUagwN3OsRbKC 4', 'deposits; unusual, bold deposits around the f', 412506.00619999995]
[14398, 'Customer#000014398', D

In [11]:
# Preview the reconstructed table: build a frame from the row lists in df2,
# label the columns with allcols, and cast everything to the template's dtypes.
# NOTE(review): the recorded outputs suggest the float -> decimal cast truncates
# rather than rounds (412506.00619999995 is displayed as 412506.0061) -- confirm
# whether the values should be rounded to the target scale before casting.
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema)

c_custkey,c_name,c_acctbal,c_phone,n_name,c_address,c_comment,revenue
i64,str,"decimal[15,2]",str,str,str,str,"decimal[38,4]"
8242,"""Customer#000008242""",6322.09,"""15-792-676-1184""","""ETHIOPIA""","""cYDWDiJt06B8CYzXX2L8x2hn1VFG""",""" regular theodolites affix. ca…",622786.7297
2455,"""Customer#000002455""",2070.99,"""17-946-225-9977""","""GERMANY""","""a5DZ199yfAcFhfi2uwBE PKo,Z""","""pinto beans alongside of the f…",481592.4053
1966,"""Customer#000001966""",1937.72,"""10-973-269-8886""","""ALGERIA""","""IbwZr7j QVifqf9WizOIWx,UXV9Cqx…","""odolites across the unusual ac…",444059.0382
1565,"""Customer#000001565""",1820.03,"""12-402-178-2007""","""BRAZIL""","""n4acVpG0Deyj5aIFAfSNg Iu9cUagw…","""deposits; unusual, bold deposi…",412506.0061
14398,"""Customer#000014398""",-602.24,"""34-814-111-5424""","""UNITED STATES""","""l49oKjbjQHz6YZwjo5wPihM lyYO6G""","""es haggle fluffily blithely fl…",408575.3600
…,…,…,…,…,…,…,…
12226,"""Customer#000012226""",1850.48,"""19-265-644-3796""","""INDONESIA""","""08fy8Pc0NmrqAiAYPZuOOD55dV3tbO…",""". quickly bold theodolites gro…",323369.1017
3541,"""Customer#000003541""",7052.19,"""26-551-286-8801""","""MOZAMBIQUE""","""ye4dLVD7hS2cbIL956lC""","""ular ideas wake bold, unusual …",319976.9349
11866,"""Customer#000011866""",3380.37,"""33-807-163-1247""","""UNITED KINGDOM""","""hdAdp5v,AYqoX24svKZw1UGEXmQ""","""ep at the furiously final requ…",316999.3436
9986,"""Customer#000009986""",-196.72,"""26-349-647-1183""","""MOZAMBIQUE""","""yKAZSS,DElfPsdFKAqaNz0I""","""quickly furiously regular requ…",316228.2307


In [12]:
# Rebuild the typed frame from the reconstructed rows and persist it as
# output.csv inside OUTPUT_DIR.
output_csv_path = os.path.join(OUTPUT_DIR, 'output.csv')
reconstructed = pl.DataFrame(df2, schema=allcols, orient='row')
reconstructed_typed = reconstructed.cast(templatedf.schema)
reconstructed_typed.write_csv(output_csv_path)

In [13]:
# Pack everything under OUTPUT_DIR into "<OUTPUT_DIR>.zip"; the call returns
# the archive's path, which the notebook shows as the cell result.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q10-customer-step3.zip'