In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
# Default parameter values. These are placeholders only: the "# Parameters"
# cell that follows reassigns every one of them (presumably injected by a
# notebook runner such as papermill -- confirm against the pipeline driver).
#
# Example values for running this notebook by hand:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
#templatedf_path = f"./outputs/{EXPERIMENT}-step1/template.pkl"  # must be a pickle: it is read with pickle.load
EXPERIMENT = ""  # experiment name (appears in the example paths above)
INPUT_ZIP = ""  # zip archive holding the step-2 "<n>.json" files
OUTPUT_DIR = ""  # directory that receives output.csv and is zipped at the end
INDEX_COLS = []  # names of the group-by (row identifying) columns of the template
OUTPUT_COLS = []  # names of the columns whose values are reconstructed
templatedf_path = ""  # path of the pickled template DataFrame

In [3]:
# Parameters
# Concrete values for this run; each assignment overrides the placeholder
# default given to the same name in the previous cell.
EXPERIMENT = "ap-duckdb-q15-customer"
OUTPUT_DIR = "./outputs/ap-duckdb-q15-customer-step3"
INPUT_ZIP = "./outputs/ap-duckdb-q15-customer-step2.zip"
INDEX_COLS = ["s_suppkey", "s_name", "s_address", "s_phone"]
OUTPUT_COLS = ["total_revenue"]
# Pickled template DataFrame (it is loaded with pickle.load further down).
templatedf_path = "./outputs/ap-duckdb-q15-customer-step1/template.pkl"


In [4]:
# If no index cols are provided, fall back to a single column whose name is
# the empty string (per the original note, step 1 makes that column up).
INDEX_COLS = INDEX_COLS or [""]

# exist_ok=True creates the directory only when it is missing, without the
# race between a separate existence check and the creation call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Import saved variables from the first step.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only point templatedf_path at files produced by a trusted step-1 run.
# The context manager closes the file handle once the template is loaded.
with open(templatedf_path, 'rb') as template_file:
    templatedf: pl.DataFrame = pickle.load(template_file)

In [6]:
# Read every "<n>.json" member of the step-2 archive into alldata, keyed by
# the integer <n> taken from the member name. Each JSON document is a dict:
#   'col':   column name as string
#   'row':   group-by column values as dict (effectively a row id)
#   'value': list of values (later cells read element 0; the original note
#            describes it as holding 1000 values)
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    json_members = [name for name in zf.namelist() if name.endswith('.json')]
    for member in json_members:
        with zf.open(member) as fh:
            # The number embedded in the member name becomes the dict key.
            filenumber = int(pattern.parse(member).named['n'])
            alldata[filenumber] = json.load(fh)
alldata.keys()

dict_keys([20, 77, 61, 36, 41, 16, 57, 82, 6, 7, 83, 56, 17, 40, 37, 60, 76, 21, 47, 10, 51, 84, 0, 26, 71, 67, 30, 31, 66, 70, 27, 1, 50, 11, 46, 69, 2, 28, 53, 12, 45, 32, 65, 73, 24, 49, 48, 25, 72, 64, 33, 44, 13, 52, 29, 68, 3, 34, 63, 8, 75, 22, 59, 18, 38, 4, 80, 79, 55, 14, 43, 42, 15, 54, 78, 81, 5, 39, 19, 58, 23, 74, 62, 9, 35])

In [7]:
# Make sure the data types of the row fields are correct: each entry's 'row'
# dict is cast to the dtypes the template uses for the index columns.
# The target schema is identical for every entry, so it is computed once.
index_schema = templatedf.select(INDEX_COLS).schema
# Iterate over the entries themselves instead of range(len(alldata)): the keys
# are file numbers and are not guaranteed to be contiguous starting at 0.
for adentry in alldata.values():
    adentry['row'] = pl.DataFrame(adentry['row']).cast(index_schema).to_dicts()[0]

In [8]:
# The index (group-by) columns of the template, one row per template row and
# in the template's row order; later cells use the row positions of this frame.
allgroups = templatedf.select(INDEX_COLS)
allgroups  # last expression: shown as the cell output

s_suppkey,s_name,s_address,s_phone
i64,str,str,str
913,"""Supplier#000000913""","""PTSEe31GjmUGy58DpRn""","""34-601-419-1634"""
474,"""Supplier#000000474""","""78jpeO3SIlDN6""","""10-327-319-7717"""
677,"""Supplier#000000677""","""TEJ LMkTnY5hp8aQckzyb""","""23-290-639-3315"""
378,"""Supplier#000000378""","""mLPJtpu4wOc cSFzBR""","""13-930-567-5190"""
220,"""Supplier#000000220""","""yw45wy2XyjPmY4XXHOcVYSvtU""","""29-197-645-8102"""
…,…,…,…
157,"""Supplier#000000157""","""1EmkCApL5iF""","""13-776-259-5994"""
402,"""Supplier#000000402""","""JR8vWoCteJtJg3okRpt0r28KEo""","""13-109-731-3195"""
496,"""Supplier#000000496""","""QMWQx2utAqLlVtzkXD34POTPn dKN1""","""30-810-880-3654"""
925,"""Supplier#000000925""","""vXDWBL0AYSnOG2c1UZXkUzMCqF7b83…","""29-398-723-8226"""


In [9]:
# Data shuffling to reidentify what row goes where, what column goes where,
# and what the labels should be on everything.

# Output column name -> positional index of that column in the template.
colidxes = {}
for col in OUTPUT_COLS:
    colidxes[col] = templatedf.get_column_index(col)

# Tuple of index-column values -> row position within allgroups.
# A single enumerate pass replaces filtering the whole frame once per row.
rowidxes = {}
for rowidx, row in enumerate(allgroups.iter_rows(named=True)):
    rowid = tuple(row.values())
    if rowid in rowidxes:
        # A repeated key would make the row position ambiguous; fail loudly.
        raise ValueError(f"duplicate group row in template: {row}")
    rowidxes[rowid] = rowidx
    print(row)

# One record per loaded JSON entry, annotated with its resolved positions.
# Only the first element of each entry's 'value' list is used.
allinfo = [
    {"colname": adentry['col'],
    "rowid": tuple(adentry['row'].values()),
    "value": adentry['value'][0],
    "colidx": colidxes[adentry['col']],
    "rowidx": rowidxes[tuple(adentry['row'].values())]}
    for adentry in alldata.values()
]

# Reverse lookups: position -> label, for the columns and rows that were seen.
colnames = {}
rownames = {}
for entry in allinfo:
    colnames[entry['colidx']] = entry['colname']
    rownames[entry['rowidx']] = entry['rowid']

{'s_suppkey': 913, 's_name': 'Supplier#000000913', 's_address': 'PTSEe31GjmUGy58DpRn', 's_phone': '34-601-419-1634'}
{'s_suppkey': 474, 's_name': 'Supplier#000000474', 's_address': '78jpeO3SIlDN6', 's_phone': '10-327-319-7717'}
{'s_suppkey': 677, 's_name': 'Supplier#000000677', 's_address': 'TEJ LMkTnY5hp8aQckzyb', 's_phone': '23-290-639-3315'}
{'s_suppkey': 378, 's_name': 'Supplier#000000378', 's_address': 'mLPJtpu4wOc cSFzBR', 's_phone': '13-930-567-5190'}
{'s_suppkey': 220, 's_name': 'Supplier#000000220', 's_address': 'yw45wy2XyjPmY4XXHOcVYSvtU', 's_phone': '29-197-645-8102'}
{'s_suppkey': 315, 's_name': 'Supplier#000000315', 's_address': 'k9fJuxApb0M0dHM7I7', 's_phone': '12-574-691-2832'}
{'s_suppkey': 641, 's_name': 'Supplier#000000641', 's_address': 's,fStDiv8ocWmhtn8irUQ5jRG', 's_phone': '27-393-904-4536'}
{'s_suppkey': 953, 's_name': 'Supplier#000000953', 's_address': 'w77P,JF6UdQQZ5f jHUG0I6yW8lleUAwy3ztsqr', 's_phone': '31-642-490-3022'}
{'s_suppkey': 238, 's_name': 'Supplier

Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
# iter_rows() yields plain Python values, the same representation the rowidxes
# keys were built from. Going through a NumPy array instead forces all key
# columns into one array dtype, which can alter values so that the lookup
# below no longer matches.
allrows = [list(row) for row in allgroups.select(INDEX_COLS).iter_rows()]

# (row position, column position) -> first value of the matching JSON entry.
allinfo2 = {
    (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']]): adentry['value'][0]
    for adentry in alldata.values()
}

df2 = []
print(allcols)
for row in allrows:
    rowidx = rowidxes[tuple(row)]
    # Cells with no matching JSON entry are left as None.
    outrow = row + [allinfo2.get((rowidx, colidxes[col])) for col in OUTPUT_COLS]
    print(outrow)
    df2.append(outrow)

['s_suppkey', 's_name', 's_address', 's_phone', 'total_revenue']
[913, 'Supplier#000000913', 'PTSEe31GjmUGy58DpRn', '34-601-419-1634', 740946.8968257306]
[474, 'Supplier#000000474', '78jpeO3SIlDN6', '10-327-319-7717', 908937.1094937591]
[677, 'Supplier#000000677', 'TEJ LMkTnY5hp8aQckzyb', '23-290-639-3315', 950510.9765868094]
[378, 'Supplier#000000378', 'mLPJtpu4wOc cSFzBR', '13-930-567-5190', 847088.4238866355]
[220, 'Supplier#000000220', 'yw45wy2XyjPmY4XXHOcVYSvtU', '29-197-645-8102', 899938.3756387155]
[315, 'Supplier#000000315', 'k9fJuxApb0M0dHM7I7', '12-574-691-2832', 959801.5519142305]
[641, 'Supplier#000000641', 's,fStDiv8ocWmhtn8irUQ5jRG', '27-393-904-4536', 975994.3403430603]
[953, 'Supplier#000000953', 'w77P,JF6UdQQZ5f jHUG0I6yW8lleUAwy3ztsqr', '31-642-490-3022', 770309.4147504745]
[238, 'Supplier#000000238', 'vH81dBrqiAF 5r6MutkO9,W92ds4', '18-950-354-3488', 881250.873261866]
[164, 'Supplier#000000164', 'Uj0,Jr1GFzWA58rVzHPD6DayBIeDkmxXKg368', '12-414-446-6598', 964269.50425

In [11]:
# Assemble the reconstructed rows into a frame and give every column the
# dtype it has in the template; the result is displayed as the cell output.
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
)

s_suppkey,s_name,s_address,s_phone,total_revenue
i64,str,str,str,"decimal[38,4]"
913,"""Supplier#000000913""","""PTSEe31GjmUGy58DpRn""","""34-601-419-1634""",740946.8968
474,"""Supplier#000000474""","""78jpeO3SIlDN6""","""10-327-319-7717""",908937.1094
677,"""Supplier#000000677""","""TEJ LMkTnY5hp8aQckzyb""","""23-290-639-3315""",950510.9765
378,"""Supplier#000000378""","""mLPJtpu4wOc cSFzBR""","""13-930-567-5190""",847088.4238
220,"""Supplier#000000220""","""yw45wy2XyjPmY4XXHOcVYSvtU""","""29-197-645-8102""",899938.3756
…,…,…,…,…
157,"""Supplier#000000157""","""1EmkCApL5iF""","""13-776-259-5994""",824768.0440
402,"""Supplier#000000402""","""JR8vWoCteJtJg3okRpt0r28KEo""","""13-109-731-3195""",889440.1401
496,"""Supplier#000000496""","""QMWQx2utAqLlVtzkXD34POTPn dKN1""","""30-810-880-3654""",880258.6398
925,"""Supplier#000000925""","""vXDWBL0AYSnOG2c1UZXkUzMCqF7b83…","""29-398-723-8226""",912189.7834


In [12]:
# Build the reconstructed frame with the template's dtypes and write it to
# output.csv inside OUTPUT_DIR.
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
    .write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))
)

In [13]:
# Pack the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip" next to the
# directory; the archive path returned by make_archive is the cell output.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q15-customer-step3.zip'