In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
# Default (empty) values for the notebook parameters.
# NOTE(review): these look like placeholders that the "# Parameters" cell below
# overrides with real values — confirm how the notebook is launched.
#
# Example values for running the notebook by hand:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
# templatedf_path must point at a pickled DataFrame (it is read with pickle.load):
#templatedf_path = f"./outputs/{EXPERIMENT}-step1/template.pkl"
EXPERIMENT = ""
INPUT_ZIP = ""
OUTPUT_DIR = ""
INDEX_COLS = []
OUTPUT_COLS = []
templatedf_path = ""

In [3]:
# Parameters
# Concrete settings for this run; they replace the empty defaults assigned in
# the previous cell.
EXPERIMENT = "ap-duckdb-q1"
# Directory that receives output.csv and is zipped at the end of the notebook.
OUTPUT_DIR = "./outputs/ap-duckdb-q1-step3"
# Zip archive holding the "<n>.json" files that are loaded further down.
INPUT_ZIP = "./outputs/ap-duckdb-q1-step2.zip"
# Columns of the template frame that identify a row (group).
INDEX_COLS = ["l_returnflag", "l_linestatus"]
# Columns of the template frame whose values are reconstructed from the JSON entries.
OUTPUT_COLS = ["sum_qty", "sum_base_price", "sum_disc_price", "sum_charge", "avg_qty", "avg_price", "avg_disc", "count_order"]
# Pickled polars DataFrame providing row order, column order and dtypes.
templatedf_path = "./outputs/ap-duckdb-q1-step1/template.pkl"


In [4]:
# Create the output directory (including missing parents). exist_ok=True makes
# the cell safe to re-run and avoids the race between checking for the
# directory and creating it; it also fails loudly if OUTPUT_DIR exists but is
# not a directory.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Import saved variables from the first step.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so
# templatedf_path must only ever point at a trusted file.
# The context manager guarantees the file handle is closed after loading.
with open(templatedf_path, 'rb') as template_file:
    templatedf: pl.DataFrame = pickle.load(template_file)

In [6]:
# Read every "<n>.json" member of the step 2 archive into alldata, keyed by the
# integer n taken from the member name. Later cells read these keys from each
# decoded entry:
#   'col'   -> name of the output column the entry belongs to
#   'row'   -> dict of group-by column values (effectively a row id)
#   'value' -> sequence of values, of which only the first element is used
alldata = {}
pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as archive:
    json_members = [name for name in archive.namelist() if name.endswith('.json')]
    for member in json_members:
        with archive.open(member) as handle:
            # The numeric part of the file name is the entry's key
            entry_number = int(pattern.parse(member).named['n'])
            alldata[entry_number] = json.load(handle)
alldata.keys()

dict_keys([20, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 24, 25, 13, 29, 3, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9])

In [7]:
# Make sure the data types of the row fields are correct: round-trip each
# entry's 'row' dict through a one-row DataFrame cast to the template's schema
# for INDEX_COLS.
# The schema is loop-invariant, so it is computed once. Iterating over the
# values (rather than range(len(alldata))) keeps this working when the file
# numbers are not exactly 0..n-1, e.g. when an input file is missing.
index_schema = templatedf.select(INDEX_COLS).schema
for adentry in alldata.values():
    adentry['row'] = pl.DataFrame(adentry['row']).cast(index_schema).to_dicts()[0]

In [8]:
# The group (row) identifiers: the INDEX_COLS of the template frame, kept in
# the template's row order. Displayed as the cell output for inspection.
allgroups = templatedf.select(INDEX_COLS)
allgroups

l_returnflag,l_linestatus
str,str
"""A""","""F"""
"""N""","""F"""
"""R""","""F"""
"""N""","""O"""


In [9]:
# Data shuffling to re-identify what row goes where, what column goes where,
# and what the labels should be on everything.

# Output column name -> position of that column in the template frame
colidxes = {col: templatedf.get_column_index(col) for col in OUTPUT_COLS}

# Group key tuple (values in INDEX_COLS order) -> position of that group in
# allgroups. A row's position is its index, so enumerate replaces a per-row
# filter over the whole frame (which was quadratic and could not match null keys).
rowidxes = {}
for rowidx, row in enumerate(allgroups.iter_rows(named=True)):
    rowkey = tuple(row.values())
    if rowkey in rowidxes:
        # A repeated key would make the row lookup ambiguous
        raise ValueError(f"duplicate group key in template: {rowkey!r}")
    rowidxes[rowkey] = rowidx
    print(row)

# One record per loaded entry with its labels and positional indices.
# The row id is built in INDEX_COLS order so it matches the rowidxes keys
# regardless of the key order inside the JSON 'row' dict.
allinfo = []
for adentry in alldata.values():
    rowid = tuple(adentry['row'][key] for key in INDEX_COLS)
    allinfo.append({
        "colname": adentry['col'],
        "rowid": rowid,
        "value": adentry['value'][0],  # only the first value of the entry is used
        "colidx": colidxes[adentry['col']],
        "rowidx": rowidxes[rowid],
    })

# Reverse lookups: positional index -> label
colnames = {entry['colidx']: entry['colname'] for entry in allinfo}
rownames = {entry['rowidx']: entry['rowid'] for entry in allinfo}

{'l_returnflag': 'A', 'l_linestatus': 'F'}
{'l_returnflag': 'N', 'l_linestatus': 'F'}
{'l_returnflag': 'R', 'l_linestatus': 'F'}
{'l_returnflag': 'N', 'l_linestatus': 'O'}


Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
# Take the group keys straight from polars: a NumPy round trip can coerce the
# key values to a common dtype, after which they may no longer match the
# tuples used as rowidxes keys.
allrows = [list(group) for group in allgroups.select(INDEX_COLS).iter_rows()]

# (row position, column position) -> first value of the matching entry.
# The row id is read in INDEX_COLS order so it does not depend on the key
# order inside the JSON 'row' dict.
allinfo2 = {
    (
        rowidxes[tuple(adentry['row'][key] for key in INDEX_COLS)],
        colidxes[adentry['col']],
    ): adentry['value'][0]
    for adentry in alldata.values()
}

df2 = []
print(allcols)
for row in allrows:
    rowidx = rowidxes[tuple(row)]
    # Cells without a loaded entry are left as None
    fullrow = row + [allinfo2.get((rowidx, colidxes[col])) for col in OUTPUT_COLS]
    print(fullrow)
    df2.append(fullrow)

['l_returnflag', 'l_linestatus', 'sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']
['A', 'F', 3899734.145193726, 5292376241.022015, 5249520169.545148, 5197640048.900388, 25.44357219851789, 35933.47039305396, 0.05047187385032928, 144796.86598203142]
['N', 'F', 89605.74542453185, 134078673.37070186, 124484291.47637552, 136043535.2070816, 24.95430638921443, 35310.206028922425, 0.04897543094168873, 3718.9750592048886]
['R', 'F', 3768367.911042263, 5380117595.907811, 4855004241.02617, 5315169810.653909, 25.4907102085246, 36030.09581759708, 0.050069656855529075, 142743.017310443]
['N', 'O', 7601318.569778294, 11031614073.98362, 9994419779.33467, 10582435008.250677, 25.6047321386023, 35988.60840564336, 0.05012341744739862, 291610.7385911932]


In [11]:
# Build the reconstructed table row-wise and coerce it to the template's dtypes.
# NOTE(review): the recorded output suggests the cast drops fractional digits
# instead of rounding (e.g. count_order 144796.87 -> 144796) — confirm this is intended.
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
)

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3899734.14,5292376241.02,5249520169.5451,5197640048.900388,25.443572,35933.470393,0.050472,144796
"""N""","""F""",89605.74,134078673.37,124484291.4763,136043535.207081,24.954306,35310.206029,0.048975,3718
"""R""","""F""",3768367.91,5380117595.9,4855004241.0261,5315169810.653909,25.49071,36030.095818,0.05007,142743
"""N""","""O""",7601318.56,11031614073.98,9994419779.3346,10582435008.250677,25.604732,35988.608406,0.050123,291610


In [12]:
# Rebuild the table with the template's dtypes and save it as output.csv in OUTPUT_DIR
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
    .write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))
)

In [13]:
# Pack the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip"; the path of the
# created archive is the cell's output.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q1-step3.zip'