In [3]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

# Experiment identifier; it namespaces every input/output path of this step.
EXPERIMENT = "pac-duckdb-q1"
# Zip archive produced by step 2 (read by this notebook).
INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
# Directory this notebook writes its results into.
OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:
# Import saved variables from the first step.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load step-1
# artifacts that were produced by a trusted run of this pipeline.
def _load_step1_pickle(name):
    """Unpickle ./outputs/{EXPERIMENT}-step1/<name> and close the file afterwards."""
    with open(f'./outputs/{EXPERIMENT}-step1/{name}', 'rb') as fh:
        return pickle.load(fh)


INDEX_COLS = _load_step1_pickle('INDEX_COLS.pkl')
OUTPUT_COLS = _load_step1_pickle('OUTPUT_COLS.pkl')
templatedf: pl.DataFrame = _load_step1_pickle('template.pkl')

In [5]:
# load the json input files from step 2
# Each file is named "<n>.json" and holds one dict:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': [ 1000x values ] }   (the reconstruction cells read element 0)
# alldata maps the integer <n> from the file name to that dict.
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if not filename.endswith('.json'):
            continue
        # pattern.parse returns None when the name does not fit "<n>.json";
        # fail with an explicit message instead of an AttributeError on None.
        parsed = pattern.parse(filename)
        if parsed is None:
            raise ValueError(f"unexpected JSON member name in {INPUT_ZIP}: {filename!r}")
        filenumber = int(parsed.named['n'])
        with zf.open(filename) as f:
            alldata[filenumber] = json.load(f)
alldata.keys()

dict_keys([20, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 24, 25, 13, 29, 3, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9])

In [43]:
# Group-by key columns of the template, in template row order. The position of a
# key tuple within this frame is what later cells use as its numeric row index.
allgroups = templatedf.select(INDEX_COLS)
allgroups

l_returnflag,l_linestatus
str,str
"""A""","""F"""
"""N""","""F"""
"""N""","""O"""
"""R""","""F"""


In [44]:
# Data shuffling to reidentify what row goes where, what column goes where, and
# what the labels should be on everything.

# Output column name -> positional index of that column in the template.
colidxes = {col: templatedf.get_column_index(col) for col in OUTPUT_COLS}

# Group-key tuple -> positional row index in the template. allgroups is
# templatedf.select(INDEX_COLS), so a key's index is its enumeration position;
# this replaces a per-row with_row_index().filter(...) scan (quadratic in rows).
rowidxes = {}
for position, key in enumerate(allgroups.iter_rows()):
    if key in rowidxes:
        # Duplicate keys would make the row mapping ambiguous.
        raise ValueError(f"duplicate group key in template: {key!r}")
    rowidxes[key] = position

# One record per step-2 file: labels, numeric positions, and the first element
# of the file's 'value' list.
allinfo = []
for adentry in alldata.values():
    colname = adentry['col']
    # Relies on the JSON 'row' dict listing the group-by values in INDEX_COLS order.
    rowid = tuple(adentry['row'].values())
    allinfo.append({
        "colname": colname,
        "rowid": rowid,
        "value": adentry['value'][0],
        "colidx": colidxes[colname],
        "rowidx": rowidxes[rowid],
    })

# Reverse lookups: numeric position -> label.
colnames = {entry['colidx']: entry['colname'] for entry in allinfo}
rownames = {entry['rowidx']: entry['rowid'] for entry in allinfo}

Naive reconstruction

In [45]:
# Naive reconstruction: look every cell up by the (row index, column index)
# position its keys have in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
allrows = allgroups.select(INDEX_COLS).to_numpy().tolist()

# (rowidx, colidx) -> first element of the entry's 'value' list
allinfo2 = {}
for adentry in alldata.values():
    position = (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']])
    allinfo2[position] = adentry['value'][0]

df2 = []
print(allcols)
for row in allrows:
    rowidx = rowidxes[tuple(row)]
    # Group-key values followed by one value per output column (None if absent)
    record = row + [allinfo2.get((rowidx, colidxes[col])) for col in OUTPUT_COLS]
    print(record)
    df2.append(record)

['l_returnflag', 'l_linestatus', 'sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']
['A', 'F', 3829849.0071717314, 5341691787.088531, 5175989840.735145, 5276612160.76773, 25.56014338441755, 35839.2408068936, 0.050192019953040984, 151816.835989049]
['N', 'F', 97347.33313245185, 135703568.59810174, 117193423.87591256, 138330589.2666378, 25.332104261051793, 36405.65249584284, 0.05027824900344496, 3869.076405151929]
['N', 'O', 7510763.322264384, 10298337591.188448, 10111166463.948906, 10404723789.86984, 25.558339642977405, 35972.39254183539, 0.05027057440807528, 293855.90345317853]
['R', 'F', 3884912.441534467, 5261892931.38069, 4975409622.399012, 5434293589.666393, 25.45817600411522, 35939.572184965975, 0.04988005604003848, 149240.41118552018]


In [46]:
# Show the naive reconstruction as a frame cast to the template's schema
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
)

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3829849.0,5341691787.08,5175989840.7351,5276612160.76773,25.560143,35839.240807,0.050192,151816
"""N""","""F""",97347.33,135703568.59,117193423.8759,138330589.266637,25.332104,36405.652496,0.050278,3869
"""N""","""O""",7510763.32,10298337591.18,10111166463.9489,10404723789.86984,25.55834,35972.392542,0.050271,293855
"""R""","""F""",3884912.44,5261892931.38,4975409622.399,5434293589.666393,25.458176,35939.572185,0.04988,149240


In [47]:
# Write the naive reconstruction (cast to the template's schema) to output.csv
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
    .write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))
)

Polars-based reconstruction

In [48]:
# Build the table in its final shape using numeric positions only: one row per
# rowidx and one column per colidx (the pivot names each column after its colidx)
numericdf = (
    pl.DataFrame(allinfo)
    .select('rowidx', 'colidx', 'value')
    .sort(['colidx', 'rowidx'])
    .pivot(on='colidx', index='rowidx', values='value', maintain_order=True)
)
numericdf

rowidx,2,3,4,5,6,7,8,9
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3829800.0,5341700000.0,5176000000.0,5276600000.0,25.560143,35839.240807,0.050192,151816.835989
1,97347.333132,135700000.0,117190000.0,138330000.0,25.332104,36405.652496,0.050278,3869.076405
2,7510800.0,10298000000.0,10111000000.0,10405000000.0,25.55834,35972.392542,0.050271,293855.903453
3,3884900.0,5261900000.0,4975400000.0,5434300000.0,25.458176,35939.572185,0.04988,149240.411186


In [49]:
# Swap the numeric column labels for the real column names: append a named copy
# of every pivoted column, then drop the numerically-labelled originals
namedcolsdf = numericdf.with_columns(
    [pl.col(str(colidxes[c])).alias(colnames[colidxes[c]]) for c in OUTPUT_COLS]
).drop(
    [str(colidxes[c]) for c in OUTPUT_COLS]
)
namedcolsdf

rowidx,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3829800.0,5341700000.0,5176000000.0,5276600000.0,25.560143,35839.240807,0.050192,151816.835989
1,97347.333132,135700000.0,117190000.0,138330000.0,25.332104,36405.652496,0.050278,3869.076405
2,7510800.0,10298000000.0,10111000000.0,10405000000.0,25.55834,35972.392542,0.050271,293855.903453
3,3884900.0,5261900000.0,4975400000.0,5434300000.0,25.458176,35939.572185,0.04988,149240.411186


In [50]:
# Use the column names to insert this data into the template table, overwriting
# the empty columns that exist there.
# The value columns are attached positionally (as Series), so the rows must be in
# template order first: sort by rowidx and check every template row is present.
valuesdf = namedcolsdf.sort('rowidx')
assert valuesdf['rowidx'].to_list() == list(range(templatedf.height)), \
    "reconstructed rows do not line up one-to-one with the template rows"
outputdf = templatedf.with_columns(
    valuesdf.drop('rowidx').get_columns()
).cast(templatedf.schema)
outputdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3829849.0,5341691787.08,5175989840.7351,5276612160.76773,25.560143,35839.240807,0.050192,151816
"""N""","""F""",97347.33,135703568.59,117193423.8759,138330589.266637,25.332104,36405.652496,0.050278,3869
"""N""","""O""",7510763.32,10298337591.18,10111166463.9489,10404723789.86984,25.55834,35972.392542,0.050271,293855
"""R""","""F""",3884912.44,5261892931.38,4975409622.399,5434293589.666393,25.458176,35939.572185,0.04988,149240


In [51]:
# Bundle the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip" next to the directory
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/pac-duckdb-q1-step3.zip'