In [1]:
#!/usr/bin/env python
# coding: utf-8

# Experiment identifier; it selects which step-1 / step-2 artifacts are read
# and names this step's outputs.
EXPERIMENT = 'pac-duckdb-q1'
# Zip archive of JSON files produced by step 2 (read further down).
INPUT_ZIP = f'./outputs/{EXPERIMENT}-step2.zip'
# Directory that receives this step's outputs; it is zipped at the end.
OUTPUT_DIR = f'./outputs/{EXPERIMENT}-step3'

import os
from typing import List
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import numpy as np
import pickle

import duckdb
import polars as pl
import pyarrow as pa

import zipfile
import numpy as np
import pickle
import json
import io
import parse
import shutil

In [2]:
# Import saved variables from the first step.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts written by this pipeline's own step 1.
def _load_step1_artifact(name: str):
    """Unpickle ./outputs/{EXPERIMENT}-step1/{name}.pkl, closing the file afterwards."""
    with open(f'./outputs/{EXPERIMENT}-step1/{name}.pkl', 'rb') as fh:
        return pickle.load(fh)

INDEX_COLS = _load_step1_artifact('INDEX_COLS')
OUTPUT_COLS = _load_step1_artifact('OUTPUT_COLS')
templatedf: pl.DataFrame = _load_step1_artifact('template')

In [3]:
# Load the JSON input files from step 2 into `alldata`, keyed by the integer
# in each file name ('<n>.json'). Each file is a dict of the form:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': [ 1000x values ] }
# NOTE: the key is 'value' (singular) -- that is what the cells below read.
alldata = {}

# '{n:d}' only matches an integer file stem and converts it to int directly.
pattern = parse.compile("{n:d}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if not filename.endswith('.json'):
            continue
        parsed = pattern.parse(filename)
        if parsed is None:
            # Fail loudly with the offending name instead of an opaque
            # AttributeError / int() error further down.
            raise ValueError(
                f"Unexpected JSON entry {filename!r} in {INPUT_ZIP}: expected '<number>.json'"
            )
        with zf.open(filename) as f:
            alldata[parsed['n']] = json.load(f)
alldata.keys()

dict_keys([20, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 24, 25, 13, 29, 3, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9])

In [4]:
# Re-identify which template row and column every step-2 entry belongs to,
# and remember the labels that go with each numeric index.

# Output column name -> position of that column in the template frame.
colidxes = {name: templatedf.get_column_index(name) for name in OUTPUT_COLS}

# Group key (tuple of INDEX_COLS values) -> running index. A key that was
# already seen keeps the index it was first given.
rowidxes = {}
for key in map(tuple, templatedf.select(INDEX_COLS).iter_rows()):
    rowidxes.setdefault(key, len(rowidxes))


def _describe_entry(adentry):
    """Return the labels and numeric indices for one step-2 entry."""
    colname = adentry['col']
    # The row id is the tuple of the entry's 'row' values, in the dict's own order.
    rowid = tuple(adentry['row'].values())
    return {
        "colname": colname,
        "rowid": rowid,
        # Only the first element of the 'value' list is used.
        "value": adentry['value'][0],
        "colidx": colidxes[colname],
        "rowidx": rowidxes[rowid],
    }


allinfo = [_describe_entry(adentry) for adentry in alldata.values()]

# Reverse lookups: numeric index -> label (the last entry seen for an index wins).
colnames = {info['colidx']: info['colname'] for info in allinfo}
rownames = {info['rowidx']: info['rowid'] for info in allinfo}

In [5]:
# The INDEX_COLS columns of the template frame, shown for inspection and
# reused by the naive reconstruction.
allgroups = templatedf.select(INDEX_COLS)
allgroups

l_returnflag,l_linestatus
str,str
"""A""","""F"""
"""N""","""F"""
"""N""","""O"""
"""R""","""F"""


Naive reconstruction

In [6]:
# Naive reconstruction: place every value by the (row index, column index)
# pair derived from the keys in templatedf.
allcols = INDEX_COLS + OUTPUT_COLS
allrows = allgroups.select(INDEX_COLS).to_numpy().tolist()

# (rowidx, colidx) -> first element of the entry's 'value' list.
allinfo2 = {}
for adentry in alldata.values():
    position = (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']])
    allinfo2[position] = adentry['value'][0]

df2 = []
print(allcols)
for row in allrows:
    # Group-key values followed by one value per output column (None if missing).
    filled = row + [
        allinfo2.get((rowidxes[tuple(row)], colidxes[col]), None)
        for col in OUTPUT_COLS
    ]
    print(filled)
    df2.append(filled)

['l_returnflag', 'l_linestatus', 'sum_qty', 'sum_base_price', 'sum_disc_price', 'sum_charge', 'avg_qty', 'avg_price', 'avg_disc', 'count_order']
['A', 'F', 3765750.2257570163, 5137245325.263337, 4998609024.848441, 5256155659.484106, 25.488795921119262, 36162.64533651841, 0.05047393470125876, 146189.78234473686]
['N', 'F', 96647.51381247856, 141523067.0339612, 117764517.78819464, 138186185.43609065, 25.254704447480368, 34378.06016462823, 0.0487409309957195, 3852.544705164627]
['N', 'O', 7719818.530798227, 11074547815.710772, 9707836156.304594, 10346463099.438492, 25.548613172353136, 36072.6559101019, 0.04998162105815481, 294007.6926948845]
['R', 'F', 3777723.381622972, 5237189609.883725, 5005372932.515666, 5376563959.094568, 25.62493209987229, 36145.4305258992, 0.049912532059403665, 147113.2510706484]


In [7]:
# Show the naive reconstruction as a frame cast to the template's schema.
# NOTE(review): the cast to i64 drops the fractional part of count_order
# (e.g. 146189.78 is shown as 146189) -- confirm truncation is intended.
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
)

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3765750.22,5137245325.26,4998609024.8484,5256155659.484106,25.488796,36162.645337,0.050474,146189
"""N""","""F""",96647.51,141523067.03,117764517.7881,138186185.43609,25.254704,34378.060165,0.048741,3852
"""N""","""O""",7719818.53,11074547815.71,9707836156.3045,10346463099.438492,25.548613,36072.65591,0.049982,294007
"""R""","""F""",3777723.38,5237189609.88,5005372932.5156,5376563959.094568,25.624932,36145.430526,0.049913,147113


In [8]:
# Write the naive reconstruction, cast to the template's schema, to
# output.csv inside OUTPUT_DIR.
(
    pl.DataFrame(df2, schema=allcols, orient='row')
    .cast(templatedf.schema)
    .write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))
)

Polars-based reconstruction

In [9]:
# Build the table in its final shape using only numeric indices: one row per
# rowidx and one column per colidx (pivot column names are the colidx values).
numericdf = (
    pl.DataFrame(allinfo)
    .select('rowidx', 'colidx', 'value')
    # Sort first so that, with maintain_order=True, the pivot output follows
    # this ordering.
    .sort('colidx', 'rowidx')
    .pivot(on='colidx', index='rowidx', values='value', maintain_order=True)
)
numericdf

rowidx,2,3,4,5,6,7,8,9
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3765800.0,5137200000.0,4998600000.0,5256200000.0,25.488796,36162.645337,0.050474,146189.782345
1,96647.513812,141520000.0,117760000.0,138190000.0,25.254704,34378.060165,0.048741,3852.544705
2,7719800.0,11075000000.0,9707800000.0,10346000000.0,25.548613,36072.65591,0.049982,294007.692695
3,3777700.0,5237200000.0,5005400000.0,5376600000.0,25.624932,36145.430526,0.049913,147113.251071


In [10]:
# Swap the numeric pivot column names for the actual column names: keep
# rowidx, then emit each pivot column under its real name, in OUTPUT_COLS order.
namedcolsdf = numericdf.select(
    'rowidx',
    *(
        pl.col(str(colidxes[name])).alias(colnames[colidxes[name]])
        for name in OUTPUT_COLS
    ),
)
namedcolsdf

rowidx,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3765800.0,5137200000.0,4998600000.0,5256200000.0,25.488796,36162.645337,0.050474,146189.782345
1,96647.513812,141520000.0,117760000.0,138190000.0,25.254704,34378.060165,0.048741,3852.544705
2,7719800.0,11075000000.0,9707800000.0,10346000000.0,25.548613,36072.65591,0.049982,294007.692695
3,3777700.0,5237200000.0,5005400000.0,5376600000.0,25.624932,36145.430526,0.049913,147113.251071


In [11]:
# Use the column names to insert this data into the template table,
# overwriting the empty columns that exist there.
#
# The values are inserted by row position, so first make sure row i of the
# values really is template row i: order by rowidx and require exactly
# 0..height-1. Otherwise a sparse or reordered pivot would be written onto the
# wrong groups without any error.
namedcols_sorted = namedcolsdf.sort('rowidx')
if namedcols_sorted['rowidx'].to_list() != list(range(templatedf.height)):
    raise ValueError(
        "Reconstructed row indices do not line up with the template rows: "
        f"got {namedcols_sorted['rowidx'].to_list()}, expected 0..{templatedf.height - 1}"
    )

outputdf = templatedf.with_columns(
    # Same-named columns replace the template's placeholders; rowidx is only
    # bookkeeping and is left out.
    namedcols_sorted.select(pl.all().exclude('rowidx'))
).cast(templatedf.schema)
outputdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",3765750.22,5137245325.26,4998609024.8484,5256155659.484106,25.488796,36162.645337,0.050474,146189
"""N""","""F""",96647.51,141523067.03,117764517.7881,138186185.43609,25.254704,34378.060165,0.048741,3852
"""N""","""O""",7719818.53,11074547815.71,9707836156.3045,10346463099.438492,25.548613,36072.65591,0.049982,294007
"""R""","""F""",3777723.38,5237189609.88,5005372932.5156,5376563959.094568,25.624932,36145.430526,0.049913,147113


In [12]:
# Pack the contents of OUTPUT_DIR into a zip archive named after the
# directory; the archive path is the cell's displayed result.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/pac-duckdb-q1-step3.zip'