In [1]:
#!/usr/bin/env python
# coding: utf-8

# Experiment configuration: the experiment name determines the output directory.
EXPERIMENT = 'pac-duckdb-q1-part3'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
# GENERATE selects between generating new samples and loading saved output
# (see the banner printed below).
GENERATE = False
# NOTE(review): not referenced by any of the cells shown here — confirm it is still needed.
USE_EVEN_NUMBER_OF_INPUT_ROWS = False

# Announce which mode this run is in.
print(
    "GENERATE = True, so we will generate new samples."
    if GENERATE
    else "GENERATE = False, so we will load saved output from files rather than recomputing."
)

import os
from typing import List
# Create the output directory (and any missing parents). exist_ok=True makes this
# safe to re-run and avoids the check-then-create race of a separate exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import numpy as np
import pickle

import duckdb
import polars as pl
import pyarrow as pa

GENERATE = False, so we will load saved output from files rather than recomputing.


In [None]:
# Locations of the artifacts produced by the previous steps:
#   ZIPFILE    - zip archive containing the numbered .json result files
#   TEMPLATE   - pickled template table
#   REVERSEMAP - pickled reverse map
# (ZIPFILE is a plain string literal: it has no placeholders, so no f-prefix is needed.)
ZIPFILE = './outputs/e2e-q1.zip'
TEMPLATE = './outputs/pac-duckdb-q1/template.pkl'
REVERSEMAP = './outputs/pac-duckdb-q1/reverse_map.pkl'

import zipfile
import numpy as np
import pickle
import json
import io
import parse

In [None]:
# Import saved variables from the first step.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled (the bare open(...) calls previously leaked handles).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project itself.
with open('./outputs/pac-duckdb-q1/INDEX_COLS.pkl', 'rb') as fh:
    INDEX_COLS = pickle.load(fh)
with open('./outputs/pac-duckdb-q1/OUTPUT_COLS.pkl', 'rb') as fh:
    OUTPUT_COLS = pickle.load(fh)
with open(REVERSEMAP, 'rb') as fh:
    reversemap = pickle.load(fh)
with open(TEMPLATE, 'rb') as fh:
    templatedf: pl.DataFrame = pickle.load(fh)


In [4]:
# Load the JSON input files from the zip archive, keyed by the number in the file name.
# Each JSON document is used downstream as:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': [ list of values; described in the original note as 1000x values ] }
alldata = {}

# Matches "<n>.json" and captures <n>.
pattern = parse.compile("{n}.json")

with zipfile.ZipFile(ZIPFILE, 'r') as archive:
    for member in archive.namelist():
        # Skip anything in the archive that is not a JSON file.
        if not member.endswith('.json'):
            continue
        with archive.open(member) as fh:
            filenumber = int(pattern.parse(member).named['n'])
            alldata[filenumber] = json.load(fh)
alldata.keys()

dict_keys([20, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 24, 25, 13, 29, 3, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9])

In [None]:
# Cursed data shuffling to reidentify what row goes where, what column goes where,
# and what the labels should be on everything.

# Output column name -> positional index of that column in the template table.
colidxes = {name: templatedf.get_column_index(name) for name in OUTPUT_COLS}

# Tuple of index-column values -> row index, numbered in order of first appearance
# in the template table.
rowidxes = {}
for key_values in templatedf.select(INDEX_COLS).iter_rows():
    rowidxes.setdefault(tuple(key_values), len(rowidxes))

# One record per loaded JSON entry, carrying both its labels and its numeric position.
allinfo = []
for adentry in alldata.values():
    # NOTE(review): this relies on the 'row' dict listing its values in the same
    # order as INDEX_COLS — confirm against the producer of the JSON files.
    rowid = tuple(adentry['row'].values())
    allinfo.append({
        "colname": adentry['col'],
        "rowid": rowid,
        # Only the first element of the entry's value list is used.
        "value": adentry['value'][0],
        "colidx": colidxes[adentry['col']],
        "rowidx": rowidxes[rowid],
    })

# Reverse lookups: numeric index -> label.
colnames = {info['colidx']: info['colname'] for info in allinfo}
rownames = {info['rowidx']: info['rowid'] for info in allinfo}

In [None]:
# Construct the correct shape of table using only numeric indices for rows and columns.
# Step 1: long format, one (rowidx, colidx, value) triple per record.
longdf = pl.DataFrame(allinfo).select('rowidx', 'colidx', 'value')
# Step 2: order by column then row, so the pivot below emits columns and rows in index order.
sortedlongdf = longdf.sort(by=['colidx', 'rowidx'])
# Step 3: wide format, one row per rowidx and one column per colidx.
numericdf = sortedlongdf.pivot(
    on='colidx',
    index='rowidx',
    values='value',
    maintain_order=True,
)
numericdf

rowidx,2,3,4,5,6,7,8,9
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
1,100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
2,7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
3,3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [None]:
# Add the actual column names: every pivoted column is labelled with its stringified
# column index, so copy each one under its real name and then drop the numeric-named originals.
output_colidxes = [colidxes[name] for name in OUTPUT_COLS]
numeric_labels = [str(idx) for idx in output_colidxes]
namedcolsdf = numericdf.with_columns(
    [pl.col(str(idx)).alias(colnames[idx]) for idx in output_colidxes]
).drop(numeric_labels)
namedcolsdf

rowidx,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
1,100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
2,7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
3,3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [None]:
# Use the column names to insert this data into the template table, overwriting the
# same-named columns that already exist there.
#
# The columns are attached purely by position, so first verify that the reconstructed
# rows are exactly the template's rows in template order (rowidx 0, 1, 2, ...).
# Without this check a missing or reordered row would silently put values on the wrong line.
expected_rowidx = list(range(templatedf.height))
actual_rowidx = namedcolsdf['rowidx'].to_list()
assert actual_rowidx == expected_rowidx, (
    f"reconstructed rows are not aligned with the template: "
    f"got rowidx {actual_rowidx}, expected {expected_rowidx}"
)

# (The former `.with_columns(pl.col(INDEX_COLS))` step re-selected columns already
# present in the template and changed nothing, so it has been removed.)
outputdf = templatedf.with_columns(
    namedcolsdf.select(pl.all().exclude('rowidx'))
)
outputdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,f64,f64,f64,f64,f64,f64,f64,f64
"""A""","""F""",3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
"""N""","""F""",100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
"""N""","""O""",7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
"""R""","""F""",3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [None]:
# Save the output table as output.csv inside the experiment's output directory.
output_csv_path = os.path.join(OUTPUT_DIR, 'output.csv')
outputdf.write_csv(output_csv_path)