In [45]:
#!/usr/bin/env python
# coding: utf-8

# Experiment name; it is only used here to derive the output directory.
EXPERIMENT = 'pac-duckdb-q1-part3'
OUTPUT_DIR = f'./outputs/{EXPERIMENT}'
# Per the messages printed below, GENERATE selects between generating new samples
# and loading saved output from files.
GENERATE = False
# NOTE(review): not referenced by any cell visible in this notebook -- confirm it is still needed.
USE_EVEN_NUMBER_OF_INPUT_ROWS = False

if GENERATE:
    print("GENERATE = True, so we will generate new samples.")
else:
    print("GENERATE = False, so we will load saved output from files rather than recomputing.")

import os
from typing import List
# exist_ok=True creates the directory (and any missing parents) in one call, without the
# window between a separate existence check and the creation.
os.makedirs(OUTPUT_DIR, exist_ok=True)

import numpy as np
import pickle

import duckdb
import polars as pl
import pyarrow as pa

GENERATE = False, so we will load saved output from files rather than recomputing.


In [63]:
# Locations of the inputs this notebook reads: a zip archive of numbered JSON files and
# two pickles (template frame and reverse map) stored under the pac-duckdb-q1 outputs.
# ZIPFILE is a plain literal: the original f-string had no placeholders to fill in.
ZIPFILE = './outputs/e2e-q1.zip'
TEMPLATE = './outputs/pac-duckdb-q1/template.pkl'
REVERSEMAP = './outputs/pac-duckdb-q1/reverse_map.pkl'

import zipfile
import numpy as np
import pickle
import json
import io
import parse

In [68]:
# Load the saved column-name collections: INDEX_COLS is later used to select the
# key columns of the template, OUTPUT_COLS to look up output column positions.
# The context managers close each file as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open('./outputs/pac-duckdb-q1/INDEX_COLS.pkl', 'rb') as index_cols_file:
    INDEX_COLS = pickle.load(index_cols_file)
with open('./outputs/pac-duckdb-q1/OUTPUT_COLS.pkl', 'rb') as output_cols_file:
    OUTPUT_COLS = pickle.load(output_cols_file)

In [47]:
# Load the JSON input files from the archive, keyed by the number in each file name.
# Each file holds one dict of the form
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': list of values (only element 0 is read later in this notebook) }
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(ZIPFILE, 'r') as zf:
    for filename in zf.namelist():
        if filename.endswith('.json'):
            with zf.open(filename) as f:
                # Parse only the final path component so members stored inside a
                # directory of the archive (e.g. "dir/3.json") still yield their number.
                filenumber = int(pattern.parse(os.path.basename(filename)).named['n'])
                alldata[filenumber] = json.load(f)
alldata.keys()

dict_keys([20, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 24, 25, 13, 29, 3, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9])

In [48]:
# Display the entry loaded from file number 0 to inspect the record layout.
alldata[0]

{'col': 'sum_qty',
 'row': {'l_returnflag': 'A', 'l_linestatus': 'F'},
 'value': [3857488.9962900397]}

In [72]:
# Load the pickled reverse map and show its keys; the context manager closes the
# file as soon as it has been read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(REVERSEMAP, 'rb') as reversemap_file:
    reversemap = pickle.load(reversemap_file)
reversemap.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31])

In [65]:
# Load the pickled template frame; later cells read its column positions and key rows
# and fill in its output columns. The context manager closes the file once it is read.
# NOTE(review): pickle.load can execute arbitrary code -- only load files you produced yourself.
with open(TEMPLATE, 'rb') as template_file:
    templatedf: pl.DataFrame = pickle.load(template_file)
templatedf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,"decimal[38,2]","decimal[38,2]","decimal[38,4]","decimal[38,6]",f64,f64,f64,i64
"""A""","""F""",,,,,,,,
"""N""","""F""",,,,,,,,
"""N""","""O""",,,,,,,,
"""R""","""F""",,,,,,,,


In [None]:
# Position of every output column inside the template frame.
colidxes = {name: templatedf.get_column_index(name) for name in OUTPUT_COLS}

# Number each distinct combination of index-column values in order of first appearance
# in the template; setdefault leaves a repeated combination at its first number.
rowidxes = {}
for key_values in templatedf.select(INDEX_COLS).iter_rows():
    rowidxes.setdefault(tuple(key_values), len(rowidxes))
colidxes, rowidxes

({'sum_qty': 2,
  'sum_base_price': 3,
  'sum_disc_price': 4,
  'sum_charge': 5,
  'avg_qty': 6,
  'avg_price': 7,
  'avg_disc': 8,
  'count_order': 9},
 {('A', 'F'): 0, ('N', 'F'): 1, ('N', 'O'): 2, ('R', 'F'): 3})

In [108]:
# Flatten every loaded entry into one record carrying both the names (column name,
# row key) and the matching positions in the template frame.
allinfo = []
for adentry in alldata.values():
    # Build the row key in INDEX_COLS order -- the same order rowidxes was keyed in --
    # rather than relying on the key order of the JSON 'row' dict.
    rowid = tuple(adentry['row'][name] for name in INDEX_COLS)
    allinfo.append({
        "colname": adentry['col'],
        "rowid": rowid,
        # Only the first element of the 'value' list is used.
        "value": adentry['value'][0],
        "colidx": colidxes[adentry['col']],
        "rowidx": rowidxes[rowid],
    })
allinfo

[{'colname': 'avg_price',
  'rowid': ('A', 'F'),
  'value': 35911.862323141315,
  'colidx': 7,
  'rowidx': 0},
 {'colname': 'avg_qty',
  'rowid': ('A', 'F'),
  'value': 25.46072926526221,
  'colidx': 6,
  'rowidx': 0},
 {'colname': 'sum_base_price',
  'rowid': ('N', 'O'),
  'value': 10710225854.755188,
  'colidx': 3,
  'rowidx': 2},
 {'colname': 'sum_base_price',
  'rowid': ('R', 'F'),
  'value': 5174102300.640319,
  'colidx': 3,
  'rowidx': 3},
 {'colname': 'avg_qty',
  'rowid': ('N', 'F'),
  'value': 25.05703707770849,
  'colidx': 6,
  'rowidx': 1},
 {'colname': 'avg_price',
  'rowid': ('N', 'F'),
  'value': 36249.930455476184,
  'colidx': 7,
  'rowidx': 1},
 {'colname': 'sum_disc_price',
  'rowid': ('N', 'O'),
  'value': 10224149786.93517,
  'colidx': 4,
  'rowidx': 2},
 {'colname': 'sum_qty',
  'rowid': ('A', 'F'),
  'value': 3857488.9962900397,
  'colidx': 2,
  'rowidx': 0},
 {'colname': 'avg_disc',
  'rowid': ('N', 'O'),
  'value': 0.04999175381697957,
  'colidx': 8,
  'rowidx': 

In [173]:
# Invert the lookups carried by the records: column position -> column name and
# row position -> row key. If a position occurs more than once, the last record wins.
colnames = {info['colidx']: info['colname'] for info in allinfo}
rownames = {info['rowidx']: info['rowid'] for info in allinfo}
colnames, rownames

({2: 'sum_qty',
  3: 'sum_base_price',
  4: 'sum_disc_price',
  5: 'sum_charge',
  6: 'avg_qty',
  7: 'avg_price',
  8: 'avg_disc',
  9: 'count_order'},
 {0: ('A', 'F'), 1: ('N', 'F'), 2: ('N', 'O'), 3: ('R', 'F')})

In [110]:
# Display the records ordered by column position, then by row position
# (allinfo itself is left in its original order).
sorted(
    allinfo,
    key=lambda info: (info['colidx'], info['rowidx']),
)

[{'colname': 'sum_qty',
  'rowid': ('A', 'F'),
  'value': 3857488.9962900397,
  'colidx': 2,
  'rowidx': 0},
 {'colname': 'sum_qty',
  'rowid': ('N', 'F'),
  'value': 100052.37694728826,
  'colidx': 2,
  'rowidx': 1},
 {'colname': 'sum_qty',
  'rowid': ('N', 'O'),
  'value': 7312681.859289101,
  'colidx': 2,
  'rowidx': 2},
 {'colname': 'sum_qty',
  'rowid': ('R', 'F'),
  'value': 3828609.7640175517,
  'colidx': 2,
  'rowidx': 3},
 {'colname': 'sum_base_price',
  'rowid': ('A', 'F'),
  'value': 5422314213.3975115,
  'colidx': 3,
  'rowidx': 0},
 {'colname': 'sum_base_price',
  'rowid': ('N', 'F'),
  'value': 136233014.10269928,
  'colidx': 3,
  'rowidx': 1},
 {'colname': 'sum_base_price',
  'rowid': ('N', 'O'),
  'value': 10710225854.755188,
  'colidx': 3,
  'rowidx': 2},
 {'colname': 'sum_base_price',
  'rowid': ('R', 'F'),
  'value': 5174102300.640319,
  'colidx': 3,
  'rowidx': 3},
 {'colname': 'sum_disc_price',
  'rowid': ('A', 'F'),
  'value': 5116636172.695285,
  'colidx': 4,
  '

In [171]:
# Reshape the flat records into a wide table: one row per rowidx and one column per
# colidx (the pivoted columns are named after the colidx values). Sorting first,
# together with maintain_order=True, keeps columns and rows in ascending position order.
numericdf = (
    pl.DataFrame(allinfo)
    .select('rowidx', 'colidx', 'value')
    .sort(['colidx', 'rowidx'])
    .pivot(on='colidx', index='rowidx', values='value', maintain_order=True)
)
numericdf

rowidx,2,3,4,5,6,7,8,9
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
1,100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
2,7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
3,3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [196]:
# Replace the positional column labels produced by the pivot ("2", "3", ...) with the
# real column names. The mapping is derived from colnames rather than a hard-coded
# range, so it follows whatever output columns the template actually has.
namedcolsdf = numericdf.rename(
    {str(colidx): colname for colidx, colname in colnames.items()}
)
namedcolsdf

rowidx,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
i64,f64,f64,f64,f64,f64,f64,f64,f64
0,3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
1,100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
2,7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
3,3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [197]:
# Fill the template with the reconstructed values. with_columns replaces template
# columns by name but pairs rows purely by position, so order the values by rowidx
# (the numbering taken from the template's row order) before dropping that helper column.
# The replaced columns take the dtype of the incoming values, not the template's dtype.
outputdf = templatedf.with_columns(
    namedcolsdf.sort('rowidx').select(pl.all().exclude('rowidx'))
)
outputdf

l_returnflag,l_linestatus,sum_qty,sum_base_price,sum_disc_price,sum_charge,avg_qty,avg_price,avg_disc,count_order
str,str,f64,f64,f64,f64,f64,f64,f64,f64
"""A""","""F""",3857500.0,5422300000.0,5116600000.0,5045900000.0,25.460729,35911.862323,0.05022,142749.247304
"""N""","""F""",100052.376947,136230000.0,122810000.0,126850000.0,25.057037,36249.930455,0.049958,4100.965872
"""N""","""O""",7312700.0,10710000000.0,10224000000.0,10062000000.0,25.636385,36162.96519,0.049992,288137.822115
"""R""","""F""",3828600.0,5174100000.0,4914400000.0,5255100000.0,25.500825,35911.562107,0.050129,143037.056184


In [199]:
# Save the filled-in table as CSV inside this experiment's output directory.
output_csv_path = os.path.join(OUTPUT_DIR, 'output.csv')
outputdf.write_csv(output_csv_path)