In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
# Placeholder defaults for the notebook's parameters. Every name below is
# reassigned by the "# Parameters" cell that follows (presumably injected by a
# papermill-style runner — TODO confirm).
#
# Example values for a manual run:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
# templatedf_path must point at a pickle file: it is read with pickle.load below.
#templatedf_path = f"./outputs/{EXPERIMENT}-step1/template.pkl"
EXPERIMENT = ""
INPUT_ZIP = ""
OUTPUT_DIR = ""
INDEX_COLS = []
OUTPUT_COLS = []
templatedf_path = ""

In [3]:
# Parameters
# Concrete values for this run; they replace the placeholder defaults assigned
# in the previous cell.
EXPERIMENT = "ap-duckdb-q13-customer"
OUTPUT_DIR = "./outputs/ap-duckdb-q13-customer-step3"
INPUT_ZIP = "./outputs/ap-duckdb-q13-customer-step2.zip"
INDEX_COLS = ["c_count"]
OUTPUT_COLS = ["custdist"]
templatedf_path = "./outputs/ap-duckdb-q13-customer-step1/template.pkl"


In [4]:
# If no index cols are provided, step 1 makes one up with an empty-string
# name, so fall back to that same single column here.
INDEX_COLS = INDEX_COLS or [""]

# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [5]:
# Import saved variables from the first step.
# NOTE(security): pickle.load can execute arbitrary code while loading, so
# templatedf_path must only ever point at a file this pipeline wrote itself.
# The context manager closes the file handle, which the bare open() call
# used previously never did.
with open(templatedf_path, 'rb') as template_file:
    templatedf: pl.DataFrame = pickle.load(template_file)

In [6]:
# Load the JSON input files from step 2 into alldata, keyed by the integer in
# each archive member's file name ("<n>.json"). Each file holds one dict:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': [ list of values; later cells read only element 0 ] }
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if not filename.endswith('.json'):
            continue
        # Match on the base name so a member stored under a directory prefix
        # inside the archive is still numbered correctly.
        match = pattern.parse(os.path.basename(filename))
        if match is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(f"unexpected JSON member name in {INPUT_ZIP}: {filename!r}")
        with zf.open(filename) as f:
            alldata[int(match.named['n'])] = json.load(f)
alldata.keys()

dict_keys([20, 36, 16, 6, 7, 17, 21, 10, 0, 26, 30, 31, 27, 1, 11, 2, 28, 12, 32, 24, 25, 33, 13, 29, 3, 34, 8, 22, 18, 4, 14, 15, 5, 19, 23, 9, 35])

In [7]:
# Make sure the data types of the row fields are correct: cast every entry's
# 'row' dict to the dtypes of the template's index columns.
index_schema = templatedf.select(INDEX_COLS).schema  # loop-invariant, so build it once
# Walk the entries themselves rather than range(len(alldata)): the keys are
# file numbers, and a gap in the numbering would otherwise raise KeyError.
for entry in alldata.values():
    entry['row'] = pl.DataFrame(entry['row']).cast(index_schema).to_dicts()[0]

In [8]:
# The index columns of the template frame. Later cells use the row order of
# this frame to decide where each reconstructed value belongs.
allgroups = templatedf.select(INDEX_COLS)
allgroups

c_count
i64
0
11
10
9
8
…
32
34
36
35


In [9]:
# Data shuffling to re-identify what row goes where, what column goes where,
# and what the labels should be on everything.

# Output column name -> its column position in the template frame.
colidxes = {col: templatedf.get_column_index(col) for col in OUTPUT_COLS}

# Group key (tuple of index-column values) -> its row position in allgroups.
# A row's index is simply its position, so enumerate once instead of
# filtering the whole frame for every row (which was O(n^2) and could never
# match a key containing null).
rowidxes = {}
for rowidx, row in enumerate(allgroups.iter_rows(named=True)):
    key = tuple(row.values())
    if key in rowidxes:
        # Keep failing loudly on ambiguous keys, as the filter + .item() did.
        raise ValueError(f"duplicate group key in template: {row}")
    rowidxes[key] = rowidx
    print(row)


def _row_key(rowdict):
    """Return the group key of a step-2 'row' dict, in INDEX_COLS order.

    Building the tuple by column name keeps it aligned with the keys of
    rowidxes even if the JSON dict lists its fields in a different order.
    """
    return tuple(rowdict[col] for col in INDEX_COLS)


# One record per step-2 entry: its labels, its first value, and its position.
allinfo = [
    {"colname": adentry['col'],
     "rowid": _row_key(adentry['row']),
     "value": adentry['value'][0],
     "colidx": colidxes[adentry['col']],
     "rowidx": rowidxes[_row_key(adentry['row'])]}
    for adentry in alldata.values()
]

# Reverse lookups: position -> label.
colnames = {entry['colidx']: entry['colname'] for entry in allinfo}
rownames = {entry['rowidx']: entry['rowid'] for entry in allinfo}

{'c_count': 0}
{'c_count': 11}
{'c_count': 10}
{'c_count': 9}
{'c_count': 8}
{'c_count': 12}
{'c_count': 7}
{'c_count': 13}
{'c_count': 20}
{'c_count': 19}
{'c_count': 18}
{'c_count': 14}
{'c_count': 17}
{'c_count': 16}
{'c_count': 15}
{'c_count': 21}
{'c_count': 22}
{'c_count': 6}
{'c_count': 23}
{'c_count': 24}
{'c_count': 5}
{'c_count': 25}
{'c_count': 26}
{'c_count': 27}
{'c_count': 4}
{'c_count': 28}
{'c_count': 29}
{'c_count': 3}
{'c_count': 30}
{'c_count': 31}
{'c_count': 33}
{'c_count': 2}
{'c_count': 32}
{'c_count': 34}
{'c_count': 36}
{'c_count': 35}
{'c_count': 1}


Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
# Take native Python row tuples rather than to_numpy().tolist(): a NumPy
# round-trip can coerce mixed-dtype index columns to a common type and turn
# nulls into NaN, and such values would no longer match the keys of rowidxes
# (which are built from polars row values).
allrows = [list(row) for row in allgroups.select(INDEX_COLS).rows()]

# (row position, column position) -> first value of the matching step-2 entry.
# The row key is assembled by column name so it lines up with rowidxes even
# if the entry's 'row' dict lists its fields in a different order.
allinfo2 = {
    (rowidxes[tuple(adentry['row'][col] for col in INDEX_COLS)],
     colidxes[adentry['col']]): adentry['value'][0]
    for adentry in alldata.values()
}

df2 = []
print(allcols)
for row in allrows:
    # Build each output row once and reuse it for the log line and the result;
    # cells with no step-2 entry are filled with None.
    outrow = row + [allinfo2.get((rowidxes[tuple(row)], colidxes[col])) for col in OUTPUT_COLS]
    print(outrow)
    df2.append(outrow)

['c_count', 'custdist']
[0, 2417.483121303556]
[11, 347.77725249173966]
[10, 332.59308386379394]
[9, 346.8435083335815]
[8, 291.7196481534941]
[12, 301.00022899829696]
[7, 233.7701585486316]
[13, 234.72793437010594]
[20, 199.49988758033106]
[19, 231.38228744551455]
[18, 183.81995340899172]
[14, 235.6371198051787]
[17, 208.4371738162276]
[16, 195.8441283749067]
[15, 238.6706165838613]
[21, 200.73800571459782]
[22, 191.09187119215454]
[6, 170.89148444978187]
[23, 170.19427387045639]
[24, 156.09896304593371]
[5, 112.28351675876094]
[25, 114.15479540838882]
[26, 100.13316402498367]
[27, 69.81810910879021]
[4, 44.038027530518164]
[28, 50.996766938065015]
[29, 34.16727619837069]
[3, 18.750101445763566]
[30, 12.935605005910388]
[31, 7.946456339213468]
[33, 9.197615910838344]
[2, 7.686303069414674]
[32, 3.4087493802728717]
[34, 5.523401675578198]
[36, 0.20702345643320796]
[35, -1.4199527452072775]
[1, 1.67167422909908]


In [11]:
# Assemble the reconstructed rows into a frame and cast it to the template's
# dtypes for display.
# NOTE(review): in the recorded output the float values become i64 by
# truncation toward zero (347.78 -> 347, -1.42 -> -1), not by rounding —
# confirm that is the intended behaviour.
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema)

c_count,custdist
i64,i64
0,2417
11,347
10,332
9,346
8,291
…,…
32,3
34,5
36,0
35,-1


In [12]:
# Build the reconstructed frame again, cast it to the template's dtypes, and
# write it to output.csv inside OUTPUT_DIR.
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema).write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))

In [13]:
# Pack the contents of OUTPUT_DIR into "<OUTPUT_DIR>.zip"; the resulting
# archive path is the cell's displayed value.
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q13-customer-step3.zip'