In [1]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

In [2]:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
#templatedf_path = f"./outputs/{EXPERIMENT}-step2/templatedf.csv"
EXPERIMENT = ""
INPUT_ZIP = ""
OUTPUT_DIR = ""
INDEX_COLS = []
OUTPUT_COLS = []
templatedf_path = ""

In [3]:
# Parameters
EXPERIMENT = "ap-duckdb-q9-customer"
OUTPUT_DIR = "./outputs/ap-duckdb-q9-customer-step3"
INPUT_ZIP = "./outputs/ap-duckdb-q9-customer-step2.zip"
INDEX_COLS = ["nation", "o_year"]
OUTPUT_COLS = ["sum_profit"]
templatedf_path = "./outputs/ap-duckdb-q9-customer-step1/template.pkl"


In [4]:
INDEX_COLS = INDEX_COLS or [""] # if no index cols are provided, then we make one up with empty string name in step 1

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

In [5]:
# Import saved variables from the first step
templatedf: pl.DataFrame = pickle.load(open(templatedf_path, 'rb'))

In [6]:
# load the json input files from step 2
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'values': [ 1000x values ] }
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if filename.endswith('.json'):
            with zf.open(filename) as f:
                filenumber = int(pattern.parse(filename).named['n'])
                data = json.load(f)
                d: dict = data
                alldata[filenumber] = d
alldata.keys()

dict_keys([115, 142, 154, 103, 20, 98, 77, 139, 61, 36, 119, 41, 16, 158, 57, 174, 94, 123, 82, 6, 135, 162, 163, 7, 83, 134, 95, 122, 56, 159, 17, 118, 40, 37, 138, 60, 99, 76, 21, 102, 155, 143, 114, 47, 148, 10, 109, 51, 172, 92, 125, 84, 0, 133, 164, 113, 144, 152, 105, 26, 129, 71, 88, 67, 168, 30, 169, 31, 89, 66, 128, 70, 27, 104, 153, 145, 112, 165, 1, 85, 132, 93, 124, 173, 108, 50, 149, 11, 46, 166, 69, 131, 2, 86, 127, 90, 28, 170, 53, 12, 45, 32, 65, 73, 24, 107, 150, 146, 49, 111, 48, 110, 147, 151, 106, 25, 72, 64, 33, 44, 13, 52, 29, 171, 126, 91, 68, 130, 87, 3, 167, 34, 63, 8, 75, 22, 59, 101, 156, 18, 140, 117, 38, 160, 137, 4, 80, 79, 121, 96, 55, 14, 43, 42, 15, 54, 78, 120, 97, 136, 81, 5, 39, 161, 116, 19, 141, 157, 58, 100, 23, 74, 62, 9, 35])

In [7]:
# Make sure the data types of the row fields are correct
for i in range(len(alldata)):
    alldata[i]['row'] = pl.DataFrame(alldata[i]['row']).cast(templatedf.select(INDEX_COLS).schema).to_dicts()[0]

In [8]:
allgroups = templatedf.select(INDEX_COLS)
allgroups

nation,o_year
str,i64
"""ALGERIA""",1998
"""ALGERIA""",1997
"""ALGERIA""",1996
"""ALGERIA""",1995
"""ALGERIA""",1994
…,…
"""VIETNAM""",1996
"""VIETNAM""",1995
"""VIETNAM""",1994
"""VIETNAM""",1993


In [9]:
# Cursed data shuffling to reidentify what row goes where, what column goes where, and what the labels should be on everything
colidxes = {}
for col in OUTPUT_COLS:
    colidxes[col] = templatedf.get_column_index(col)
rowidxes = {}
for row in templatedf.select(INDEX_COLS).iter_rows(named=True):
    rowidxes[tuple(row.values())] = allgroups.with_row_index().filter(
        pl.col(k).eq(v)
        for k, v in row.items()
    ).select("index").item()
    print(row)

allinfo = [
    {"colname": adentry['col'],
    "rowid": tuple(adentry['row'].values()),
    "value": adentry['value'][0],
    "colidx": colidxes[adentry['col']],
    "rowidx": rowidxes[tuple(adentry['row'].values())]}
    for adidx, adentry in alldata.items()
]

colnames = {}
rownames = {}
for entry in allinfo:
    colnames[entry['colidx']] = entry['colname']
    rownames[entry['rowidx']] = entry['rowid']

{'nation': 'ALGERIA', 'o_year': 1998}
{'nation': 'ALGERIA', 'o_year': 1997}
{'nation': 'ALGERIA', 'o_year': 1996}
{'nation': 'ALGERIA', 'o_year': 1995}
{'nation': 'ALGERIA', 'o_year': 1994}
{'nation': 'ALGERIA', 'o_year': 1993}
{'nation': 'ALGERIA', 'o_year': 1992}
{'nation': 'ARGENTINA', 'o_year': 1998}
{'nation': 'ARGENTINA', 'o_year': 1997}
{'nation': 'ARGENTINA', 'o_year': 1996}
{'nation': 'ARGENTINA', 'o_year': 1995}
{'nation': 'ARGENTINA', 'o_year': 1994}
{'nation': 'ARGENTINA', 'o_year': 1993}
{'nation': 'ARGENTINA', 'o_year': 1992}
{'nation': 'BRAZIL', 'o_year': 1998}
{'nation': 'BRAZIL', 'o_year': 1997}
{'nation': 'BRAZIL', 'o_year': 1996}
{'nation': 'BRAZIL', 'o_year': 1995}
{'nation': 'BRAZIL', 'o_year': 1994}
{'nation': 'BRAZIL', 'o_year': 1993}
{'nation': 'BRAZIL', 'o_year': 1992}
{'nation': 'CANADA', 'o_year': 1998}
{'nation': 'CANADA', 'o_year': 1997}
{'nation': 'CANADA', 'o_year': 1996}
{'nation': 'CANADA', 'o_year': 1995}
{'nation': 'CANADA', 'o_year': 1994}
{'nation':

Naive reconstruction

In [10]:
# Naive reconstruction based on the indices of the keys in the templatedf
allcols = INDEX_COLS + OUTPUT_COLS
allrows = allgroups.select(INDEX_COLS).to_numpy().tolist()

allinfo2 = {
    (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']]): adentry['value'][0]
    for adidx, adentry in alldata.items()
}

df2 = []
print(allcols)
for row in allrows:
    print(row + [allinfo2.get((rowidxes[tuple(row)], colidxes[col]), None) for col in OUTPUT_COLS])
    df2.append(row + [allinfo2.get((rowidxes[tuple(row)], colidxes[col]), None) for col in OUTPUT_COLS])

['nation', 'o_year', 'sum_profit']
['ALGERIA', 1998, 786167.5160316498]
['ALGERIA', 1997, 1792640.498654827]
['ALGERIA', 1996, 1771126.4985583997]
['ALGERIA', 1995, 2335994.3030150593]
['ALGERIA', 1994, 1701953.5640502202]
['ALGERIA', 1993, 1907427.5977554666]
['ALGERIA', 1992, 1909404.2500511943]
['ARGENTINA', 1998, 1261378.3868255268]
['ARGENTINA', 1997, 2612503.293244797]
['ARGENTINA', 1996, 3026761.1213749573]
['ARGENTINA', 1995, 2492367.845750657]
['ARGENTINA', 1994, 1172221.0907070204]
['ARGENTINA', 1993, 2108785.2284152796]
['ARGENTINA', 1992, 2571396.2680789977]
['BRAZIL', 1998, 1718074.471984651]
['BRAZIL', 1997, 2269677.7492079325]
['BRAZIL', 1996, 2105743.713220573]
['BRAZIL', 1995, 1587289.7301208787]
['BRAZIL', 1994, 2967580.953208946]
['BRAZIL', 1993, 1991285.4071051974]
['BRAZIL', 1992, 1983421.7926812293]
['CANADA', 1998, 869294.2054759802]
['CANADA', 1997, 1291879.6468158094]
['CANADA', 1996, 2277738.1291544875]
['CANADA', 1995, 2017243.905785383]
['CANADA', 1994, 2296

In [11]:
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema)

nation,o_year,sum_profit
str,i64,"decimal[38,4]"
"""ALGERIA""",1998,786167.5160
"""ALGERIA""",1997,1792640.4986
"""ALGERIA""",1996,1771126.4985
"""ALGERIA""",1995,2335994.3030
"""ALGERIA""",1994,1701953.5640
…,…,…
"""VIETNAM""",1996,1651122.7410
"""VIETNAM""",1995,1290217.1503
"""VIETNAM""",1994,2609674.6842
"""VIETNAM""",1993,1671397.8171


In [12]:
pl.DataFrame(df2, schema=allcols, orient='row').cast(templatedf.schema).write_csv(os.path.join(OUTPUT_DIR, 'output.csv'))

In [13]:
# zip the OUTPUT_DIR
shutil.make_archive(OUTPUT_DIR, 'zip', OUTPUT_DIR)

'/Users/michael/projects/dpdb/pacdb/outputs/ap-duckdb-q9-customer-step3.zip'