In [None]:
#!/usr/bin/env python
# coding: utf-8

import json
import os
import pickle
import shutil
import zipfile

import parse
import polars as pl

from timer import Timer

In [None]:
# Run parameters for this step.
# The empty defaults below are presumably overridden from outside when the
# notebook is executed with parameters -- TODO confirm how they are injected.
#
# Example values, kept for reference:
#EXPERIMENT = "asdf"
#INPUT_ZIP = f"./outputs/{EXPERIMENT}-step2.zip"
#OUTPUT_DIR = f"./outputs/{EXPERIMENT}-step3"
#INDEX_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/INDEX_COLS.pkl', 'rb'))
#OUTPUT_COLS = pickle.load(open(f'./outputs/{EXPERIMENT}-step1/OUTPUT_COLS.pkl', 'rb'))
#templatedf_path = f"./outputs/{EXPERIMENT}-step2/templatedf.csv"
# NOTE(review): the example templatedf_path ends in ".csv", but the file is
# read with pickle.load further down -- confirm the actual file format.

# Experiment name; passed to the Timer.
EXPERIMENT = ""
# Zip archive whose "<n>.json" members are loaded as the input data.
INPUT_ZIP = ""
# Directory that receives output.json; it is zipped at the end of the notebook.
OUTPUT_DIR = ""
# Columns identifying a row; when left empty, ["rank"] is used instead.
INDEX_COLS = []
# Columns of the template dataframe that are filled in from the JSON entries.
OUTPUT_COLS = []
# Path of the pickled template dataframe.
templatedf_path = ""

In [None]:
# Timer setup: every timed section below is recorded through this object.
timer = Timer(experiment=EXPERIMENT, step="step3", output_dir="./times")

# Ensure the output directory exists.
# exist_ok=True makes this a single call, avoiding the race between checking
# for the directory and creating it, and keeps re-running the cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# Fall back to the default index column when the caller supplied none.
# The flag is remembered so the default column can be dropped from the output later.
USING_DEFAULT_INDEX_COLS = len(INDEX_COLS) == 0
if USING_DEFAULT_INDEX_COLS:
    print("No index cols provided, using default index cols")
    INDEX_COLS = ["rank"]

In [None]:
# Load the pickled template dataframe saved by an earlier step.
# The file is opened in a `with` block so the handle is closed as soon as the
# load finishes (the bare open() previously left it to the garbage collector).
# SECURITY: pickle.load can execute arbitrary code; only point templatedf_path
# at files produced by this pipeline.
timer.start("load_templatedf")
with open(templatedf_path, 'rb') as templatedf_file:
    templatedf: pl.DataFrame = pickle.load(templatedf_file)
timer.end()

In [None]:
# Load the JSON input files from step 2 out of INPUT_ZIP.
# Every archive member named "<n>.json" is parsed and stored in `alldata`
# under the integer n. The cells below read these keys from each entry:
# {'col': column name as string,
#  'row': group-by column values as dict (effectively a row id),
#  'value': the payload for that (row, col) cell
#           (previously described as "1000x values" -- TODO confirm) }
timer.start("load_all_json_data")
alldata = {}

pattern = parse.compile("{n}.json")

with zipfile.ZipFile(INPUT_ZIP, 'r') as zf:
    for filename in zf.namelist():
        if not filename.endswith('.json'):
            continue
        # int() raises ValueError for a member whose stem is not a number.
        filenumber = int(pattern.parse(filename).named['n'])
        with zf.open(filename) as f:
            alldata[filenumber] = json.load(f)
timer.end()

In [None]:
# Make sure the data types of the row fields are correct: each entry's 'row'
# dict is cast to the dtypes that the index columns have in templatedf.
# NOTE: the "determine_cell_locations" timer section started here is ended
# in a later cell.
timer.start("determine_cell_locations")
# The target schema is the same for every entry, so compute it once.
index_schema = templatedf.select(INDEX_COLS).schema
# Iterate over the stored entries directly rather than over range(len(...)):
# the keys are the file numbers, which need not be exactly 0..n-1.
for adentry in alldata.values():
    adentry['row'] = pl.DataFrame(adentry['row']).cast(index_schema).to_dicts()[0]

In [None]:
# The index-column part of the template dataframe: one row per output row.
# Later cells use it to map row ids to row positions and to rebuild the table.
allgroups = templatedf.select(INDEX_COLS)
# Display the frame as the cell output.
allgroups

In [None]:
# Re-identify where every loaded entry belongs: which row, which column, and
# what the labels should be on everything.

# Column name -> position of that column in templatedf.
colidxes = {}
for col in OUTPUT_COLS:
    colidxes[col] = templatedf.get_column_index(col)

# Row id (tuple of index-column values) -> position of that row in allgroups.
# A single enumerated pass replaces filtering the whole frame once per row.
# Duplicate row ids are still rejected with a ValueError, as the previous
# `.item()` lookup did. Row ids containing nulls are now accepted; the old
# equality filter could not match them.
rowidxes = {}
for rowidx, row in enumerate(allgroups.iter_rows(named=True)):
    rowid = tuple(row.values())
    if rowid in rowidxes:
        raise ValueError(
            f"Duplicate row id {rowid!r} in index columns {INDEX_COLS!r}; "
            "cannot determine a unique row position"
        )
    rowidxes[rowid] = rowidx
    print(row)

# One record per loaded JSON entry, annotated with its target coordinates.
# A KeyError here means an entry names a column outside OUTPUT_COLS or a row
# id that is not present in the template.
allinfo = [
    {"colname": adentry['col'],
     "rowid": tuple(adentry['row'].values()),
     "value": adentry['value'],
     "colidx": colidxes[adentry['col']],
     "rowidx": rowidxes[tuple(adentry['row'].values())]}
    for adentry in alldata.values()
]

# Reverse lookups: position -> label, for the columns and rows that were seen.
colnames = {}
rownames = {}
for entry in allinfo:
    colnames[entry['colidx']] = entry['colname']
    rownames[entry['rowidx']] = entry['rowid']
# Ends the "determine_cell_locations" section started in an earlier cell.
timer.end()

Naive reconstruction

In [None]:
# Naive reconstruction based on the indices of the keys in the templatedf.
# NOTE: the "reconstruct_dataframe" timer section started here is ended in
# the next cell, after the dataframe has been built.
timer.start("reconstruct_dataframe")
allcols = INDEX_COLS + OUTPUT_COLS
# Row ids as plain Python tuples. rows() yields the same Python values that
# iter_rows() produced for the keys of `rowidxes`, so the lookups below cannot
# miss because of a numpy round-trip changing a value's type.
allrows = allgroups.rows()

# (row position, column position) -> value, for every loaded JSON entry.
allinfo2 = {
    (rowidxes[tuple(adentry['row'].values())], colidxes[adentry['col']]): adentry['value']
    for adentry in alldata.values()
}

# Build the table row by row: index values first, then one cell per output
# column; cells for which no entry was loaded stay None.
df2 = []
print(allcols)
for row in allrows:
    rowidx = rowidxes[row]
    # Computed once and reused for both the log line and the stored row.
    rowvalues = [allinfo2.get((rowidx, colidxes[col])) for col in OUTPUT_COLS]
    print(row, rowvalues)
    df2.append([*row, *rowvalues])

In [None]:
# Turn the reconstructed rows into a polars DataFrame with the expected columns.
df = pl.DataFrame(data=df2, schema=allcols, orient='row')

# The default index column was only a stand-in when none was supplied,
# so it is dropped from the output in that case.
df = df.select(OUTPUT_COLS) if USING_DEFAULT_INDEX_COLS else df

# Ends the "reconstruct_dataframe" section started in the previous cell.
timer.end()
df

In [None]:
# Serialize the reconstructed dataframe to output.json inside OUTPUT_DIR.
timer.start("write_output_json")
output_json_path = os.path.join(OUTPUT_DIR, 'output.json')
df.write_json(output_json_path)
timer.end()

In [None]:
# Pack the contents of OUTPUT_DIR into a zip archive named after the directory.
timer.start("zip_output")
shutil.make_archive(base_name=OUTPUT_DIR, format='zip', root_dir=OUTPUT_DIR)
timer.end()