# bootstrap_puzzles_04_aggregate_export

Groups the `gold.word_probabilities` table by `letter_set`, collecting all words for each letter set into an array.

This allows faster lookup of words by letter_set for inference (solving).

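Exports the aggregated table as a JSON file. The file is tagged `.latest.json` when its puzzle date is not earlier than that of the currently tagged file in the exports dir (which then loses the tag); otherwise it is saved as a plain `.json` file.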

In [None]:
%run "./00_setup.ipynb"

In [None]:
from pyspark.sql import functions as F
import json
from pathlib import Path
from datetime import datetime

from src.fileutils import get_local_path, get_all_files
from src.constants import EXPORT_PATH, DATE_FORMAT

In [None]:
# Notebook parameters.
# TODO: Parameterize
# Database and table holding the per-word probabilities to export.
_SOURCE_DB_NAME = "gold"
_SOURCE_TABLE_NAME = "word_probabilities"
# Date of this export. Later cells parse it with DATE_FORMAT to compare against
# the current `.latest.json` file and embed it in the output file name, so a
# None value is not supported as written.
_PUZZLE_DATE = "2025-06-23" # could be None for bootstrap?

In [None]:
# Load every column of the source table as a DataFrame.
df = spark.table(f"{_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME}")

In [None]:
# Quick sanity check of the loaded table: total row count plus a few rows
# per `source` value.
record_count = df.count()
print(f"Loaded {record_count} records from {_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME}")

source_col = F.col("source")
probability_col = F.col("probability")
model_df = df.filter(source_col == "model")

print("Labeled samples:")
df.filter(source_col == "truth").show(5)

# Highest-probability model rows first.
print("Predicted positive samples:")
model_df.orderBy(probability_col.desc()).show(5)

# Lowest-probability model rows first.
print("Predicted negative samples:")
model_df.orderBy(probability_col.asc()).show(5)

In [None]:
# Collapse the table to one row per letter_set. The `data` column holds an
# array with one {word, probability, source} struct per source row.
word_entry = F.struct("word", "probability", "source")
result_df = df.groupBy("letter_set").agg(F.collect_list(word_entry).alias("data"))

In [None]:
# Bring the aggregated rows to the driver (this should easily fit in memory)
# and build a plain mapping: letter_set -> list of per-word dicts.
collected = result_df.collect()

json_dict = {}
for row in collected:
    json_dict[row.letter_set] = [item.asDict() for item in row.data]

print(f"Converted to dictionary with {len(json_dict)} keys")

# Show the first five entries as a spot check.
preview = list(json_dict.items())[:5]
for letter_set, data in preview:
    print(f"{letter_set}: {data}")

In [None]:
# Resolve the export folder and create it (with any missing parents) if needed.
export_dir = Path(get_local_path(EXPORT_PATH))
export_dir.mkdir(parents=True, exist_ok=True)

# Look up the export(s) currently tagged `.latest.json`. This cell only finds
# them; the tag is removed from the old file in the next cell if required.
# NOTE(review): presumably get_all_files filters EXPORT_PATH by the given
# suffixes and returns file paths -- confirm in src.fileutils.
latest_data = get_all_files(EXPORT_PATH, [".latest.json"])


In [None]:
# Determine how to save the export (as latest file or not)
def parse_file_date(file_path: str) -> datetime:
    """Extract the date encoded at the end of an export file name.

    The last two suffixes are stripped from the file name (e.g. `.json`,
    then `.latest`), and the text after the final underscore is parsed
    with `DATE_FORMAT`.

    Args:
        file_path: Path to an export file such as `data_<date>.latest.json`.

    Returns:
        The parsed date as a `datetime`.

    Raises:
        ValueError: If the trailing segment does not match `DATE_FORMAT`.
    """
    without_extension = Path(file_path).stem      # drops .json
    without_tag = Path(without_extension).stem    # drops .latest, if present
    date_part = without_tag.rsplit("_", 1)[-1]
    return datetime.strptime(date_part, DATE_FORMAT)

# Decide whether this export becomes the new `.latest.json` file. If it does,
# the previously tagged file (if any) loses its `.latest` tag first.
save_as_latest = False

# At most one tagged file may exist; more than one means the exports folder is
# in an inconsistent state and must be cleaned up by hand.
if len(latest_data) > 1:
    raise Exception(f"There should be only one .latest.json file, but found {len(latest_data)} files.")

if latest_data:
    # Compare this export's date with the date of the currently tagged file.
    old_file_path = Path(latest_data[0])
    latest_file_date = parse_file_date(latest_data[0])
    current_date = datetime.strptime(_PUZZLE_DATE, DATE_FORMAT)

    if current_date < latest_file_date:
        # An older puzzle date is being exported: keep the existing tag.
        print(f"current date {current_date.strftime(DATE_FORMAT)} is earlier than latest file date {latest_file_date.strftime(DATE_FORMAT)}")
    else:
        save_as_latest = True
        print(f"current date {current_date.strftime(DATE_FORMAT)} is most recent.")
        print(f"removing latest tag from data_{latest_file_date.strftime(DATE_FORMAT)}.latest.json")
        # Strip the tag from the file name only, so a parent directory whose
        # name happens to contain ".latest" is left untouched.
        # NOTE(review): when both dates are equal the old file is still renamed
        # to the untagged name before the new one is written -- confirm that a
        # re-run for the same date should keep both files.
        new_path = old_file_path.with_name(old_file_path.name.replace(".latest", ""))
        old_file_path.rename(new_path)
else:
    # Nothing is tagged yet (e.g. first run), so this export becomes the latest.
    save_as_latest = True
    print(f"No `.latest.json` files found in {get_local_path(EXPORT_PATH)}.")

In [None]:
# Build the output path: `data_<date>.latest.json` when this export is the
# newest one, plain `data_<date>.json` otherwise.
base_file_name = f"data_{_PUZZLE_DATE}"

if save_as_latest:
    file_suffix, label = ".latest.json", "latest"
else:
    file_suffix, label = ".json", "non-latest"

export_path = export_dir / f"{base_file_name}{file_suffix}"
print(f"Saving as {label} file: {export_path}")

# Compact separators drop all optional whitespace (cuts file size almost in
# half); sorted keys keep the output stable between runs.
with export_path.open("w") as out_file:
    json.dump(json_dict, out_file, sort_keys=True, separators=(',', ':'))

print(f"✅ File saved as {export_path}")