# bootstrap_puzzles_04_aggregate_export

Groups the `gold.word_probabilities` table by `letter_set`, aggregating all words for each letter set into an array.

This allows faster lookup of words by letter_set for inference (solving).

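Exports the aggregated table as a JSON file saved in the `latest/` subdirectory of the exports dir. Any file already in `latest/` is first moved up into the exports dir, provided its date is not later than the current puzzle date.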

In [None]:
%run "./00_setup.ipynb"

In [None]:
from pyspark.sql import functions as F
import json
from pathlib import Path
from datetime import datetime
import time

from src.fileutils import get_local_path, get_all_files
from src.constants import EXPORT_PATH, DATE_FORMAT

In [None]:
# TODO: Parameterize
# Database and table holding the word probabilities that this notebook reads.
_SOURCE_DB_NAME = "gold"
_SOURCE_TABLE_NAME = "word_probabilities"
# Logical puzzle date of this export. It is embedded in the export file name and
# compared against the date of any file already present in /latest.
# NOTE(review): None is not handled as written - it would be formatted into the
# file name as "None" and would fail strptime when a previous latest file exists.
_PUZZLE_DATE = "2025-06-23" # could be None for bootstrap?

In [None]:
# Get the gold layer table of word probabilities.
# All columns are selected; later cells read `source`, `probability`, `word`,
# `last_seen_on` and `letter_set` from this DataFrame.
df = spark.sql(f"SELECT * FROM {_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME}")

In [None]:
# Sanity check: report the row count, then preview a few rows per category.
print(f"Loaded {df.count()} records from {_SOURCE_DB_NAME}.{_SOURCE_TABLE_NAME}")

# Rows whose `source` is "truth"
print("Labeled samples:")
df.where(F.col("source") == "truth").show(5)

# Rows whose `source` is "model", highest probability first
print("Predicted positive samples:")
df.where(F.col("source") == "model").orderBy(F.col("probability").desc()).show(5)

# Rows whose `source` is "model", lowest probability first
print("Predicted negative samples:")
df.where(F.col("source") == "model").orderBy("probability").show(5)

In [None]:
# Aggregate by letter_set: one output row per letter_set, whose `data` column
# collects a struct (word, probability, last_seen_on, letter_set) for every
# input row that shares that letter_set.
result_df = df.groupBy("letter_set").agg(
    F.collect_list(
        # Passing column names keeps each struct field named after its column
        F.struct("word", "probability", "last_seen_on", "letter_set")
    ).alias("data")
)

In [None]:
# Collect rows and convert to dictionary (this should easily fit in memory)

# (source field, exported key) pairs. The exported keys are deliberately short:
# this saves 2MB of space when converted to JSON, which makes for faster inference.
source_target_key_names = [
    ("word", "word"),
    ("probability", "prob"),
    ("last_seen_on", "date"),
    ("letter_set", "letters"),
]

def to_dict(item):
    """
    Convert one aggregated word record into a JSON-serializable dictionary.

    Every source field listed in `source_target_key_names` is read from `item`
    by key and stored under its shortened target key. A non-null `last_seen_on`
    value is rendered as a string with DATE_FORMAT (date objects can't be
    serialized to JSON); a null one is kept as None.
    """
    converted = {}
    for src_name, dst_name in source_target_key_names:
        value = item[src_name]
        if src_name == "last_seen_on" and value is not None:
            value = value.strftime(DATE_FORMAT)
        converted[dst_name] = value
    return converted

# Bring the aggregated rows to the driver, then key each converted word list
# by its letter_set.
collected = result_df.collect()
json_dict = {row.letter_set: list(map(to_dict, row.data)) for row in collected}


# Report the size and preview the first five entries
print(f"Converted to dictionary with {len(json_dict)} keys")
for letter_set in list(json_dict)[:5]:
    data = json_dict[letter_set]
    print(f"{letter_set}: {data}")

In [None]:
def parse_latest_file_date(file_path: str) -> datetime:
    """
    Return the logical date encoded in an export file's name.

    The logical date means the file covers all past puzzles processed up to and
    including that date; it is not the date on which the file was created.

    Expected file name structure: data_{YYYY-MM-DD date}_{timestamp of creation}.json

    Raises ValueError when the name (without extension) does not split into
    exactly three "_"-separated parts, or when the date part does not match
    DATE_FORMAT.
    """
    # Path.stem strips the directories and the trailing .json
    _prefix, logical_date, _created_at = Path(file_path).stem.split("_")
    return datetime.strptime(logical_date, DATE_FORMAT)

In [None]:
def move_out_of_latest(file_path: str):
    """
    Relocate a file from a `latest` directory into that directory's parent.

    The file keeps its name. Prints a confirmation once it has been moved.

    Raises FileNotFoundError if `file_path` does not exist, and ValueError if
    the file's immediate parent directory is not named "latest".
    """
    source = Path(file_path)
    containing_dir = source.parent

    # Check both preconditions before touching the file system
    if not source.exists():
        raise FileNotFoundError(f"{source} does not exist.")
    if containing_dir.name != "latest":
        raise ValueError(f"Expected file to be in a 'latest' directory, but found: {source.parent}")

    # Same file name, one directory level up
    destination = containing_dir.parent / source.name
    source.rename(destination)

    print(f"✅ File moved to: {destination}")

In [None]:
# Make sure {EXPORT_PATH}/latest exists: missing parent dirs are created and an
# already existing directory is left as is.
export_dir = Path(get_local_path(EXPORT_PATH))
latest_dir = export_dir.joinpath("latest")
latest_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Get the current "latest" file (if any) by listing the `.json` files under
# {EXPORT_PATH}/latest.
# NOTE(review): assumes get_all_files returns a list of file paths - confirm
# against src.fileutils.
latest_files = get_all_files(f"{EXPORT_PATH}/latest", [".json"])

In [None]:
# Fail if there is more than one latest file: with several files in /latest it
# would be ambiguous which export is the current one.
if len(latest_files) > 1:
    # The search above matches `.json` files, so the message names that extension
    raise Exception(f"{latest_dir} should have only one `.json` file; found {len(latest_files)}. Please fix and rerun.")
    

In [None]:
# Log if there is no latest file. Nothing has to be moved out of /latest in
# that case; the new export is simply written there.
if len(latest_files) == 0:
    print(f"🔎 No `.json` files found in {latest_dir}.")

In [None]:
# If there is already a file in /latest, retire it before the new export is written
if len(latest_files) == 1:
    latest_file = latest_files[0]

    # Compare the logical date encoded in the file name to the puzzle date
    latest_date = parse_latest_file_date(latest_file)
    current_date = datetime.strptime(_PUZZLE_DATE, DATE_FORMAT)

    # Fail if the current_date < latest_date: an export for an earlier puzzle
    # date must not replace a later one
    if current_date < latest_date:
        raise Exception(f"Puzzle date {_PUZZLE_DATE} must be same or later than latest date: {latest_date.strftime(DATE_FORMAT)}")

    # Otherwise (same or later date) move the latest file out of /latest into
    # the parent dir of /latest. Log the file being moved, not the directory.
    print(f"🧹 Moving previous file {latest_file} out of /latest ...")
    move_out_of_latest(latest_file)

In [None]:
# Save the new data in /latest
# File name: data_{puzzle date}_{creation time in whole epoch seconds}.json,
# the same structure that parse_latest_file_date expects when reading it back.
creation_timestamp = str(int(time.time()))
new_file_path = latest_dir / f"data_{_PUZZLE_DATE}_{creation_timestamp}.json"

In [None]:
# Write compact JSON: no whitespace after separators (cuts file size almost in
# half), with keys sorted.
with new_file_path.open("w") as f:
    json.dump(json_dict, f, sort_keys=True, separators=(",", ":"))

print(f"✅ File saved as {new_file_path}")