# Backfill: transform_word_decisions

Part of the historical backfill pipeline.

- One backfill pipeline run per year
- Work in batches of one month
- For each month:
    - Get the filepaths of puzzles for that month
    - Transform each puzzle into `word_decisions` table rows
    - Write the rows to the bronze table
    - Perform validation checks and write audit logs before and after each write operation (not yet implemented; see the TODO in the write loop)

## ⚠️ Not working locally? ⚠️

To run this notebook locally, edit the first code cell:

Change:  
`%run "./00_setup"`  
To:  
`%run "./00_setup.ipynb"`

👉 _Please **do not commit** this change — it's only for local execution._

In [None]:
%run "./00_setup"

In [None]:
from typing import Any

import pyspark.sql.functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import *

In [None]:
from src.sparkdbutils import create_db, write_to_table_replace_where
from src.fileutils import get_latest_wordlist, word_file_to_set, get_puzzle_paths
from src.wordutils import get_letter_set_map, transform_puzzle_to_word_decisions_by_path
from src.bronzeutils import rows_to_word_decisions_df, WORD_DECISIONS_PARTITIONS

In [None]:
# Resolve the wordlist to use (presumably the most recent one, per the helper's
# name); the call returns its filename and its version together.
wordlist_filename, wordlist_version = get_latest_wordlist()
# Load the words from that file (presumably into a set, per the helper's name; confirm).
wordlist = word_file_to_set(wordlist_filename)
# Lookup derived from the wordlist. process_month passes it, together with
# wordlist and wordlist_version, to the per-puzzle transform.
letter_set_map = get_letter_set_map(wordlist)

In [None]:
def process_month(year: int, month: int) -> "DataFrame":
    """
    Build the word_decisions DataFrame for all puzzles in the given year/month.

    Puzzle files are processed in sorted path order. Each file is transformed
    using the notebook-level `wordlist`, `letter_set_map` and
    `wordlist_version`, and the rows from every file are concatenated before
    being converted to a DataFrame with the notebook's `spark` session.

    Args:
        year: Year passed to `get_puzzle_paths`.
        month: Month passed to `get_puzzle_paths` (the write loop passes 1-12).

    Returns:
        The DataFrame produced by `rows_to_word_decisions_df` from the
        collected rows.
    """
    rows = []
    # Sorting makes the row order independent of the order the paths are listed in.
    for puzzle_path in sorted(get_puzzle_paths(year, month)):
        rows.extend(
            transform_puzzle_to_word_decisions_by_path(
                puzzle_path,
                wordlist,
                letter_set_map,
                wordlist_version,
            )
        )

    # NOTE(review): a month with no puzzle files passes an empty list here;
    # confirm rows_to_word_decisions_df handles that case.
    return rows_to_word_decisions_df(rows, spark)

In [None]:
# TODO: Parameterize _YEAR, _TARGET_DB_NAME, _TABLE_NAME
# NOTE(review): 0000 evaluates to 0 and looks like a placeholder; set the
# year to backfill before running the write loop.
_YEAR = 0000
# Database and table that the write loop targets.
_TARGET_DB_NAME = "bronze"
_TABLE_NAME = "word_decisions"
# Presumably ensures the target database exists before any write; confirm in src.sparkdbutils.
create_db(spark, _TARGET_DB_NAME)

In [None]:
# Running total of the per-month row counts across all twelve months.
total_rows = 0

for month in range(1, 13):
    print(f"Processing year {_YEAR}, month {month}...")
    df = process_month(_YEAR, month)

    # Count first so the log line reports what is about to be written.
    month_row_count = df.count()
    total_rows = total_rows + month_row_count
    print(f"Writing {month_row_count} rows to {_TARGET_DB_NAME}.{_TABLE_NAME}")

    # The year/month dict is the replace-where condition; presumably it limits
    # the overwrite to rows for this year and month (confirm in src.sparkdbutils).
    write_to_table_replace_where(
        spark,
        df,
        _TARGET_DB_NAME,
        _TABLE_NAME,
        {"year": _YEAR, "month": month},
        WORD_DECISIONS_PARTITIONS,
    )

    # TODO: validation, audit log, etc.

print(f"{total_rows} written in total")

In [None]:
# Read back the table the backfill loop wrote to. The name is built from the
# same constants the writer uses, so this check keeps pointing at the right
# table once those constants are parameterized.
df2 = spark.sql(f"SELECT * FROM {_TARGET_DB_NAME}.{_TABLE_NAME}")
print(f"{df2.count()} total rows in table")
# Show the first 10 rows without truncating long column values.
df2.show(10, False)

In [None]:
# Show each distinct (year, month) combination present in the table, sorted by
# year and then month; at most 50 rows, without truncation.
year_month_cols = ["year", "month"]
distinct_year_months = df2.select(year_month_cols).distinct()
distinct_year_months.sort(year_month_cols).show(50, False)

In [None]:
# Distinct words whose was_in_wordlist flag is False, in alphabetical order.
# `== False` is intentional: it builds a Spark Column expression, which
# Python's `not` cannot do.
new_words = (
    df2.filter(df2.was_in_wordlist == False)  # noqa: E712
    .select("word")
    .distinct()
    .sort("word")
)

# Count once and reuse the result; each count() call runs a separate Spark job.
new_word_count = new_words.count()
print(f"{new_word_count} words not found in external wordlist")
# Pass the count so every matching word is displayed.
new_words.show(new_word_count)