# 02_transform_word_decisions

Part of single-file daily ingestion pipeline
- Read in the puzzle file for the parameterized date
- Find all possible words
- Extract the explicit and implicit decisions about each word
- Write to the bronze table


In [None]:
# Run the shared setup notebook inside this kernel before anything else.
# NOTE(review): `spark` is used by later cells but never defined in this
# notebook — presumably 00_setup creates it; confirm.
%run "./00_setup.ipynb"

In [None]:
from src.sparkdbutils import get_or_create_db, write_to_table
from src.fileutils import get_latest_wordlist, word_file_to_set
from src.wordutils import get_letter_set_map, transform_puzzle_to_word_decisions_by_date
from src.bronzeutils import rows_to_word_decisions_df

In [None]:
# TODO: Parameterized _PUZZLE_DATE
# Date of the puzzle to ingest, written as "YYYY-MM-DD": the write step splits
# this string on "-" and converts the pieces to integer year, month and day.
_PUZZLE_DATE = "2023-02-18"

In [None]:
# Load the wordlist inputs that the transform step takes as arguments:
# the wordlist itself, a letter-set map derived from it, and the wordlist
# version.
# NOTE(review): the exact return types of these helpers are not visible here —
# confirm against src.fileutils / src.wordutils.
wordlist_filename, wordlist_version = get_latest_wordlist()
wordlist = word_file_to_set(wordlist_filename)
letter_set_map = get_letter_set_map(wordlist)

In [None]:
# Produce the word-decision rows for the configured puzzle date, using the
# wordlist, its letter-set map and the wordlist version loaded earlier.
rows = transform_puzzle_to_word_decisions_by_date(
    _PUZZLE_DATE,
    wordlist,
    letter_set_map,
    wordlist_version,
)

In [None]:
# Convert the transformed rows into a DataFrame via the `spark` session.
# NOTE(review): the resulting schema is defined in src.bronzeutils, not here.
df = rows_to_word_decisions_df(rows, spark)

In [None]:
# Sanity check before writing: print the number of rows and the schema.
print(df.count())
df.printSchema()

In [None]:
# Preview the first 10 rows without truncating long column values.
df.show(n=10, truncate=False)

In [None]:
# TODO: Pipeline parameter for db name, table name, puzzle_date, etc.
# TODO: Do not set this as a variable here
# Name of the database the word-decision table is written to.
_TARGET_DB_NAME = "bronze"
# NOTE(review): per its name this helper creates the database when it is
# missing — confirm in src.sparkdbutils.
get_or_create_db(spark, _TARGET_DB_NAME)

In [None]:
# TODO: Pipeline parameter for table name
_TABLE_NAME = "word_decisions"


# TODO: Move this helper into a shared module under src/.
def _replace_where_for_date(puzzle_date):
    """Split a "YYYY-MM-DD" date string into integer year/month/day values.

    Args:
        puzzle_date: Date string made of exactly three "-"-separated integer
            fields, in year, month, day order.

    Returns:
        A dict with the keys "year", "month" and "day", each mapped to an int.

    Raises:
        ValueError: If the string does not have exactly three fields, or a
            field cannot be converted to an int.
    """
    year, month, day = puzzle_date.split("-")
    return {"year": int(year), "month": int(month), "day": int(day)}


# The dict identifies the single puzzle day being written.
# NOTE(review): presumably write_to_table uses it to replace only the matching
# rows, so re-running the same date does not duplicate data — confirm in
# src.sparkdbutils.
replace_where_dict = _replace_where_for_date(_PUZZLE_DATE)
partitions = ["year", "month"]

write_to_table(
    spark,
    df,
    _TARGET_DB_NAME,
    _TABLE_NAME,
    replace_where_dict,
    partitions,
)

In [None]:
# Read back the table that was just written, using the same database and
# table names as the write step so this check cannot drift onto a different
# table if those constants change.
df2 = spark.sql(f"SELECT * FROM {_TARGET_DB_NAME}.{_TABLE_NAME}")
print(df2.count())
# Preview the first 10 rows without truncating long column values.
df2.show(n=10, truncate=False)

In [None]:
# List the distinct partition-column combinations present in the table,
# reusing the `partitions` list passed to the write step so both stay in sync.
df2.select(partitions).distinct().sort(partitions).show()

In [None]:
# LOCAL ONLY - is this needed?
spark.stop()

In [None]:
# ===== TODOS / notes below this line =======

- one pipeline to backfill, another for daily ingestion
- backfill runs for a year, one month at a time, with validation and audit steps
- backfill gets the paths for a given month, then reads in each puzzle one at a time, writing to in-memory rows, then writes to a dataframe / table

- Daily ingest script that writes one file for a specific day/month/year
- Repurpose helper methods to write to table, create db if it doesn't exist ...