# 02_extract_word_decisions

- read in the files from a given date range
- for each file:
    - find all possible words 
    - extract the explicit and implicit decisions about each word
    - write to the bronze table

TODOs:
- one pipeline to backfill, another for daily ingestion
- backfill runs for a year, one month at a time, with verification and audit steps
- backfill gets the paths for a given month (`glob` locally, `dbutils.fs.ls()` in cloud), then reads in each puzzle one at a time, writing to in-memory rows, then writes to a dataframe, then uses `replaceWhere` with Delta
- daily can use the delete + write pattern (or will `replaceWhere` work for this as well??)
- helper methods: `get_puzzle_by_date`, `ingest_puzzle_by_date` (for daily), `get_puzzle_paths`, `get_puzzle_by_path`, `ingest_puzzle_by_path` (for backfill) 

In [None]:
%run "./00_setup.ipynb"

In [None]:
from datetime import datetime, timedelta
import os
from glob import glob
from pathlib import Path

In [None]:
from src.constants import WORDLIST_FILENAME, DATE_FORMAT
from src.fileutils import word_file_to_set, get_local_path
from src.wordutils import get_letter_set_map, ingest_puzzle_by_date

In [None]:
# TODO: Parameterize YEAR in notebook, then run this per month
# TODO: Write helper function to find all puzzle dates for a given year/month
# TODO: need local version of this
def get_all_json_files(path):
    """Return the paths of all ``.json`` files found under ``path``.

    Lists ``path`` with ``dbutils.fs.ls`` and descends into every entry that
    reports itself as a directory, so nested folders are searched as well.
    Paths are returned in listing order, with a directory's contents appearing
    at the position where that directory was listed.

    Args:
        path: Location to list; passed straight to ``dbutils.fs.ls``.

    Returns:
        A list of ``.path`` values for the entries whose path ends in ``.json``.
    """
    json_paths = []
    for entry in dbutils.fs.ls(path):
        if entry.isDir():
            # Recurse and splice the nested results in at this position.
            json_paths += get_all_json_files(entry.path)
            continue
        if entry.path.endswith('.json'):
            json_paths.append(entry.path)
    return json_paths
# Local version - use glob

# Year to process. The date range below is derived from it, so changing this
# single value moves the whole range.
YEAR = 2024

# Half-open range: [Jan 1 of YEAR, Jan 1 of the following year).
start_date = datetime(YEAR, 1, 1)
end_date = datetime(YEAR + 1, 1, 1)
curr_date = start_date

puzzle_dates = []
# Dates deliberately left out of the run.
# NOTE(review): presumably days with no puzzle file available — confirm.
# These are specific to 2024 and need revisiting when YEAR changes.
missing = [datetime(2024, 7, 30), datetime(2024, 12, 16)]
while curr_date < end_date:
    if curr_date not in missing:
        # Dates are carried forward as ISO "YYYY-MM-DD" strings.
        puzzle_dates.append(curr_date.strftime("%Y-%m-%d"))
    curr_date += timedelta(days=1)

print(puzzle_dates)

In [None]:
# Load the word list file once; both values are reused for every puzzle in the
# ingestion loop.
# NOTE(review): going by the helper name, `wordlist` is a set of words — confirm
# in src.fileutils.
wordlist = word_file_to_set(f"raw/wordlists/{WORDLIST_FILENAME}")
# Lookup structure derived from the word list.
# NOTE(review): presumably keyed by each word's set of letters — confirm in
# src.wordutils.
letter_set_map = get_letter_set_map(wordlist)

In [None]:
# Ingest every puzzle date and accumulate the resulting rows in memory; the
# combined list is turned into a DataFrame in a later cell.
rows = []
for puzzle_date in puzzle_dates:
    print(f"ingesting puzzle for {puzzle_date}...")
    # Call the helper this notebook actually imports from src.wordutils;
    # `ingest_puzzle` is not imported or defined anywhere in the notebook.
    curr_rows = ingest_puzzle_by_date(puzzle_date, wordlist, letter_set_map)
    print(f"{len(curr_rows)} rows created for {puzzle_date}")
    rows.extend(curr_rows)

print(f"{len(rows)} total rows")
    

In [None]:
# TODO: Delete this cell (local only)
# TODO: Create local functions for starting Spark Session (including with db)
from pyspark.sql import SparkSession
from pyspark.sql.types import * 
import pyspark.sql.functions as F
# Reuse the active Spark session if there is one; otherwise create a new one
# with default settings.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Explicit schema for the ingested rows. Every column is declared non-nullable,
# and the column order here must match the order of values in each row.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("word", StringType()),
        ("accepted", BooleanType()),
        ("was_in_wordlist", BooleanType()),
        ("puzzle_date", DateType()),
        ("center_letter", StringType()),
        ("outer_letters", StringType()),
    )
])
df = spark.createDataFrame(rows, schema=schema)

In [None]:
# Derive year / month / day from puzzle_date so the table can be partitioned
# on those columns.
df = (
    df.withColumn("year", F.year("puzzle_date"))
    .withColumn("month", F.month("puzzle_date"))
    .withColumn("day", F.dayofmonth("puzzle_date"))
)

In [None]:
# Print the DataFrame schema as a sanity check on column names, types and
# nullability.
df.printSchema()

In [None]:
# Show up to 100 distinct words flagged as not being in the word list, sorted
# alphabetically and printed without truncation.
(
    df.where(~df["was_in_wordlist"])
    .select("word")
    .distinct()
    .sort("word")
    .show(100, truncate=False)
)

In [None]:
# TODO: Backfill script validates as it goes, uses replaceWhere with delta runs for a given year only, one chunk at a time
# TODO: Daily ingest script that writes one file for a specific day/month/year
# TODO: Repurpose helper methods to write to table, create db if it doesn't exist ... again with local and dbx code paths??
# TODO: Try to do all writes at once or find a batch size
# TODO: Need a way to redo the run, 1 write per puzzle date? Is that efficient??