# 02_extract_word_decisions

- read in the files from a given date range
- for each file:
    - find all possible words 
    - extract the explicit and implicit decisions about each word
    - write to the bronze table

In [1]:
%run "./00_setup.ipynb"

✅ Added /Users/mike/repos/spelling-bee-solver-training to path
Ready to import from src/


In [2]:
from datetime import datetime, timedelta
import os
from glob import glob
from pathlib import Path

In [3]:
from src.constants import WORDLIST_FILENAME, DATE_FORMAT
from src.fileutils import word_file_to_set, get_local_path
from src.wordutils import get_letter_set_map, ingest_puzzle_by_date

In [7]:
# TODO: Parameterize YEAR in notebook, then run this per month
# TODO: Write helper function to find all puzzle dates for a given year/month
def get_all_json_files(path):
    """Recursively collect the paths of every ``.json`` file under ``path``.

    When the Databricks ``dbutils`` global is available, the directory tree is
    walked with ``dbutils.fs.ls``. When ``dbutils`` is not defined (for example
    when the notebook runs locally), the local filesystem is searched with
    ``pathlib`` instead.

    Args:
        path: Root directory to search.

    Returns:
        list[str]: Paths of all files found whose path ends with ``.json``.
        The Databricks branch returns them in listing order; the local branch
        returns them sorted.
    """
    try:
        fs = dbutils.fs
    except NameError:
        # No `dbutils` in this environment: fall back to the local filesystem.
        return sorted(
            str(candidate)
            for candidate in Path(path).rglob("*.json")
            if candidate.is_file()
        )

    files = []
    for item in fs.ls(path):
        if item.isDir():
            # Descend into sub-directories and merge whatever they contain.
            files.extend(get_all_json_files(item.path))
        elif item.path.endswith('.json'):
            files.append(item.path)
    return files

# Year of puzzles to process; the date range below is derived from it.
YEAR = 2024

# Half-open range [Jan 1 of YEAR, Jan 1 of YEAR + 1), i.e. every day of YEAR.
start_date = datetime(YEAR, 1, 1)
end_date = datetime(YEAR + 1, 1, 1)
curr_date = start_date

puzzle_dates = []
# Dates to leave out of the list. These entries are specific to 2024.
# NOTE(review): presumably days with no puzzle file available -- confirm.
missing = [datetime(2024, 7, 30), datetime(2024, 12, 16)]
while curr_date < end_date:
    if curr_date not in missing:
        puzzle_dates.append(curr_date.strftime("%Y-%m-%d"))
    curr_date += timedelta(days=1)

print(puzzle_dates)

['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08', '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12', '2024-01-13', '2024-01-14', '2024-01-15', '2024-01-16', '2024-01-17', '2024-01-18', '2024-01-19', '2024-01-20', '2024-01-21', '2024-01-22', '2024-01-23', '2024-01-24', '2024-01-25', '2024-01-26', '2024-01-27', '2024-01-28', '2024-01-29', '2024-01-30', '2024-01-31', '2024-02-01', '2024-02-02', '2024-02-03', '2024-02-04', '2024-02-05', '2024-02-06', '2024-02-07', '2024-02-08', '2024-02-09', '2024-02-10', '2024-02-11', '2024-02-12', '2024-02-13', '2024-02-14', '2024-02-15', '2024-02-16', '2024-02-17', '2024-02-18', '2024-02-19', '2024-02-20', '2024-02-21', '2024-02-22', '2024-02-23', '2024-02-24', '2024-02-25', '2024-02-26', '2024-02-27', '2024-02-28', '2024-02-29', '2024-03-01', '2024-03-02', '2024-03-03', '2024-03-04', '2024-03-05', '2024-03-06', '2024-03-07', '2024-03-08', '2024-03-09', '2024-03-10', '2024-03-11', '2024

In [None]:
# Read the reference word list, then build the letter-set lookup from it.
# NOTE(review): presumably `letter_set_map` maps letter sets to words -- confirm in src.wordutils.
wordlist_path = f"raw/wordlists/{WORDLIST_FILENAME}"
wordlist = word_file_to_set(wordlist_path)
letter_set_map = get_letter_set_map(wordlist)

In [None]:
# Accumulate the rows produced for every puzzle date into one flat list.
rows = []
for puzzle_date in puzzle_dates:
    print(f"ingesting puzzle for {puzzle_date}...")
    # Use the ingest helper this notebook imports from src.wordutils.
    # NOTE(review): argument order is assumed to be (date, wordlist, letter_set_map) -- confirm.
    curr_rows = ingest_puzzle_by_date(puzzle_date, wordlist, letter_set_map)
    print(f"{len(curr_rows)} rows created for {puzzle_date}")
    rows.extend(curr_rows)

print(f"{len(rows)} total rows")
    

In [None]:
# TODO: Delete this cell (local only)
# TODO: Create local functions for starting Spark Session (including with db)
from pyspark.sql import SparkSession
# Star import supplies the type names (StructType, StructField, StringType,
# BooleanType, DateType) used when the DataFrame schema is declared.
from pyspark.sql.types import * 
import pyspark.sql.functions as F
# Reuse the active SparkSession if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Column name -> Spark type for the word-decision rows, in column order.
# Every column is declared non-nullable.
# NOTE(review): DateType columns expect datetime.date values; confirm that the
# ingest helper emits dates rather than the "%Y-%m-%d" strings it is given.
column_types = [
    ("word", StringType()),
    ("accepted", BooleanType()),
    ("was_in_wordlist", BooleanType()),
    ("puzzle_date", DateType()),
    ("center_letter", StringType()),
    ("outer_letters", StringType()),
]
schema = StructType(
    [StructField(column_name, column_type, False) for column_name, column_type in column_types]
)
df = spark.createDataFrame(rows, schema=schema)

In [None]:
# Add derived year/month/day columns for partitioning
df = df.withColumn("year", F.year("puzzle_date")) \
       .withColumn("month", F.month("puzzle_date")) \
       .withColumn("day", F.dayofmonth("puzzle_date"))

In [None]:
# Sanity check: print the DataFrame's column names, types and nullability.
df.printSchema()

In [None]:
# Show up to 100 distinct words, alphabetically and untruncated, whose
# was_in_wordlist flag is False.
not_in_wordlist = df.where(~df["was_in_wordlist"])
not_in_wordlist.select("word").distinct().orderBy("word").show(100, truncate=False)

In [None]:
# TODO: Backfill script validates as it goes, uses replaceWhere with delta runs for a given year only, one chunk at a time
# TODO: Daily ingest script that writes one file for a specific day/month/year
# TODO: Repurpose helper methods to write to table, create db if it doesn't exist ... again with local and dbx code paths??
# TODO: Try to do all writes at once or find a batch size
# TODO: Need a way to redo the run, 1 write per puzzle date? Is that efficient??