# bootstrap_words_01_filter_add_words

- read in external wordlist
- filter out wordlist words that cannot be Spelling Bee solution words
    - skip if < 4 chars long
    - get letter_set, skip if len(letter_set) > 7
    - append (word, letter_set, version) to rows
- get all past puzzles from storage, find new solution words that are not in the filtered wordlist
    - each new word gets a word, letter_set, and version = 2
- for each word in the combined set (filtered wordlist words + new puzzle words):
    - version = 1 for words from the external wordlist, version = 2 for new words from puzzles
    - add (word, letter_set, version) to rows
- save to Delta table

In [0]:
%run "./00_setup"

In [0]:
# TODO: parameterize in pipeline
# Database and table that the final cell of this notebook writes to.
_TARGET_DB_NAME: str = "raw"
_TARGET_TABLE_NAME: str = "words"

In [0]:
from src.wordutils import get_letter_set, filter_wordlist
from src.constants import WORDLIST_PATH, RAW_WORDLIST_FILENAME, RAW_SOLUTIONS_PATH
from src.fileutils import get_all_files, get_local_path, get_puzzle_by_path, word_file_to_set
from src.sparkdbutils import create_db, create_unpartitioned_table
from pyspark.sql.types import *
import pyspark.sql.functions as F

In [0]:
# Load the raw external wordlist, then keep only what filter_wordlist lets through.
raw_wordlist_file = f"{WORDLIST_PATH}/{RAW_WORDLIST_FILENAME}"
unfiltered_words = word_file_to_set(raw_wordlist_file)
wordlist_words = filter_wordlist(unfiltered_words)
print(f"📋 {len(wordlist_words)} words after filtering external wordlist.")

In [0]:
# Add solution words from past puzzles that were not in the external wordlist.
# These words are not run through the wordlist filter: if a word appeared in a
# puzzle solution, it is valid by definition.

# Find every stored puzzle file.
puzzle_paths = get_all_files(RAW_SOLUTIONS_PATH, [".json"])
puzzle_words = set()

# Load each puzzle and merge its answers into one de-duplicated set.
for puzzle_path in puzzle_paths:
    puzzle = get_puzzle_by_path(puzzle_path)
    answers = puzzle["answers"]
    puzzle_words.update(answers)

# Use the method forms instead of the `-` / `|` operators: the operators need
# a set on both sides, while the methods accept any iterable. This keeps the
# cell working whether wordlist_words is a set or a list, and gives the same
# result as the operators when it is a set.
new_puzzle_words = puzzle_words.difference(wordlist_words)
all_words = new_puzzle_words.union(wordlist_words)
print(f"👀 {len(new_puzzle_words)} new words found in past solutions: {', '.join(sorted(new_puzzle_words))}")
print(f"📋 {len(all_words)} words total after adding new words from past puzzle answers.")

In [0]:
# create the rows
def get_wordlist_version(word: str) -> int:
    """Return the wordlist version number for *word*.

    A word that is a member of ``new_puzzle_words`` is version 2; any other
    word is version 1.
    """
    return 2 if word in new_puzzle_words else 1

# One (word, letter_set, version) tuple per word, in sorted word order.
sorted_words = sorted(all_words)
rows = [
    (entry, get_letter_set(entry), get_wordlist_version(entry))
    for entry in sorted_words
]

In [0]:
# Report row counts, split by the version value stored at index 2 of each row.
row_versions = [row[2] for row in rows]
print(f"total rows: {len(rows)}")
print(f"new_puzzle_words rows: {row_versions.count(2)}")
print(f"wordlist_words rows: {row_versions.count(1)}")

In [0]:
# Column name -> Spark type for the row tuples; every column is non-nullable.
column_types = [
    ("word", StringType()),
    ("letter_set", StringType()),
    ("version", IntegerType()),
]
schema = StructType(
    [StructField(name, data_type, False) for name, data_type in column_types]
)
df = spark.createDataFrame(rows, schema=schema)

In [0]:
# Last step: add a date_added column, NULL for every row written by this
# bootstrap notebook. Going forward it is meant to hold the puzzle date on
# which a word was added to the wordlist by a later puzzle.
null_date = F.lit(None).cast("date")
df = df.withColumn("date_added", null_date)

In [0]:
# Hand the target database name to create_db, then pass df plus the target
# table/database names to create_unpartitioned_table.
# NOTE(review): presumably this creates the database if missing and saves df
# as an unpartitioned table; the exact create/overwrite semantics live in
# src.sparkdbutils — confirm there before re-running against existing data.
create_db(spark, _TARGET_DB_NAME)
create_unpartitioned_table(spark, df, _TARGET_TABLE_NAME, _TARGET_DB_NAME)