# bootstrap_words_01_filter_add_words

- version = 1
- read in raw external wordlist -> source set
- rows = [ ]
- for each word in source set:
   - skip if < 4 chars long
   - get letter_set, skip if len(letter_set) > 7
   - append (word, letter_set, version) to rows
- get all puzzle paths
- puzzle_answer_words = set
- for each puzzle:
    -  load puzzle and get answers
    -  for each word in answers:
        -  add to puzzle_answer_words
- for each word in puzzle_answer_words:
    - add (word, letter_set, version) to rows, unless the word is already in rows
- save rows (word, letter_set, version) to CSV

In [None]:
%run "./00_setup.ipynb"

In [None]:
import csv
from src.wordutils import get_letter_set, filter_wordlist
from src.constants import WORDLIST_PATH, RAW_WORDLIST_FILENAME, RAW_SOLUTIONS_PATH, WORDLIST_TEMP_CSV_FILENAME
from src.fileutils import get_all_files, get_local_path, get_puzzle_by_path, word_file_to_set

In [None]:
# Read the raw external wordlist from disk, then run it through filter_wordlist.
raw_wordlist_path = f"{WORDLIST_PATH}/{RAW_WORDLIST_FILENAME}"
raw_words = word_file_to_set(raw_wordlist_path)
wordlist = filter_wordlist(raw_words)
print(f"{len(wordlist)} words after filtering external wordlist.")

In [None]:
# Merge in the answers of past puzzles so that solution words missing from the
# external wordlist are included too. These words are deliberately NOT filtered:
# anything that appeared in a puzzle solution is valid by definition.

# Collect every .json puzzle file under the raw solutions directory.
puzzle_paths = get_all_files(RAW_SOLUTIONS_PATH, [".json"])

# Print one progress dot per puzzle, then fold that puzzle's answers into wordlist.
print("loading puzzles ", end="")
for puzzle_path in puzzle_paths:
    print(".", end="")
    wordlist.update(get_puzzle_by_path(puzzle_path)["answers"])

print(f"\n{len(wordlist)} words after adding new words from past puzzle answers.")

In [None]:
# Build one (word, letter_set, version) tuple per word, visiting words in sorted order.
VERSION = 1
rows = []
for word in sorted(wordlist):
    rows.append((word, get_letter_set(word), VERSION))

In [None]:
# Write the rows to the temporary wordlist CSV, preceded by a header line.
temp_path = get_local_path(f"{WORDLIST_PATH}/{WORDLIST_TEMP_CSV_FILENAME}")
# newline="" is required by the csv module: the writer emits its own line
# terminators, and without it Windows would translate them into "\r\r\n",
# producing a blank line after every record. The encoding is pinned so the
# output does not depend on the platform's default locale.
with open(temp_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["word", "letter_set", "version"])
    writer.writerows(rows)

print(f"✅ Wrote {len(rows)} rows to {temp_path}")