# 01 Backend walkthrough

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Move from the notebook folder up to the project root, which is where the
# `src` package imported below lives. The guard keeps the cell idempotent:
# an unconditional chdir("..") would climb one directory higher on every re-run.
if not os.path.isdir("src"):
    os.chdir("..")
os.getcwd()

In [None]:
from src.gridgpt.utils import load_catalog

In [None]:
# Load the catalog and show it as the cell output.
# NOTE(review): presumably the project's registry of data paths/settings — confirm in src/gridgpt/utils.
catalog = load_catalog()
catalog

## 1. Set-up word database

GridGPT uses a word database built from online crossword sources. Follow these steps to create and maintain the database:

1. In the terminal, navigate to the root directory of the project: `cd /path/to/gridgpt` and activate your virtual environment.

2. Scrape the source data of NYT's Mini Crosswords from [`worddb.com`](https://worddb.com)

    ```bash
    python scripts/scrape_worddb.py --start-date 2023-01-01 --end-date 2023-12-31
    ```
    This creates (or updates): `data/01_raw/worddb_com/nyt_mini_clues.json`


3. Process the scraped raw data to create the main word database

    ```bash
    python scripts/create_worddb_database.py
    ```
    This processes the scraped data and creates: `data/02_intermediary/word_database/word_database_full.json`

## 2. Word database manager

When `WordDatabaseManager()` is initialized, it automatically creates up-to-date filtered databases tailored to the crossword being generated:

- [`word_database_filtered.json`](../data/02_intermediary/word_database/word_database_filtered.json) - Filtered word-clue pairs
- [`word_list_with_frequencies.json`](../data/02_intermediary/word_database/word_list_with_frequencies.json) - Word frequency analysis

The filtering process is configurable through the following parameters:
- `min_length` & `max_length` - minimum and maximum number of characters in a word
- `min_frequency` - minimum frequency threshold (e.g., words must have been used in a crossword more than 5 times)
- `exclude_special_chars` - exclude special characters
- `exclude_reference_clues` - remove reference clues (e.g., "See 15-Across")

In [None]:
from src.gridgpt.word_database_manager import WordDatabaseManager

In [None]:
# Build the manager with the filters used throughout this walkthrough:
# words of 3 to 5 characters, a minimum frequency of 1, and with special
# characters and reference clues excluded.
word_db_manager = WordDatabaseManager(
    min_frequency=1,
    min_length=3,
    max_length=5,
    exclude_special_chars=True,
    exclude_reference_clues=True,
)

In [None]:
# Data held by the WordDatabaseManager instance:
#   - word_db_manager.word_database_full
#   - word_db_manager.word_database_filtered
#   - word_db_manager.word_list_with_frequencies
#   - word_db_manager.words_by_length

# Sizes of the full and the filtered database
full_size = len(word_db_manager.word_database_full)
print(f"Number of words in the full word database: {full_size}")
filtered_size = len(word_db_manager.word_database_filtered)
print(f"Number of words in the filtered word database: {filtered_size}")

# Size of the frequency list together with its lowest and highest frequency
frequency_by_word = word_db_manager.word_list_with_frequencies
lowest_frequency = min(frequency_by_word.values())
highest_frequency = max(frequency_by_word.values())
print(
    f"Number of words in the word list with frequencies: {len(frequency_by_word)}",
    f"\n  with minimum frequency {lowest_frequency} and maximum frequency {highest_frequency}.",
)

# Word lengths for which a group exists
length_groups = list(word_db_manager.words_by_length.keys())
print(f"Groups of words by length: {length_groups}")

## 3. Template manager

In [None]:
from src.gridgpt.template_manager import (
    load_templates,
    select_template,
    identify_theme_slots,
    print_template_grid,
)

In [None]:
# Load the template definitions and list each one by name, ID, and difficulty.
templates_data = load_templates()
templates = templates_data["templates"]

# The loop variable is deliberately not called `template`: that name holds the
# template selected for the rest of the walkthrough, and reusing it here would
# silently replace the selection whenever this cell is re-run.
for available_template in templates:
    print(
        f"{available_template['name']} "
        f"(ID: {available_template['id']}, difficulty: {available_template['difficulty']})"
    )

In [None]:
# Select, by ID, the template that the following cells fill with words.
template = select_template(template_id='5x5_blocked_corners')

In [None]:
# NOTE(review): `theme_slots` is not referenced again in the visible part of
# this notebook; the call only demonstrates `identify_theme_slots`.
theme_slots = identify_theme_slots(template)

In [None]:
print_template_grid(template)

## 4. Theme manager

In [None]:
from src.gridgpt.theme_manager import ThemeManager, generate_theme_entry

In [None]:
theme = "music"

### All in one

In [None]:
# Produce a theme entry for `theme` in a single call and display it.
# The arguments mirror those passed to `find_theme_entries` and
# `choose_theme_entries` in the "Step by step" cells; presumably this helper
# wraps both steps — confirm in src/gridgpt/theme_manager.
theme_entry = generate_theme_entry(
    theme=theme,
    min_chars=5,
    max_chars=5,
    min_frequency=1,
    similarity_mode="semantic",
    similarity_threshold=0.35,
    weigh_similarity=True,
    word_db_manager=word_db_manager
)
theme_entry

### Step by step

In [None]:
theme_manager = ThemeManager(theme, word_db_manager)

In [None]:
# Embed the theme text, store the first (only) result on the manager, and display it.
# NOTE(review): presumably `find_theme_entries` reads `theme_embedding` — confirm.
theme_manager.theme_embedding = theme_manager.embedding_provider.embed([theme])[0]
theme_manager.theme_embedding

In [None]:
# Collect candidate theme entries of exactly 5 characters.
# The result is consumed below as a sequence of (word, score) pairs.
theme_entries = theme_manager.find_theme_entries(
    min_chars=5,
    max_chars=5,
    min_frequency=1,
    similarity_mode="semantic"
)

In [None]:
# Print the first 20 (word, score) pairs with scores shown to three decimals.
print("\nTop 20 theme entries by similarity:")
leading_entries = theme_entries[:20]
for entry_word, similarity in leading_entries:
    print(f"  {entry_word}: {similarity:.3f}")

In [None]:
# Try running this cell a few times to see which different entries may be chosen.
# `choose_theme_entries` returns a sequence even when a single entry is
# requested, so the first element is taken and displayed.
selected_theme_entries = theme_manager.choose_theme_entries(
    number_of_theme_entries=1, threshold=0.35, weigh_similarity=True
)
selected_theme_entry = selected_theme_entries[0]
selected_theme_entry

In [None]:
# Compare three ways of choosing theme entries: one weighted pick, several
# weighted picks, and several picks with similarity weighting switched off.
def _choose(count, weighted):
    """Choose `count` theme entries at the walkthrough's 0.35 threshold."""
    return theme_manager.choose_theme_entries(
        number_of_theme_entries=count, threshold=0.35, weigh_similarity=weighted
    )


print()
print("Choose 1 theme entry with similarity weighting:")
selected = _choose(1, True)
print(f"Selected: {selected}")

print("\nChoose 3 theme entries with similarity weighting:")
selected_multiple = _choose(3, True)
print(f"Selected: {selected_multiple}")

print("\nChoose 3 theme entries without similarity weighting (uniform random):")
selected_uniform = _choose(3, False)
print(f"Selected: {selected_uniform}")

In [None]:
# Run the search and selection for several themes to compare the results.
# NOTE(review): unlike the step-by-step cell, no `word_db_manager` is passed to
# ThemeManager here, so it presumably falls back to its own default — confirm
# this is intended.
themes_to_test = ["food", "space", "sports", "music"]

for test_theme in themes_to_test:
    print(f"\n=== Theme: {test_theme} ===")
    test_manager = ThemeManager(test_theme)

    # Candidate entries of 4 to 6 characters; show the first five with their scores
    entries = test_manager.find_theme_entries(min_chars=4, max_chars=6)
    top_five = [(word, f"{score:.3f}") for word, score in entries[:5]]
    print(f"Top 5 entries: {top_five}")

    # Pick two entries at a lower threshold than the one used for "music" above
    selected = test_manager.choose_theme_entries(number_of_theme_entries=2, threshold=0.15)
    print(f"Selected entries: {selected}")

## 5. Crossword Generator

In [None]:
from src.gridgpt.crossword_generator import CrosswordGenerator, generate_themed_crossword

### All in one

In [None]:
# Build the crossword in one call from the selected template and theme entry.
# The result is indexed with "grid" and "filled_slots" later in this notebook.
crossword = generate_themed_crossword(
    template=template,
    theme_entry=theme_entry,
    max_attempts=100,
    backtracking_max_attempts=100,
    word_db_manager=word_db_manager
)

### Step by step

In [None]:
generator = CrosswordGenerator(word_db_manager)

In [None]:
print(theme_entry)

In [None]:
generator.validate_theme_entry(theme_entry)

In [None]:
# Place the theme entry into the template and display the result; this is the
# starting point passed to `backtracking_fill` in the next cell.
template_with_theme = generator.place_theme_entry(template, theme_entry)
template_with_theme

In [None]:
# Retry the backtracking fill until one attempt produces a grid. An attempt
# counts as failed when `backtracking_fill` raises or returns a falsy value.
backtracking_attempts = 100
for attempt in range(backtracking_attempts):
    try:
        filled_grid = generator.backtracking_fill(template_with_theme, max_attempts=100)
    except Exception:
        # Tolerate failed attempts, but surface the error from the final one
        if attempt == backtracking_attempts - 1:
            raise
        continue
    if filled_grid:
        print(f"Successfully filled grid on attempt {attempt + 1}")
        break
else:
    # Every attempt returned an empty result without raising; stop here with a
    # clear message instead of failing later when `filled_grid` is indexed.
    raise RuntimeError(
        f"Could not fill the grid in {backtracking_attempts} attempts; "
        "re-run this cell or try a different theme entry or template."
    )

In [None]:
filled_grid["grid"]

## 6. Clue manager

In [None]:
from src.gridgpt.clue_manager import ClueRetriever, ClueGenerator

In [None]:
print("Theme:", theme)
print("Grid:")
display(crossword['grid'])

### 6.1 Retrieve clues from database

In [None]:
retriever = ClueRetriever(word_db_manager)

In [None]:
retrieved_clues = retriever.retrieve_existing_clues(crossword)

In [None]:
# Print the retrieved clues grouped by direction, each with its answer word.
def _slot_order(item):
    """Sort key for (slot_id, clue) pairs: number embedded in the slot ID first.

    Plain string sorting would put an ID such as "10A" before "1A".
    IDs without any digits fall back to plain string order.
    """
    slot_id = item[0]
    digits = "".join(ch for ch in slot_id if ch.isdecimal())
    return (int(digits) if digits else 0, slot_id)


# Slot IDs containing "A" are listed as Across, those containing "D" as Down
across_clues = {k: v for k, v in retrieved_clues.items() if 'A' in k}
down_clues = {k: v for k, v in retrieved_clues.items() if 'D' in k}

print("\nClues:")
for heading, clues in (("Across", across_clues), ("Down", down_clues)):
    print(f"\n{heading}:")
    for slot_id, clue in sorted(clues.items(), key=_slot_order):
        word = crossword["filled_slots"][slot_id]
        print(f"{slot_id}: {clue} ({word})")

In [None]:
# Under the hood, retrieve_existing_clues first gets all available
# clues for a given word and then randomly selects one.
# NOTE(review): assumes the example word exists in the filtered database — confirm.
example_word = "AURA"

available_clues = retriever.get_available_clues(example_word)
print("Available clues:", available_clues)

selected_clue = retriever.select_random_clue(available_clues)
print("Randomly selected clue:", selected_clue)

### 6.2 Generate clues with an LLM

In [None]:
# NOTE(review): this rebinds `generator`, which until now referred to the
# CrosswordGenerator; re-running the crossword "Step by step" cells after this
# one would use the ClueGenerator instead. Consider a distinct name.
generator = ClueGenerator(word_db_manager=word_db_manager)

In [None]:
# Prompt that is formatted and used for clue generation
generator.prompt

In [None]:
generated_clues = generator.generate_clues(crossword, theme)

In [None]:
# Print the generated clues grouped by direction, each with its answer word.
def _slot_order(item):
    """Sort key for (slot_id, clue) pairs: number embedded in the slot ID first.

    Plain string sorting would put an ID such as "10A" before "1A".
    IDs without any digits fall back to plain string order.
    """
    slot_id = item[0]
    digits = "".join(ch for ch in slot_id if ch.isdecimal())
    return (int(digits) if digits else 0, slot_id)


# Slot IDs containing "A" are listed as Across, those containing "D" as Down
across_clues = {k: v for k, v in generated_clues.items() if 'A' in k}
down_clues = {k: v for k, v in generated_clues.items() if 'D' in k}

print("\nClues:")
for heading, clues in (("Across", across_clues), ("Down", down_clues)):
    print(f"\n{heading}:")
    for slot_id, clue in sorted(clues.items(), key=_slot_order):
        word = crossword["filled_slots"][slot_id]
        print(f"{slot_id}: {clue} ({word})")