In [1]:
%reload_ext dotenv
%dotenv ../.env

In [2]:
import sys
from pathlib import Path

project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [5]:
import pandas as pd

In [4]:
from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Explain how AI works in a few words"
)
print(response.text)

Learns from data to recognize patterns and make decisions.


In [None]:
import json

from strands_solver.word_finder import WordFinder

# Load puzzle from JSON
with open("../puzzles/2025-09-17.json", "r") as f:
    puzzle_data = json.load(f)

theme = puzzle_data["theme"]
grid = puzzle_data["grid"]

print(f"Theme: {theme}")
print(f"Grid size: {len(grid)}x{len(grid[0])}")

# Find all words in the grid
finder = WordFinder(grid)
found_strands = finder.find_all_words()

# Create list with theme first, then all found words
words = [theme] + list({strand.string for strand in found_strands})
print(f"\nTotal items to embed: {len(words)}")

Theme: We beg to differ
Grid size: 8x6

Total items to embed: 508


In [None]:
"ATTRACT" in words

True

In [None]:
from google import genai
from google.genai.types import EmbedContentConfig
from tenacity import retry, stop_after_attempt, wait_fixed

client = genai.Client()


# Retry on any exception, wait 60 seconds between retries, max 5 attempts
@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(60),
    before_sleep=lambda retry_state: print(
        f"Error occurred: {retry_state.outcome.exception() if retry_state.outcome else 'unkown'}. "
        f"\nWaiting 60 seconds before retry {retry_state.attempt_number}..."
    ),
)
def embed_batch(batch):
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=batch,
        config=EmbedContentConfig(
            task_type="SEMANTIC_SIMILARITY",
        ),
    )
    return response


# Process words in batches of 100
batch_size = 100
embeddings = []

for i in range(0, len(words), batch_size):
    batch = words[i : i + batch_size]
    response = embed_batch(batch)
    assert response.embeddings
    for emb in response.embeddings:
        assert emb.values
        embeddings.append(emb.values)
    print(f"Processed batch {i // batch_size + 1}: {len(batch)} words")

print(f"Total embeddings: {len(embeddings)}")

Processed batch 1: 100 words
Processed batch 2: 100 words
Processed batch 3: 100 words
Processed batch 4: 100 words
Processed batch 5: 100 words
Processed batch 6: 8 words
Total embeddings: 508


In [12]:
from itertools import zip_longest

import numpy as np


def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Calculate cosine similarity between two 1D numpy arrays."""
    arr1 = np.array(vec1)
    arr2 = np.array(vec2)
    norm1 = np.linalg.norm(arr1)
    norm2 = np.linalg.norm(arr2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(arr1, arr2) / (norm1 * norm2)


theme_similarity = [cosine_similarity(e, embeddings[0]) for e in embeddings]

df = pd.DataFrame(
    list(zip_longest(words, embeddings, theme_similarity, fillvalue=np.nan)),
    columns=np.array(["word", "embedding", "theme_similarity"]),
).sort_values("theme_similarity", ascending=False)
df.head(40)

NameError: name 'embeddings' is not defined

In [24]:
df[df["word"] == "ATTRACT"]

Unnamed: 0,word,embedding,theme_similarity
420,ATTRACT,"[-0.027488235, -0.0070873527, 0.017679159, -0....",0.78143


In [33]:
response = client.models.embed_content(
    model="gemini-embedding-001",
    contents=["OPPOSITES RIGID"],
    config=EmbedContentConfig(
        task_type="SEMANTIC_SIMILARITY",
    ),
)
assert response.embeddings
emb = response.embeddings[0].values
assert emb
cosine_similarity(emb, embeddings[0])

np.float64(0.7941492961309876)

"OPPOSITES RIGID" (an incorrect spangram) is closer to "We beg to differ" than "OPPOSITIES ATTRACT" (the correct spangram).

So semantic similarity is not a silver bullet for ranking solutions.


In [59]:
from typing import Iterable


@retry(stop=stop_after_attempt(3))
def find_best_spangram(theme: str, candidates: Iterable[str]):
    prompt = f"""
    You are given a cryptic clue and a set of candidate phrases. Your task is to find the phrase that fits the clue best. The phrase should be grammatically correct.

    Answer only with the exact text of the best candidate.

    <clue>{theme}</clue>

    <candidates>
    {"\n".join(f"<candidate>{cand.lower()}</candidate>" for cand in candidates)}
    </candidates>
    """.strip()
    response = client.models.generate_content(
        model="gemini-2.5-flash", contents=[prompt]
    )
    assert response.text
    assert response.usage_metadata
    print(f"Thinking tokens: {response.usage_metadata.thoughts_token_count}")
    for cand in candidates:
        if cand.lower() == response.text.lower():
            return cand
    raise ValueError(
        f"Model did not respond with a valid candidate, received: {response.text!r}"
    )


find_best_spangram(
    "We beg to differ",
    [
        "OPPOSITES RIGID",
        "OPPOSITES ATTRACT",
        "OPPOSITES ENGAGE",
        "OPPOSITES LOVE",
        "OPPOSITES CONTAIN",
    ],
)

Thinking tokens: 2871


'OPPOSITES ATTRACT'

In [None]:
from strands_solver.grid_coverer import GridCoverer
from strands_solver.word_finder import WordFinder
from strands_solver.solver import Solver
from strands_solver.spangram_finder import SpangramFinder
from strands_solver.main import get_puzzle

puzzle = get_puzzle("2025-12-01")

solver = Solver(
    finder=WordFinder(puzzle.grid),
    coverer=GridCoverer(puzzle.grid),
    spangram_finder=SpangramFinder(puzzle.grid, num_words=puzzle.num_words),
)
solutions = solver.solve()

2025-12-12 20:27:28,088 [INFO] HTTP Request: GET https://www.nytimes.com/svc/strands/v2/2025-12-01.json "HTTP/1.1 200 OK"
2025-12-12 20:27:28,089 [INFO] Loading dictionary
2025-12-12 20:27:28,179 [INFO] Loaded 172822 words
2025-12-12 20:27:28,180 [INFO] Finding words
2025-12-12 20:27:28,207 [INFO] Found 869 words
2025-12-12 20:27:28,207 [INFO] Covering grid
2025-12-12 20:27:32,248 [INFO] Found 2698 covers
2025-12-12 20:27:32,249 [INFO] Finding spangrams
2025-12-12 20:27:34,222 [INFO] Found 3879 solutions with spangrams


In [None]:
spangrams = set[str]()
for solution in solutions:
    spangram = " ".join(strand.string for strand in solution.spangram)
    spangrams.add(spangram)
len(spangrams)

712

In [None]:
sorted(spangrams, key=lambda spangram: len(spangram.split()))
# why is HOCKEY RINK not included?

['ZEALS HOCK PUCKA REINK',
 'BODY REINK GOAL HOCK',
 'BODY REINK GOALS HOCK',
 'ROBIN KORE GALS HOCK',
 'ZEALS HOCK AMBO REINK',
 'ZEAL SHOCK PUCKA REINK',
 'REINK ORGY AMBO CONSIST',
 'ROBE GOAL HOCK PUCK',
 'ROBE RAZES HOCK PUCK',
 'TICK SINS COEMBODY REINK',
 'BODY REINK GOAL SHOCK',
 'BOAST SINS COEMBODY REINK',
 'COEMBODY REINK GOAL HOCKS',
 'SHOCK AMBO REINK ORGY',
 'BRINK GOAL HOCK PUCE',
 'ZEKS HOCK PUCES SKAT',
 'COEMBODY REINK GOAL SHOCK',
 'REINK GOALS HOCK PUCK',
 'HOCK AMBO REINK ORGY',
 'REINK ORGY ZEAL HOCKS',
 'COEMBODY REINK GOAL HOCK',
 'HOCK PUCK AMBO REINK',
 'GOAL HOCK PUCES SKAT',
 'HOCK PUCES SKAT NICKS',
 'RICK SINS COEMBODY REINK',
 'SHOCK PUCK AMBO REINK',
 'BODY REINK GOAL HOCKS',
 'REINK GOAL HOCK PUCK',
 'ROBE RINK GOAL HOCKS',
 'BRINK GOAL HOCK PUCES',
 'PUCKA REINK ORGY ZEAL',
 'REINK ORGY ZEAL SHOCK',
 'ROBE GOAL SHOCK PUCK',
 'COEMBODY REINK GOALS HOCK',
 'HOCK PUCKA REINK ORGY',
 'ZEKS HOCK PUCES TASK',
 'ROBE RINK GOALS HOCK',
 'PUCK AMBO REINK ORGY',

In [63]:
find_best_spangram("We beg to differ", spangrams)

Thinking tokens: 1378


'OPPOSITES ATTRACT'

## Embedder playground


In [1]:
from strands_solver.embedder import Embedder

embedder = Embedder()

In [None]:
# Embed some words via API (cached=False)
test_words = ["hello", "world", "python", "embedding"]
result = await embedder.get_embeddings(test_words, cached=False)

print(f"Got {len(result)} embeddings")
for word, vec in result.items():
    print(f"  {word}: dim={len(vec)}, first 5 values={vec[:5]}")

Got 4 embeddings
  hello: dim=3072, first 5 values=[-0.03452844, 0.009059422, 0.007681673, -0.057083048, -0.029342607]
  world: dim=3072, first 5 values=[-0.018003238, -0.0058756433, 0.025875887, -0.060153086, -0.017594127]
  python: dim=3072, first 5 values=[-0.0119025335, -0.0009590619, 0.007020803, -0.06769121, -0.019711033]
  embedding: dim=3072, first 5 values=[-0.017193511, 0.006651974, -0.004125737, -0.079775475, -0.00868848]


In [6]:
# Store embeddings in SQLite
embedder.store_embeddings(result)
print("Stored embeddings in DB")

Stored embeddings in DB


In [None]:
# Retrieve from cache (cached=True)
cached = await embedder.get_embeddings(["hello", "python"], cached=True)

print("Retrieved from cache:")
for word, vec in cached.items():
    print(f"  {word}: dim={len(vec)}, first 5 values={vec[:5]}")

Retrieved from cache:
  hello: dim=3072, first 5 values=[-0.03452844172716141, 0.009059421718120575, 0.007681672926992178, -0.057083047926425934, -0.02934260666370392]
  python: dim=3072, first 5 values=[-0.01190253347158432, -0.0009590619010850787, 0.007020803168416023, -0.06769120693206787, -0.019711032509803772]


In [None]:
# Try to get missing embedding from cache (raises KeyError)
try:
    await embedder.get_embeddings(["not_in_cache"], cached=True)
except KeyError as e:
    print(f"Expected error: {e}")

Expected error: "Embedding not found in cache: 'not_in_cache'"


In [9]:
# Verify cached values match original (roundtrip check)
import numpy as np

for word in ["hello", "python"]:
    original = result[word]
    from_cache = cached[word]
    # float32 serialization loses some precision
    assert np.allclose(original, from_cache, rtol=1e-5)

print("Roundtrip check passed!")

Roundtrip check passed!


In [None]:
# Close DB connection when done
embedder.close()

### Inter-word similarity


In [43]:
from pathlib import Path

# Read all solution files from the out/ directory
out_dir = Path("../out/2025-10-03")
solution_files = sorted(out_dir.glob("*.solution.*.txt"))

solutions = []
for solution_file in solution_files:
    solution_text = solution_file.read_text()
    # Parse the solution using eval (since it's a repr of a Solution object)
    solution = eval(solution_text)
    solutions.append(solution)

print(f"Read {len(solutions)} solutions from {len(solution_files)} files")
if solutions:
    print(f"\nFirst solution:")
    print(f"  Spangram: {' '.join(strand.string for strand in solutions[0].spangram)}")
    print(f"  Non-spangram strands: {len(solutions[0].non_spangram_strands)}")


Read 22180 solutions from 22180 files

First solution:
  Spangram: MANGE ARCH BIDE DEAR FLEA
  Non-spangram strands: 6


In [36]:
min_spangram_len = min(len(s.spangram) for s in solutions)
print(f"Minimum spangram length among solutions: {min_spangram_len}")

solutions = [s for s in solutions if len(s.spangram) == min_spangram_len]
print(f"Number of solutions with minimum spangram length: {len(solutions)}")

Minimum spangram length among solutions: 1
Number of solutions with minimum spangram length: 4


In [44]:
from itertools import combinations
import json
import numpy as np
from strands_solver.embedder import Embedder


def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    arr1 = np.array(vec1)
    arr2 = np.array(vec2)
    norm1 = np.linalg.norm(arr1)
    norm2 = np.linalg.norm(arr2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return float(np.dot(arr1, arr2) / (norm1 * norm2))


embedder = Embedder()

# Load puzzle theme
with open("../puzzles/2025-10-03.json", "r") as f:
    puzzle_data = json.load(f)
theme = puzzle_data["theme"]
print(f"Theme: {theme}")

# Collect all unique words across all solutions
all_words = set()
for solution in solutions:
    for strand in solution.spangram:
        all_words.add(strand.string)
    for strand in solution.non_spangram_strands:
        all_words.add(strand.string)

print(f"Total unique words across all solutions: {len(all_words)}")

# Get embeddings from cache for words
embeddings = await embedder.get_embeddings(list(all_words), cached=True)
print(f"Loaded {len(embeddings)} word embeddings from cache")

# Embed theme (not cached) and add to embeddings dict
theme_emb_dict = await embedder.get_embeddings([theme], cached=False)
embeddings[theme] = theme_emb_dict[theme]
print("Embedded theme via API")

# Calculate cosine similarity stats per solution (including theme as a word)
results = []
for i, solution in enumerate(solutions):
    # Get all words in this solution + theme
    words = [theme]
    words += [strand.string for strand in solution.spangram]
    words += [strand.string for strand in solution.non_spangram_strands]

    # Calculate cosine similarity for all pairs (including theme)
    similarities = []
    for w1, w2 in combinations(words, 2):
        sim = cosine_similarity(embeddings[w1], embeddings[w2])
        similarities.append(sim)

    if similarities:
        results.append(
            {
                "solution_idx": i,
                "num_words": len(words),
                "num_pairs": len(similarities),
                "min": min(similarities),
                "max": max(similarities),
                "avg": sum(similarities) / len(similarities),
            }
        )

# Display results
df_results = pd.DataFrame(results)
df_results.sort_values("avg", ascending=False)

Theme: Who's in charge?
Total unique words across all solutions: 250
Loaded 250 word embeddings from cache
Embedded theme via API


Unnamed: 0,solution_idx,num_words,num_pairs,min,max,avg
4328,4328,8,28,0.834432,0.933726,0.886434
10208,10208,8,28,0.834432,0.933726,0.886434
18486,18486,9,36,0.788041,0.933726,0.873740
2123,2123,9,36,0.788041,0.933726,0.873740
18577,18577,8,28,0.781865,0.911312,0.868893
...,...,...,...,...,...,...
13113,13113,11,55,0.765797,0.850070,0.811080
8188,8188,11,55,0.765797,0.848864,0.810930
19966,19966,11,55,0.765797,0.848864,0.810930
7136,7136,11,55,0.765797,0.848864,0.810231


In [45]:
# Find best solution by average similarity
best_idx = df_results.loc[df_results["avg"].idxmax(), "solution_idx"]
best_solution = solutions[best_idx]

print(f"Best solution (index {best_idx}):")
print(f"  Spangram: {' '.join(strand.string for strand in best_solution.spangram)}")
print(
    f"  Non-spangram: {', '.join(strand.string for strand in best_solution.non_spangram_strands)}"
)

# Get all words in this solution
words = [theme]
words += [strand.string for strand in best_solution.spangram]
words += [strand.string for strand in best_solution.non_spangram_strands]

# Calculate all pairwise similarities
pairs = []
for w1, w2 in combinations(words, 2):
    sim = cosine_similarity(embeddings[w1], embeddings[w2])
    pairs.append({"word1": w1, "word2": w2, "similarity": sim})

# Display sorted by similarity
df_pairs = (
    pd.DataFrame(pairs)
    .sort_values("similarity", ascending=False)
    .reset_index(drop=True)
)
print(f"\nAll {len(df_pairs)} word pairs sorted by similarity:")
df_pairs


Best solution (index 4328):
  Spangram: LEADERSHIP
  Non-spangram: SUPERVISOR, MANAGER, BOSS, CHIEF, HEAD, DIRECTOR

All 28 word pairs sorted by similarity:


Unnamed: 0,word1,word2,similarity
0,SUPERVISOR,MANAGER,0.933726
1,MANAGER,BOSS,0.918473
2,MANAGER,DIRECTOR,0.911707
3,LEADERSHIP,CHIEF,0.911312
4,LEADERSHIP,SUPERVISOR,0.909379
5,LEADERSHIP,MANAGER,0.909295
6,SUPERVISOR,BOSS,0.908978
7,LEADERSHIP,BOSS,0.906313
8,BOSS,CHIEF,0.906054
9,SUPERVISOR,DIRECTOR,0.905057


In [42]:
# Calculate pairwise similarities between ALL words
pairs = []
for w1, w2 in combinations(all_words, 2):
    sim = cosine_similarity(embeddings[w1], embeddings[w2])
    pairs.append({"word1": w1, "word2": w2, "similarity": sim})

df_pairs = (
    pd.DataFrame(pairs)
    .sort_values("similarity", ascending=False)
    .reset_index(drop=True)
)
print(f"\nAll {len(df_pairs)} word pairs sorted by similarity:")
df_pairs



All 36 word pairs sorted by similarity:


Unnamed: 0,word1,word2,similarity
0,MANAGE,MANAGER,0.93703
1,SUPERVISOR,MANAGER,0.933726
2,BOSS,MANAGER,0.918473
3,DIRECTOR,MANAGER,0.911707
4,CHIEF,LEADERSHIP,0.911312
5,SUPERVISOR,LEADERSHIP,0.909379
6,LEADERSHIP,MANAGER,0.909295
7,SUPERVISOR,BOSS,0.908978
8,BOSS,LEADERSHIP,0.906313
9,CHIEF,BOSS,0.906054
