In [1]:
%reload_ext dotenv
%dotenv ../.env

In [13]:
import sys
from pathlib import Path

project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [7]:
import pandas as pd

In [3]:
from google import genai

client = genai.Client()

response = client.models.generate_content(
    model="gemini-2.5-flash", contents="Explain how AI works in a few words"
)
print(response.text)

AI identifies patterns in vast data to learn, predict, and make decisions.


In [14]:
import json

from finder import Finder

# Load puzzle from JSON
with open('../puzzles/2025-10-03.json', 'r') as f:
    puzzle_data = json.load(f)

theme = puzzle_data['theme']
grid = puzzle_data['puzzle']

print(f"Theme: {theme}")
print(f"Grid size: {len(grid)}x{len(grid[0])}")

# Find all words in the grid
finder = Finder(grid)
found_strands = finder.find_all_words()

# Create list with theme first, then all found words
words = [theme] + [strand.string for strand in found_strands]
print(f"\nTotal items to embed: {len(words)}")

Theme: Who's in charge?
Grid size: 8x6

Total items to embed: 1295


In [17]:
from google import genai
from google.genai.types import EmbedContentConfig
from tenacity import retry, stop_after_attempt, wait_fixed

client = genai.Client()

# Retry on any exception, wait 60 seconds between retries, max 5 attempts
@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(60),
    before_sleep=lambda retry_state: print(
        f"Error occurred: {retry_state.outcome.exception() if retry_state.outcome else "unkown"}. "
        f"\nWaiting 60 seconds before retry {retry_state.attempt_number}..."
    )
)
def embed_batch(batch):
    response = client.models.embed_content(
        model="gemini-embedding-001",
        contents=batch,
        config=EmbedContentConfig(
            task_type="SEMANTIC_SIMILARITY",
        ),
    )
    return response

# Process words in batches of 100
batch_size = 100
embeddings = []

for i in range(0, len(words), batch_size):
    batch = words[i : i + batch_size]
    response = embed_batch(batch)
    assert response.embeddings
    for emb in response.embeddings:
        assert emb.values
        embeddings.append(emb.values)
    print(f"Processed batch {i // batch_size + 1}: {len(batch)} words")

print(f"Total embeddings: {len(embeddings)}")

Processed batch 1: 100 words
Error occurred: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.\n* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/embed_content_free_tier_requests', 'quotaId': 'EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier'}]}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}]}}. Waiting 60 seconds before retry 1...
Processed batch 2: 100 words
Error occurred: 429 RESOURCE_EXHAUSTED. {'error': {'code': 429, 'message': 'You excee

RetryError: RetryError[<Future at 0x11e371350 state=finished raised ClientError>]

In [37]:
from itertools import zip_longest

import numpy as np


def cosine_similarity(vec1: list[float], vec2: list[float]) -> float:
    """Calculate cosine similarity between two 1D numpy arrays."""
    arr1 = np.array(vec1)
    arr2 = np.array(vec2)
    norm1 = np.linalg.norm(arr1)
    norm2 = np.linalg.norm(arr2)
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return np.dot(arr1, arr2) / (norm1 * norm2)


theme_similarity = [cosine_similarity(e, embeddings[0]) for e in embeddings]

pd.DataFrame(
    list(zip_longest(words, embeddings, theme_similarity, fillvalue=np.nan)),
    columns=np.array(["word", "embedding", "theme_similarity"]),
).sort_values("theme_similarity", ascending=False)[0:40]

Unnamed: 0,word,embedding,theme_similarity
0,Who's in charge?,"[-0.027346142, 0.0026793897, 0.009290586, -0.0...",1.0
770,LEADERSHIP,"[-0.014990592, 0.0076978216, 0.018945364, -0.0...",0.896096
768,LEADER,"[-0.01748339, 0.009356349, 0.021340456, -0.066...",0.879373
772,LEADER,"[-0.01748339, 0.009356349, 0.021340456, -0.066...",0.879373
769,LEADERS,"[-0.01795223, 0.0013008751, 0.022484446, -0.05...",0.86597
198,MANAGER,"[-0.020130286, -0.00023499256, 0.014827996, -0...",0.863864
767,LEADIER,"[-0.009267567, 0.012589904, 0.01994284, -0.064...",0.8625
766,LEADIER,"[-0.009267567, 0.012589904, 0.01994284, -0.064...",0.8625
197,MANAGE,"[-0.018445294, -0.010931728, 0.014863009, -0.0...",0.852446
200,MANAGE,"[-0.018445294, -0.010931728, 0.014863009, -0.0...",0.852446


In [41]:
from collections import Counter

unique_words = [word for word, count in Counter(words).items() if count == 1]
len(words)

1295