# Word Embedding Search

This notebook loads 7-letter words from our word list and creates embeddings to find semantically similar words.

## Setup
First, we'll install required packages if they're not already installed.


In [1]:
import sentence_transformers
import sklearn
import numpy
import json


  from .autonotebook import tqdm as notebook_tqdm


## 1. Load Word List
Load the words from our common-7-letter-words.txt file

In [11]:
with open('wordLists/7letters.json', 'r') as f:
    words = json.load(f)

print(f"Loaded {len(words)} words.")

Loaded 41997 words.


## 2. Generate Word Embeddings
Use sentence-transformers to generate embeddings for all words


In [12]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize the model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all words
print("Generating embeddings...")
word_embeddings = model.encode(words, show_progress_bar=True)
print("Done!")


Generating embeddings...


Batches: 100%|██████████| 1313/1313 [00:34<00:00, 37.54it/s]


Done!


## 3. Query Function
Create a function to find the most similar words


In [13]:

from sklearn.metrics.pairwise import cosine_similarity

def query_top_k(query, k=5):
    """Find the k most similar words to the query.
    
    Args:
        query (str): The word or phrase to find similar words to
        k (int): Number of similar words to return
        
    Returns:
        list: Top k words and their similarity scores
    """
    query_emb = model.encode([query])
    similarities = cosine_similarity(query_emb, word_embeddings)[0]
    top_k_idx = np.argsort(similarities)[::-1][:k]
    return [(words[i], similarities[i]) for i in top_k_idx]

def query_by_threshold(query, threshold=0.5):
    """Find all words with similarity score greater than threshold.
    
    Args:
        query (str): The word or phrase to find similar words to
        threshold (float): Minimum similarity score to return
        
    Returns:
        list: All words with similarity score greater than threshold
    """
    query_emb = model.encode([query])
    similarities = cosine_similarity(query_emb, word_embeddings)[0]
    # filter out words with similarity score less than threshold
    filtered_words = [word for word, similarity in zip(words, similarities) if similarity > threshold]
    # sort by similarity score
    sorted_words = sorted(filtered_words, key=lambda x: x[1], reverse=True)
    return sorted_words

## 4. Performing queries
Try finding similar words with different queries


In [15]:
# Example queries
threshold = 0.45

themes = [
    "winter",
    "spring", 
    "summer",
    "autumn",
    "breakfast",
    "italian_food",
    "asian_food",
    "desserts",
    "ocean_life",
    "forest_animals",
    "birds",
    "insects",
    "basketball",
    "soccer",
    "swimming",
    "hiking",
    "computers",
    "space",
    "chemistry",
    "biology",
    "painting",
    "music",
    "dance",
    "photography",
    "cities",
    "landmarks",
    "transportation",
    "countries",
    "joy",
    "love",
    "courage",
    "peace",
    "medical",
    "education",
    "construction",
    "cooking",
    "storm",
    "sunny",
    "rainy",
    "snowy",
]

results_by_theme = []
for theme in themes:
    results_by_theme.append(query_by_threshold(theme, threshold=threshold))

unchecked = []
for i in range(len(results_by_theme)):
    unchecked.append({"theme": themes[i], "words": [word for word in results_by_theme[i] if not word.endswith('ing') and not word.endswith('s')]})

# output as a json list
print(json.dumps(unchecked, indent=4))


[
    {
        "theme": "winter",
        "words": [
            "sweater",
            "outdoor",
            "outpour",
            "outside",
            "summery",
            "freezer",
            "frosted",
            "froster",
            "coldest",
            "coldish",
            "coldong",
            "holiday",
            "snowcap",
            "snowdon",
            "snowier",
            "snowily",
            "snowish",
            "snowman",
            "snowmen",
            "clausal",
            "clausum",
            "climate",
            "glacier",
            "wintery",
            "chilled",
            "yearday",
            "weather",
            "icework",
            "october",
            "daytime",
            "darkest",
            "january",
            "warmest",
            "warmish"
        ]
    },
    {
        "theme": "spring",
        "words": [
            "summery",
            "sunrise",
            "springe",
            "springy",
    