# Word Vector Exploration
In this notebook, we will explore word vectors from Spacy's `en_core_web_sm` model.  We will first
load these word vectors into a vector database and then use the database to find to top-k most
similar words.

After that, we will explore analogies using the word vectors, an interesting application that is often
used to evaluate the quality of word vectors and showcase biases in the data.

Finally, we will plot the word vectors using t-SNE to visualize the word vectors in 2D space.

In [1]:
import faiss
import numpy as np
import spacy
from tqdm import tqdm
import plotly.graph_objects as go
from sklearn.manifold import TSNE

# Load the spaCy model
# python -m spacy download en_core_web_lg
nlp = spacy.load('en_core_web_lg')

## Explore Spacy's Word Vectors

In [2]:
banana = nlp("banana")
fruit = nlp("fruit")
table = nlp("table")
print(f'Similarity between banana and fruit: {banana.similarity(fruit)}')
print(f'Similarity between banana and table: {banana.similarity(table)}')

Similarity between banana and fruit: 0.6650428369389225
Similarity between banana and table: 0.20632129046587597


## Find Similar Words

In [3]:
def create_faiss_index(nlp):
    """
    Create a FAISS index for the word vectors in the given spaCy model and store words as metadata.

    Args:
        nlp: The spaCy language model.

    Returns:
        faiss_index: The FAISS index of word vectors.
        word_list: List of words corresponding to the vectors in the FAISS index.
    """
    # Filter the words to only those that have vectors
    words = [word for word in tqdm(nlp.vocab.strings, "Loading all words") if nlp.vocab.has_vector(word)]

    # Create a matrix to hold all vectors of words that have one
    vectors = np.zeros((len(words), nlp.vocab.vectors_length), dtype='float32')
    
    for i, word in enumerate(tqdm(words, "Loading all vectors")):
        vectors[i] = nlp.vocab.get_vector(word)
    
    # Normalize the vectors (important for cosine similarity)
    faiss.normalize_L2(vectors)
    
    # Create a flat, L2 FAISS index
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    
    return index, words

# Create the FAISS index and word list
faiss_index, words = create_faiss_index(nlp)

Loading all words: 100%|██████████| 776469/776469 [00:06<00:00, 127025.35it/s]
Loading all vectors: 100%|██████████| 514092/514092 [00:01<00:00, 448628.16it/s]


In [4]:
def find_similar_words(faiss_index, words, query, nlp, topn=10):
    """
    Find the most similar words to the query word using a FAISS index.

    Args:
        faiss_index: The FAISS index where word vectors are stored.
        words: The list of words that corresponds to the vectors in the FAISS index.
        query: The query word to find similarities for.
        nlp: The loaded spaCy language model.
        topn: The number of similar words to return.

    Returns:
        A list of the most similar words with their similarity scores.
    """
    # Get the vector for the query word
    query_vector = nlp.vocab.get_vector(query).reshape(1, -1)
    
    # Normalize the query vector
    faiss.normalize_L2(query_vector)
    
    # Search the index for similar vectors
    distances, indices = faiss_index.search(query_vector, topn)
    
    # Return the words corresponding to the indices
    return [(words[idx], 1 - distances[0][i]) for i, idx in enumerate(indices[0])]

In [5]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'dog', nlp, topn=10)
print('Top 10 words most similar to "dog":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "dog":
1: dog (1.00)
2: dogs (0.67)
3: cat (0.64)
4: puppy (0.62)
5: pet (0.57)
6: pup (0.49)
7: canine (0.44)
8: wolfdogs (0.42)
9: dogsled (0.41)
10: uppy (0.40)


In [6]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'cheese', nlp, topn=10)
print('Top 10 words most similar to "cheese":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "cheese":
1: cheese (1.00)
2: cheesed (0.90)
3: cheesey (0.89)
4: cheeses (0.81)
5: headcheese (0.76)
6: cheesesteak (0.72)
7: -St (0.71)
8: cheesesteaks (0.70)
9: -Cheese (0.65)
10: theese (0.63)


In [7]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'baseball', nlp, topn=10)
print('Top 10 words most similar to "baseball":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "baseball":
1: baseball (1.00)
2: D1Baseball (0.74)
3: baseballs (0.71)
4: Cueball (0.70)
5: theball (0.69)
6: softball (0.68)
7: skeeball (0.65)
8: nutball (0.64)
9: Greaseball (0.62)
10: Dodgeball (0.60)


## Explore Analogies

In [8]:
def find_analogy(faiss_index, words, word1, word2, word3, nlp):
    """
    Compute the analogy of the form word1:word2 as word3:? using a FAISS index.

    This function finds the word that best completes the analogy given three words.
    It uses vector arithmetic (word3 - word1 + word2) to find the target vector and then
    searches for the closest word vector in the FAISS index.

    Args:
        faiss_index: The FAISS index where word vectors are stored.
        words: The list of words that corresponds to the vectors in the FAISS index.
        word1: The first word in the analogy.
        word2: The second word in the analogy.
        word3: The third word in the analogy.
        nlp: The loaded spaCy language model.

    Returns:
        The computed word that completes the analogy.
    """
    # Compute vectors for the given words
    vec1 = nlp.vocab.get_vector(word1).reshape(1, -1)
    vec2 = nlp.vocab.get_vector(word2).reshape(1, -1)
    vec3 = nlp.vocab.get_vector(word3).reshape(1, -1)

    # Compute the target vector for the analogy
    target_vector = vec3 - vec1 + vec2

    # Normalize the target vector
    faiss.normalize_L2(target_vector)

    # Search the index for the vector closest to the target vector
    _, indices = faiss_index.search(target_vector, 4)

    # Filter out the input words
    indices = [idx for idx in indices[0] if words[idx] not in [word1, word2, word3]]

    return words[indices[0]]

In [9]:
find_analogy(faiss_index, words, 'king', 'man', 'queen', nlp)

'woman--'

In [10]:
find_analogy(faiss_index, words, 'rain', 'raincoat', 'snow', nlp)

'snowshoes'

In [11]:
find_analogy(faiss_index, words, 'coffee', 'morning', 'tea', nlp)

'midafternoon'

## Visualize Word Vectors

In [12]:
def create_2d_tsne_plot_multi_queries_colored(faiss_index, words, query_words, nlp, topn=10):
    """
    Create a 2D t-SNE plot of the topn similar words to multiple query words, with each query word's similar
    words colored differently.

    Args:
        faiss_index: The FAISS index of word vectors.
        words: List of words corresponding to the vectors in the FAISS index.
        query_words: A list of words to center the plot on.
        nlp: The loaded spaCy language model.
        topn: Number of similar words to include in the plot for each query word.

    Returns:
        A Plotly figure object.
    """
    all_similar_words = []
    word_labels = []
    colors = []
    color_map = {}  # Map each query word to a unique color
    available_colors = ['red', 'green', 'blue', 'orange', 'purple', 'yellow', 'cyan', 'magenta', 'lime', 'pink']

    # Assign a unique color to each query word
    for i, query_word in enumerate(query_words):
        color_map[query_word] = available_colors[i % len(available_colors)]

    for query_word in query_words:
        similar_words = find_similar_words(faiss_index, words, query_word, nlp, topn=topn)
        for word, _ in similar_words:
            if word not in word_labels:  # Ensure unique words
                all_similar_words.append(nlp.vocab.get_vector(word))
                word_labels.append(word)
                colors.append(color_map[query_word])

    if not all_similar_words:
        raise ValueError("No similar words found for the given queries.")

    # Extract the vectors for these words
    vectors = np.array(all_similar_words)
    
    # Run t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    reduced_vectors = tsne.fit_transform(vectors)
    
    # Plot
    fig = go.Figure(data=go.Scatter(
        x=reduced_vectors[:,0],
        y=reduced_vectors[:,1],
        mode='markers+text',
        text=word_labels,
        marker=dict(
            size=8,
            color=colors,  # Apply the color mapping
            opacity=0.8,
        ),
        textposition="top center"
    ))
    
    # Customize layout
    title_query = ", ".join(query_words)
    fig.update_layout(title=f"2D t-SNE visualization for {title_query}", margin=dict(l=0, r=0, b=0, t=0), xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))

    fig.update_layout(
        margin=dict(l=40, r=40, b=40, t=40),  # Adjust these values as needed to fit the labels
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),  # Ensure tick labels are shown if desired
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )
 
    return fig

query_words = ["king", "vehicle", "apple", "coffeetable"]  # Specify your query words here
fig = create_2d_tsne_plot_multi_queries_colored(faiss_index, words, query_words, nlp, topn=50)
fig.show()
