# Word Vector Exploration
In this notebook, we will explore word vectors from Spacy's `en_core_web_sm` model.  We will first
load these word vectors into a vector database and then use the database to find to top-k most
similar words.

Next, we will explore analogies using the word vectors, an interesting application that is often
used to evaluate the quality of word vectors and showcase biases in the data.

Finally, we will use the word vectors to cluster words and visualize the clusters using t-SNE.

In [2]:
import faiss
import numpy as np
import time
import spacy
from tqdm import tqdm

# Load the spaCy model
nlp = spacy.load('en_core_web_md')

In [3]:
banana = nlp("banana")
fruit = nlp("fruit")
table = nlp("table")
print(f'Similarity between banana and fruit: {banana.similarity(fruit)}')
print(f'Similarity between banana and table: {banana.similarity(table)}')

Similarity between banana and fruit: 0.6650428369389225
Similarity between banana and table: 0.20632129046587597


In [4]:
def create_faiss_index(nlp):
    """
    Create a FAISS index for the word vectors in the given spaCy model and store words as metadata.

    Args:
        nlp: The spaCy language model.

    Returns:
        faiss_index: The FAISS index of word vectors.
        word_list: List of words corresponding to the vectors in the FAISS index.
    """
    # Filter the words to only those that have vectors
    words = [word for word in tqdm(nlp.vocab.strings, "Loading all words") if nlp.vocab.has_vector(word)]

    # Create a matrix to hold all vectors of words that have one
    vectors = np.zeros((len(words), nlp.vocab.vectors_length), dtype='float32')
    
    for i, word in enumerate(tqdm(words, "Loading all vectors")):
        vectors[i] = nlp.vocab.get_vector(word)
    
    # Normalize the vectors (important for cosine similarity)
    faiss.normalize_L2(vectors)
    
    # Create a flat, L2 FAISS index
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    
    return index, words

# Create the FAISS index and word list
faiss_index, words = create_faiss_index(nlp)

Loading all words: 100%|██████████| 776469/776469 [00:05<00:00, 132611.37it/s]
Loading all vectors: 100%|██████████| 514092/514092 [00:00<00:00, 560132.66it/s]


In [5]:
def find_similar_words(faiss_index, words, query, nlp, topn=10):
    """
    Find the most similar words to the query word using a FAISS index.

    Args:
        faiss_index: The FAISS index where word vectors are stored.
        words: The list of words that corresponds to the vectors in the FAISS index.
        query: The query word to find similarities for.
        nlp: The loaded spaCy language model.
        topn: The number of similar words to return.

    Returns:
        A list of the most similar words with their similarity scores.
    """
    # Get the vector for the query word
    query_vector = nlp.vocab.get_vector(query).reshape(1, -1)
    
    # Normalize the query vector
    faiss.normalize_L2(query_vector)
    
    # Search the index for similar vectors
    distances, indices = faiss_index.search(query_vector, topn)
    
    # Return the words corresponding to the indices
    return [(words[idx], 1 - distances[0][i]) for i, idx in enumerate(indices[0])]

In [6]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'dog', nlp, topn=10)
print('Top 10 words most similar to "dog":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "dog":
1: bichon (1.00)
2: dog (1.00)
3: dog- (1.00)
4: dogsbody (1.00)
5: mastiff (1.00)
6: Hotdogs (0.67)
7: Muckdogs (0.67)
8: Rottweilers (0.67)
9: Saltdogs (0.67)
10: bloodhounds (0.67)


In [7]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'cheese', nlp, topn=10)
print('Top 10 words most similar to "cheese":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "cheese":
1: -St (1.00)
2: Croquette (1.00)
3: Gruyere (1.00)
4: Gruyère (1.00)
5: Parcheesi (1.00)
6: Parmesan (1.00)
7: bizzare (1.00)
8: brie (1.00)
9: cheddar (1.00)
10: cheddars (1.00)


In [8]:
# Find similar words
similar_words = find_similar_words(faiss_index, words, 'baseball', nlp, topn=10)
print('Top 10 words most similar to "baseball":')
for index, (word, score) in enumerate(similar_words):
    print(f'{index+1}: {word} ({score:.2f})')

Top 10 words most similar to "baseball":
1: Cueball (1.00)
2: Curveball (1.00)
3: Dodgeball (1.00)
4: Eyeball (1.00)
5: Hosenball (1.00)
6: Knuckleball (1.00)
7: Pokeball (1.00)
8: Tomball (1.00)
9: baseball (1.00)
10: baseball-almanac.com (1.00)


In [17]:
import spacy
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

def plot_3d_tsne_for_words(word_list):
    """
    Create a 3D t-SNE visualization for the specified list of words using spaCy embeddings.

    Args:
        word_list (list of str): A list of words to visualize.

    Returns:
        A plotly figure object that can be displayed in a Jupyter notebook.
    """
    # Load the spaCy model
    nlp = spacy.load('en_core_web_md')
    
    # Get vectors for the words
    vectors = np.array([nlp.vocab.get_vector(word) for word in word_list if nlp.vocab.has_vector(word)])
    print(vectors)
    
    # Run t-SNE
    tsne = TSNE(n_components=3, random_state=42)
    vectors_reduced = tsne.fit_transform(vectors)
    
    # Create a DataFrame for the t-SNE results
    df = pd.DataFrame(vectors_reduced, columns=['x', 'y', 'z'])
    df['word'] = [word for word in word_list if nlp.vocab.has_vector(word)]
    
    # Create a 3D scatter plot
    fig = px.scatter_3d(df, x='x', y='y', z='z', text='word', hover_data=['word'])
    
    # Improve layout
    fig.update_traces(textposition='top center')
    fig.update_layout(title="3D t-SNE Word Vectors",
                      scene=dict(xaxis_title='t-SNE dimension 1',
                                 yaxis_title='t-SNE dimension 2',
                                 zaxis_title='t-SNE dimension 3'),
                      margin=dict(l=0, r=0, b=0, t=0))
    
    return fig

# Example usage:
# Replace 'your_list_of_words' with the actual words you want to visualize
your_list_of_words = [word for word, _ in find_similar_words(faiss_index, words, 'baseball', nlp, topn=20)]
fig = plot_3d_tsne_for_words(your_list_of_words)
fig.show()


[[-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]
 [-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]
 [-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]
 ...
 [-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]
 [-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]
 [-3.1401   4.0082  -1.2984  ...  1.9019  -0.95315 -0.49466]]


ValueError: perplexity must be less than n_samples