In [1]:
import heapq

import numpy as np
from numpy import dot
from numpy.linalg import norm

In [2]:
def load_glove_model(File):
    """
    Load the GloVe model from a file.

    Each non-blank line is expected to hold a word followed by its
    whitespace-separated vector components. Blank or whitespace-only lines
    (for example a trailing empty line) are skipped instead of crashing the
    load. If a word appears more than once, the last occurrence wins.

    Args:
        File (str): Path to the GloVe embeddings file.

    Returns:
        dict: A dictionary mapping words to their embedding vectors
            (1-D ``np.float32`` arrays).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a number.
    """
    print("Loading GloVe Model")
    glove_model = {}
    with open(File, 'r', encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Nothing on this line: indexing split_line[0] would raise
                # IndexError, so skip it.
                continue
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float32)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model

In [3]:
def cosine_similarity(word1, word2, glove_vectors):
    """
    Compute the cosine similarity between two words using their GloVe vectors.

    Args:
        word1 (str): First word.
        word2 (str): Second word.
        glove_vectors (dict): Dictionary of GloVe vectors.

    Returns:
        float or None: Cosine similarity between word1 and word2. Returns 0.0
            when either vector has zero norm (the cosine is undefined there),
            or None if a word is not found.
    """
    if word1 not in glove_vectors or word2 not in glove_vectors:
        return None

    vec1 = glove_vectors[word1]
    vec2 = glove_vectors[word2]
    denominator = norm(vec1) * norm(vec2)
    if denominator == 0:
        # An all-zero vector has no direction; dividing would produce NaN
        # together with a RuntimeWarning, so report "no similarity" instead.
        return 0.0
    return dot(vec1, vec2) / denominator

In [4]:
def find_most_similar(word, glove_vectors, top_n=5):
    """
    Find the top-N most similar words to a given word using cosine similarity.

    The query vector and its norm are computed once and reused for every
    candidate. Only the ``top_n`` best candidates are kept, so the full
    vocabulary is never sorted. Candidates with equal similarity keep the
    iteration order of ``glove_vectors``.

    Args:
        word (str): The word to find similar words for.
        glove_vectors (dict): Dictionary of GloVe vectors.
        top_n (int): Number of most similar words to return.

    Returns:
        list or None: List of tuples (word, similarity) of the top-N most
            similar words in descending order of similarity, or None if the
            word is not found. An empty list is returned when ``top_n`` is
            not positive. A pair involving a zero-norm vector is given
            similarity 0.0, because its cosine is undefined.
    """
    if word not in glove_vectors:
        return None
    if top_n <= 0:
        # Slicing with a negative count would silently return
        # "all but the last n" entries; an empty result is the sane answer.
        return []

    word_vec = glove_vectors[word]
    word_norm = norm(word_vec)  # loop-invariant: compute once

    def _similarity(other_vec):
        # Cosine similarity against the query vector; 0.0 when undefined so
        # NaN values never reach the ranking step.
        denominator = word_norm * norm(other_vec)
        if denominator == 0:
            return 0.0
        return dot(word_vec, other_vec) / denominator

    candidates = (
        (other_word, _similarity(other_vec))
        for other_word, other_vec in glove_vectors.items()
        if other_word != word
    )
    # Same ordering as a full descending sort followed by [:top_n], without
    # sorting the entire vocabulary.
    return heapq.nlargest(top_n, candidates, key=lambda item: item[1])

In [5]:
if __name__ == '__main__':
    # Read the embeddings from the GloVe text file into a word -> vector dict.
    glove_vectors = load_glove_model('glove.6B.50d.txt')

    # Report the pairwise cosine similarity for a few hand-picked word pairs.
    pairs = [("cat", "dog"), ("car", "bus"), ("apple", "banana")]
    for word1, word2 in pairs:
        similarity = cosine_similarity(word1, word2, glove_vectors)
        if similarity is None:
            # At least one word of the pair has no embedding.
            print(f"One of the words '{word1}' or '{word2}' is not in the vocabulary.")
            continue
        print(f"Cosine similarity between '{word1}' and '{word2}': {similarity:.4f}")

    # List the nearest neighbours (default top_n) of each probe word.
    words_to_check = ["king", "computer", "university"]
    for word in words_to_check:
        similar_words = find_most_similar(word, glove_vectors)
        if similar_words is None:
            # The probe word itself has no embedding.
            print(f"The word '{word}' is not in the vocabulary.")
            continue
        print(f"\nTop 5 most similar words to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"{similar_word}: {similarity:.4f}")


Loading GloVe Model
400000 words loaded!
Cosine similarity between 'cat' and 'dog': 0.9218
Cosine similarity between 'car' and 'bus': 0.8211
Cosine similarity between 'apple' and 'banana': 0.5608

Top 5 most similar words to 'king':
prince: 0.8236
queen: 0.7839
ii: 0.7746
emperor: 0.7736
son: 0.7667

Top 5 most similar words to 'computer':
computers: 0.9165
software: 0.8815
technology: 0.8526
electronic: 0.8126
internet: 0.8060

Top 5 most similar words to 'university':
college: 0.8745
harvard: 0.8711
yale: 0.8567
graduate: 0.8553
institute: 0.8484
