<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Embeddings-and-Analogies/GloVe-Analogy-Solver/Glove_Analogy_Solver.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
!pip install -q gensim

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import gensim.downloader as api
from sklearn.preprocessing import normalize

# Load Pre-Trained GloVe

In [3]:
# Load the pre-trained GloVe embeddings through gensim's downloader API.
# The identifier is kept in one place so the log line and the load call
# cannot drift apart.
GLOVE_MODEL_NAME = "glove-wiki-gigaword-100"

print(f"Loading {GLOVE_MODEL_NAME}...")
model = api.load(GLOVE_MODEL_NAME)
print("Dataset downloaded successfully...")

Loading glove-wiki-gigaword-100...
Dataset downloaded successfully...


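# Vectorized Analogy Search

Every embedding is normalized to unit length once, up front, so that a single matrix–vector product gives the cosine similarity between the query vector and every word in the vocabulary.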

In [4]:
print("Pre-computing normalized weights for speed...")

# Raw embedding matrix taken straight from the loaded model.
word_vectors = model.vectors  # Shape: (400000, 100)

# Vocabulary list; find_analogy indexes it with row positions of the matrix.
all_words = model.index_to_key

# Scale every row to unit L2 length once, so that a later dot product with a
# unit-length query vector is directly the cosine similarity.
normalized_vectors = normalize(word_vectors, axis=1, norm="l2")

def find_analogy(a, b, c, top_n=1):
    """
    Solves the analogy A:B :: C:D (A is to B as C is to D).

    Formula: D = argmax( cos(v_D, v_B - v_A + v_C) )

    The search is a single matrix-vector product against the module-level
    ``normalized_vectors`` matrix; the three query words are never returned.

    Args:
        a, b, c: Words forming the analogy "a is to b as c is to ?". Each must
            be present in the module-level ``model``.
        top_n: Maximum number of candidates to return. Values <= 0 yield an
            empty list; values larger than the number of remaining candidates
            are clamped.

    Returns:
        list[tuple[str, float]]: ``(word, cosine_similarity)`` pairs sorted by
        decreasing similarity, or
        str: an ``"Error: ..."`` message when a query word is missing from the
        vocabulary or the analogy vector has zero length.
    """
    query_words = (a, b, c)

    # Check if words exist in vocabulary
    for word in query_words:
        if word not in model:
            return f"Error: '{word}' not found in vocabulary."

    # Asking for zero (or a negative number of) results is an empty answer.
    # Without this guard `[-0:]` would slice the *entire* vocabulary.
    if top_n <= 0:
        return []

    # Calculate target vector: v_B - v_A + v_C
    target_vec = model[b] - model[a] + model[c]

    # Normalize target vector for cosine similarity calculation. A zero-length
    # target has no direction, so cosine similarity is undefined; report it
    # instead of dividing by zero and ranking NaNs.
    target_norm = np.linalg.norm(target_vec)
    if target_norm == 0:
        return (
            f"Error: analogy vector for '{a}':'{b}' :: '{c}' has zero length; "
            "cosine similarity is undefined."
        )
    target_vec = target_vec / target_norm

    # VECTORIZED SEARCH:
    # Dot product of normalized_vectors (vocab, dim) and target_vec (dim,)
    # gives one similarity score per vocabulary word.
    similarities = np.dot(normalized_vectors, target_vec)

    # Apply the constraint: output cannot be A, B, or C.
    # A set is used because the same word may appear more than once.
    excluded = {model.key_to_index[word] for word in query_words}
    similarities[list(excluded)] = -np.inf

    # Never return more results than there are real candidates, otherwise the
    # excluded words would leak back in with a score of -inf.
    k = min(top_n, similarities.shape[0] - len(excluded))
    if k <= 0:
        return []

    # Select the k best scores without sorting the whole vocabulary, then
    # order just those k by decreasing similarity.
    top_unsorted = np.argpartition(similarities, -k)[-k:]
    best_indices = top_unsorted[np.argsort(similarities[top_unsorted])[::-1]]

    return [(all_words[idx], similarities[idx]) for idx in best_indices]

Pre-computing normalized weights for speed...


# Test

In [7]:
# Each entry is (a, b, c), read as "a is to b as c is to ?".
test_cases = [
    ("man", "doctor", "woman"),
    ("japan", "sushi", "germany"),
    ("scientist", "einstein", "painter"),
    ("tall", "tallest", "short"),
    ("france", "paris", "italy"),
]

# Header for the results table.
banner = "=" * 40
print(banner)
print("\tAnalogy Results")
print(banner)

for a, b, c in test_cases:
    prediction = find_analogy(a, b, c)

    # find_analogy reports failures as a plain message instead of a list.
    if not isinstance(prediction, list):
        print(prediction)
        continue

    # Only the best-scoring candidate is shown.
    best_word, best_score = prediction[0]
    print(f"{a}:{b} :: {c}:?  =>  {best_word} (Score: {best_score:.4f})")

	Analogy Results
man:doctor :: woman:?  =>  nurse (Score: 0.7757)
japan:sushi :: germany:?  =>  pastry (Score: 0.5280)
scientist:einstein :: painter:?  =>  picasso (Score: 0.6477)
tall:tallest :: short:?  =>  longest (Score: 0.6133)
france:paris :: italy:?  =>  rome (Score: 0.8084)
