# Evaluate Word2Vec Embeddings

**Goal:** Load the trained CBOW model state and vocabulary to evaluate the quality of the learned word embeddings.

**Evaluations:**
1.  Find nearest neighbors for given words.
2.  Perform word analogy tasks (e.g., king - man + woman = queen).

## ⚙️ Setup and Imports

In [1]:
import torch
import torch.nn as nn
import numpy as np
import os
import sys

# Add project root to path to allow importing utils and src
# Adjust path if your notebook is not in 'notebooks/' or project root differs
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    print(f"Appended project root: {project_root}")

# Import project modules
from utils import logger, get_device
from src.word2vec.vocabulary import Vocabulary
from src.word2vec.model import CBOW

# Configure display
%matplotlib inline 
# Potentially configure pandas display options if needed
import pandas as pd
pd.options.display.max_rows = 100

Appended project root: /Users/Oks_WORKSPACE/Desktop/DEV/W1_project/Dropout_Disco
⚙️  Configuring DropoutDisco logging...
  Logger 'DropoutDisco' level set to: INFO
  ✅ Console logging handler added.
  ✅ File logging handler added: logs/dropout_disco.log
2025-04-15 16:20:46 | DropoutDisco | INFO     | [logging.py:102] | 🎉 Logging system initialized successfully!
--- Running src/__init__.py ---


## 💾 Load Vocabulary and Model

In [2]:
import torch
import torch.nn as nn
import numpy as np
import os
import sys

# Ensure project root is added to path (assuming notebook is in 'notebooks/')
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)
    # print(f"Appended project root: {project_root}") # Optional print

# Import project modules
from utils import logger, get_device
from src.word2vec.vocabulary import Vocabulary
from src.word2vec.model import CBOW

# Helper function (copied from train_word2vec.py or import if utils)
def format_num_words(num_words):
    """Formats large numbers for filenames (e.g., 10M, 500k, All)."""
    if num_words == -1: return "All"
    if num_words >= 1_000_000: return f"{num_words // 1_000_000}M"
    if num_words >= 1_000: return f"{num_words // 1_000}k"
    return str(num_words)

# --- Configuration: SET THESE TO MATCH THE RUN YOU WANT TO EVALUATE ---
# Parameters from the successful run (10M words, 5 epochs, etc.)
RUN_CORPUS_NAME = "text8" # From the filename used
RUN_NUM_WORDS = 10000000
RUN_MIN_FREQ = 5
RUN_EMBED_DIM = 128 # We will verify/infer this from the loaded model
RUN_WINDOW_SIZE = 3
RUN_EPOCHS = 5
RUN_LR = 0.001
RUN_BATCH_SIZE = 512
BASE_MODEL_DIR = os.path.join(project_root, "models/word2vec") # Base save dir

# --- Construct Paths based on Run Parameters ---
nw_str = format_num_words(RUN_NUM_WORDS)

# Vocabulary Filename
vocab_filename = f"{RUN_CORPUS_NAME}_vocab_NW{nw_str}_MF{RUN_MIN_FREQ}.json"
VOCAB_PATH = os.path.join(BASE_MODEL_DIR, vocab_filename)

# Run-Specific Subdirectory Name
run_name = (
    f"CBOW_D{RUN_EMBED_DIM}_W{RUN_WINDOW_SIZE}_NW{nw_str}_"
    f"MF{RUN_MIN_FREQ}_E{RUN_EPOCHS}_LR{RUN_LR}_BS{RUN_BATCH_SIZE}"
)
run_save_dir = os.path.join(BASE_MODEL_DIR, run_name)

# Model State Filename (using standard name within run dir)
MODEL_STATE_PATH = os.path.join(run_save_dir, "model_state.pth")

logger.info(f"--- Evaluating Run: {run_name} ---")
logger.info(f"  Attempting to load Vocab: {VOCAB_PATH}")
logger.info(f"  Attempting to load Model State: {MODEL_STATE_PATH}")

# --- Load Vocabulary ---
vocab = None
vocab_size = 0
try:
    if not os.path.exists(VOCAB_PATH):
        logger.error(f"❌ Vocabulary file not found at specified path!")
    else:
        vocab = Vocabulary.load_vocab(VOCAB_PATH)
        vocab_size = len(vocab)
except Exception as e:
    logger.error(f"❌ Failed to load vocabulary: {e}", exc_info=True)

# --- Load Model State ---
device = get_device() # Use MPS if available
model_state_dict = None
embedding_matrix = None
loaded_embedding_dim = None # Variable to store inferred dimension

if vocab and os.path.exists(MODEL_STATE_PATH):
    try:
        logger.info(f"🧠 Loading model state from: {MODEL_STATE_PATH}")
        model_state_dict = torch.load(MODEL_STATE_PATH, map_location=torch.device('cpu'))
        logger.info(f"  Model state keys loaded: {list(model_state_dict.keys())}")

        # Extract the embedding matrix
        if 'embeddings.weight' in model_state_dict:
            # Clone and move to target device
            embedding_matrix = model_state_dict['embeddings.weight'].clone().to(device)
            logger.info(f"  Extracted embedding matrix. Shape: {embedding_matrix.shape}")

            # Infer embedding dimension from loaded matrix
            loaded_embedding_dim = embedding_matrix.shape[1]
            logger.info(f"  Inferred Embedding Dimension: {loaded_embedding_dim}")

            # Verify shape matches vocab size and inferred embed dim
            if embedding_matrix.shape[0] != vocab_size:
                logger.error(
                    f"❌ Shape mismatch! Embeddings rows ({embedding_matrix.shape[0]}) "
                    f"!= Vocab size ({vocab_size})."
                )
                embedding_matrix = None # Invalidate
            else:
                logger.info(f"✅ Embedding matrix shape matches vocab size.")
        else:
            logger.error("❌ 'embeddings.weight' key not found in state dictionary!")

    except Exception as e:
        logger.error(f"❌ Failed to load model state: {e}", exc_info=True)
else:
    if not vocab:
        logger.error("Vocabulary not loaded, cannot load model state.")
    if not os.path.exists(MODEL_STATE_PATH):
        logger.error(f"Model state file not found: {MODEL_STATE_PATH}")

# --- Final Check ---
if vocab and embedding_matrix is not None:
     logger.info("✅✅ Vocabulary and Embedding Matrix loaded successfully!")
     # Update EMBEDDING_DIM if needed by subsequent cells, using the inferred value
     EMBEDDING_DIM = loaded_embedding_dim
else:
     logger.error("🚨 Failed to load required artifacts for evaluation.")

2025-04-15 16:20:46 | DropoutDisco | INFO     | [297881156.py:55] | --- Evaluating Run: CBOW_D128_W3_NW10M_MF5_E5_LR0.001_BS512 ---
2025-04-15 16:20:46 | DropoutDisco | INFO     | [297881156.py:56] |   Attempting to load Vocab: /Users/Oks_WORKSPACE/Desktop/DEV/W1_project/Dropout_Disco/models/word2vec/text8_vocab_NW10M_MF5.json
2025-04-15 16:20:46 | DropoutDisco | INFO     | [297881156.py:57] |   Attempting to load Model State: /Users/Oks_WORKSPACE/Desktop/DEV/W1_project/Dropout_Disco/models/word2vec/CBOW_D128_W3_NW10M_MF5_E5_LR0.001_BS512/model_state.pth
2025-04-15 16:20:46 | DropoutDisco | INFO     | [vocabulary.py:110] | Attempting to load vocabulary from: /Users/Oks_WORKSPACE/Desktop/DEV/W1_project/Dropout_Disco/models/word2vec/text8_vocab_NW10M_MF5.json
2025-04-15 16:20:46 | DropoutDisco | INFO     | [vocabulary.py:123] | 📚 Vocab loaded (53,408 words) from /Users/Oks_WORKSPACE/Desktop/DEV/W1_project/Dropout_Disco/models/word2vec/text8_vocab_NW10M_MF5.json
2025-04-15 16:20:46 | Drop

## 🛠️ Evaluation Utility Functions

In [3]:
def get_embedding_vector(word: str) -> torch.Tensor | None:
    """Retrieves the learned embedding vector for a given word."""
    if not vocab or embedding_matrix is None:
        logger.error("Vocabulary or embedding matrix not loaded.")
        return None
    
    word_idx = vocab.get_index(word)
    if word_idx == vocab.unk_index and word != vocab.unk_token:
        logger.warning(f"Word '{word}' not in vocabulary, using <UNK> vector.")
        # Optionally return None if you don't want UNK vector for specific words
        # return None 
    
    # Ensure index is valid before accessing embedding matrix
    if 0 <= word_idx < embedding_matrix.shape[0]:
        return embedding_matrix[word_idx]
    else:
        logger.error(f"Invalid index {word_idx} for word '{word}'.")
        return None

def cosine_similarity(vec1: torch.Tensor, vec2: torch.Tensor) -> float:
    """Calculates cosine similarity between two vectors."""
    if vec1 is None or vec2 is None:
        return 0.0
    # Use torch functions for efficiency, ensure vectors are on same device
    cos = nn.CosineSimilarity(dim=0)
    # Ensure vectors are float type
    return cos(vec1.float(), vec2.float()).item()

def find_nearest_neighbors(input_word: str, top_n: int = 10):
    """Finds the most similar words based on cosine similarity."""
    if not vocab or embedding_matrix is None:
        logger.error("Vocabulary or embedding matrix not loaded.")
        return
    
    input_vector = get_embedding_vector(input_word)
    if input_vector is None:
        logger.error(f"Could not get embedding for '{input_word}'.")
        return

    logger.info(f"Finding nearest neighbors for '{input_word}'...")
    
    # Calculate similarities with all words in the vocabulary
    # Compute on the device where embedding_matrix resides
    cos = nn.CosineSimilarity(dim=1)
    similarities = cos(embedding_matrix.float(), input_vector.float().unsqueeze(0))
    
    # Get top N+1 indices (to exclude the input word itself)
    # Add small check for top_n against vocab size
    k = min(top_n + 1, len(vocab))
    top_indices = torch.argsort(similarities, descending=True)[:k]
    
    print(f"\n--- Top {top_n} most similar words to '{input_word}' --- ")
    results = []
    for idx_tensor in top_indices:
        idx = idx_tensor.item()
        word = vocab.get_word(idx)
        if word.lower() == input_word.lower(): # Simple check to skip input word
            continue
        sim = similarities[idx].item()
        results.append({'Word': word, 'Similarity': sim})
        if len(results) == top_n:
             break
             
    if results:
        display(pd.DataFrame(results))
    else:
        print("No similar words found (excluding input word).")

def word_analogy(word_a: str, word_b: str, word_c: str, top_n: int = 5):
    """Performs word analogy task: vec(a) - vec(b) + vec(c) ≈ vec(?) """
    if not vocab or embedding_matrix is None:
        logger.error("Vocabulary or embedding matrix not loaded.")
        return

    vec_a = get_embedding_vector(word_a)
    vec_b = get_embedding_vector(word_b)
    vec_c = get_embedding_vector(word_c)

    if vec_a is None or vec_b is None or vec_c is None:
        logger.error(f"Could not get embeddings for one or more analogy words: '{word_a}', '{word_b}', '{word_c}'.")
        return

    logger.info(f"Performing analogy: '{word_a}' - '{word_b}' + '{word_c}' = ?")

    # Calculate the target vector
    target_vector = (vec_a - vec_b + vec_c).float() # Ensure float

    # Calculate similarities with all words
    cos = nn.CosineSimilarity(dim=1)
    similarities = cos(embedding_matrix.float(), target_vector.unsqueeze(0))

    # Get top indices, excluding the input words
    k = min(top_n + 3, len(vocab)) # +3 to account for excluding a, b, c
    top_indices = torch.argsort(similarities, descending=True)[:k]

    print(f"\n--- Top {top_n} analogy results --- ")
    results = []
    input_words = {word_a.lower(), word_b.lower(), word_c.lower()}
    for idx_tensor in top_indices:
        idx = idx_tensor.item()
        word = vocab.get_word(idx)
        # Exclude input words
        if word.lower() in input_words:
            continue 
        sim = similarities[idx].item()
        results.append({'Word': word, 'Similarity': sim})
        if len(results) == top_n:
             break
             
    if results:
        display(pd.DataFrame(results))
    else:
        print("No analogy results found (excluding input words).")

## ▶️ Run Evaluations

In [4]:
# --- Test Nearest Neighbors --- 
test_words = ['king', 'computer', 'france', 'history', 'running', 'apple', 'man']

if embedding_matrix is not None:
    for word in test_words:
        find_nearest_neighbors(word, top_n=10)
else:
    logger.error("Cannot run nearest neighbors: Embeddings not loaded.")


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'king'...

--- Top 10 most similar words to 'king' --- 


Unnamed: 0,Word,Similarity
0,kings,0.440645
1,son,0.426844
2,pope,0.42636
3,nebuchadnezzar,0.413334
4,usurper,0.401036
5,elector,0.399688
6,murad,0.399563
7,portugal,0.398484
8,throne,0.385913
9,emperor,0.381518


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'computer'...

--- Top 10 most similar words to 'computer' --- 


Unnamed: 0,Word,Similarity
0,computers,0.540999
1,firewire,0.470809
2,gaming,0.409237
3,minicomputer,0.393269
4,computing,0.367814
5,peripherals,0.366021
6,software,0.365695
7,spreadsheet,0.3621
8,pc,0.359847
9,probabilistic,0.356872


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'france'...

--- Top 10 most similar words to 'france' --- 


Unnamed: 0,Word,Similarity
0,spain,0.458888
1,french,0.451032
2,belgium,0.448859
3,partement,0.428191
4,sicily,0.415243
5,cameroon,0.40882
6,zurich,0.400472
7,argentina,0.39088
8,netherlands,0.390649
9,ireland,0.389632


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'history'...

--- Top 10 most similar words to 'history' --- 


Unnamed: 0,Word,Similarity
0,demographics,0.428868
1,heritage,0.410536
2,origins,0.393789
3,timeline,0.389776
4,overview,0.385104
5,politics,0.366357
6,astronautics,0.362148
7,migrations,0.354111
8,yuma,0.337357
9,searchable,0.333797


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'running'...

--- Top 10 most similar words to 'running' --- 


Unnamed: 0,Word,Similarity
0,pulled,0.381165
1,baserunners,0.370635
2,picking,0.361769
3,scraped,0.357239
4,bounced,0.353074
5,isp,0.352752
6,thrown,0.350522
7,northwards,0.347565
8,lighted,0.346535
9,sped,0.344212


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'apple'...

--- Top 10 most similar words to 'apple' --- 


Unnamed: 0,Word,Similarity
0,macintosh,0.518473
1,intel,0.452954
2,hardware,0.436081
3,atari,0.43108
4,diablo,0.410351
5,iic,0.409558
6,amiga,0.409222
7,software,0.388601
8,codec,0.387728
9,nintendo,0.383485


2025-04-15 16:20:46 | DropoutDisco | INFO     | [3022521770.py:40] | Finding nearest neighbors for 'man'...

--- Top 10 most similar words to 'man' --- 


Unnamed: 0,Word,Similarity
0,woman,0.408826
1,someone,0.381755
2,teacher,0.363323
3,person,0.362559
4,sheared,0.360594
5,alia,0.3517
6,evil,0.342921
7,fashioned,0.342052
8,gift,0.339491
9,glory,0.339162


In [5]:
# --- Test Word Analogies --- 
analogies = [
    ('king', 'man', 'woman'),    # Target: queen
    ('paris', 'france', 'germany'), # Target: berlin
    ('walking', 'walked', 'swimming'), # Target: swam
    ('big', 'bigger', 'small'),   # Target: smaller
    ('cold', 'colder', 'hot')    # Target: hotter
]

if embedding_matrix is not None:
    for a, b, c in analogies:
        word_analogy(a, b, c, top_n=5)
else:
    logger.error("Cannot run analogies: Embeddings not loaded.")

2025-04-15 16:20:47 | DropoutDisco | INFO     | [3022521770.py:83] | Performing analogy: 'king' - 'man' + 'woman' = ?

--- Top 5 analogy results --- 


Unnamed: 0,Word,Similarity
0,mackay,0.396144
1,mentewab,0.377567
2,alphonse,0.376149
3,wife,0.364031
4,appointment,0.363669


2025-04-15 16:20:47 | DropoutDisco | INFO     | [3022521770.py:83] | Performing analogy: 'paris' - 'france' + 'germany' = ?

--- Top 5 analogy results --- 


Unnamed: 0,Word,Similarity
0,berlin,0.440776
1,canova,0.349511
2,lymec,0.348237
3,cologne,0.340313
4,headquarters,0.334597


2025-04-15 16:20:47 | DropoutDisco | INFO     | [3022521770.py:83] | Performing analogy: 'walking' - 'walked' + 'swimming' = ?

--- Top 5 analogy results --- 


Unnamed: 0,Word,Similarity
0,backpack,0.392932
1,processed,0.363658
2,tumbling,0.361517
3,equestrian,0.350594
4,overworld,0.348919


2025-04-15 16:20:47 | DropoutDisco | INFO     | [3022521770.py:83] | Performing analogy: 'big' - 'bigger' + 'small' = ?

--- Top 5 analogy results --- 


Unnamed: 0,Word,Similarity
0,large,0.398319
1,cedar,0.355712
2,centro,0.347184
3,northwest,0.339242
4,haldeman,0.332438


2025-04-15 16:20:47 | DropoutDisco | INFO     | [3022521770.py:83] | Performing analogy: 'cold' - 'colder' + 'hot' = ?

--- Top 5 analogy results --- 


Unnamed: 0,Word,Similarity
0,aq,0.368793
1,kosovo,0.362859
2,filling,0.356238
3,attrition,0.34251
4,solstice,0.342215
