In [6]:
from gensim.models import KeyedVectors
import os
import numpy as np


def getWordVector(model, word):
    """Return the vector stored for ``word`` in ``model``, or None if absent.

    Args:
        model: Any object supporting ``model[word]`` that raises ``KeyError``
            for unknown words.
        word: The key to look up.

    Returns:
        Whatever ``model[word]`` yields, or ``None`` when the lookup raises
        ``KeyError``.
    """
    try:
        vector = model[word]
    except KeyError:
        # Unknown word: signal absence with None rather than propagating.
        vector = None
    return vector


def load_glove_model(filepath, encoding='utf-8'):
    """Load word vectors from a word2vec-format text file, retrying as latin-1.

    The file is first read with ``encoding``. If that raises
    ``UnicodeDecodeError`` and ``encoding`` was not already ``'latin-1'``, one
    more attempt is made with ``'latin-1'``. Progress and errors are reported
    with ``print``.

    Args:
        filepath: Path passed to ``KeyedVectors.load_word2vec_format``.
        encoding: Text encoding for the first attempt.

    Returns:
        The loaded model, or ``None`` if decoding fails with latin-1 or any
        other exception is raised while loading.
    """
    current = encoding
    while True:
        try:
            print(f"Attempting to load model with encoding: {current}")
            loaded = KeyedVectors.load_word2vec_format(filepath, binary=False, encoding=current)
            print(f"Successfully loaded model with encoding: {current}")
            return loaded
        except UnicodeDecodeError as err:
            print(f"UnicodeDecodeError with encoding {current}: {err}")
            if current == 'latin-1':
                # latin-1 was the fallback; nothing left to try.
                print("Failed to load model with both utf-8 and latin-1 encodings.")
                return None
            print("Attempting to load model with latin-1 encoding")
            current = 'latin-1'
        except Exception as err:
            # Any non-decoding failure (missing file, bad format, ...) ends the attempt.
            print(f"An unexpected error occurred: {err}")
            return None


def completeAnalogyWithOptionList(wordA, wordB, wordC, options, vectors=None):
    """Pick the option that best completes "wordA is to wordB as wordC is to ?".

    The target point is ``vec(wordB) - vec(wordA) + vec(wordC)``. Every option
    that has a vector is scored by its cosine similarity to that point, and the
    highest-scoring option is returned (the earliest one wins ties).

    Args:
        wordA: First term of the known pair.
        wordB: Second term of the known pair.
        wordC: First term of the pair to complete.
        options: Iterable of candidate words for the missing term.
        vectors: Optional word -> vector lookup that raises ``KeyError`` for
            unknown words. When omitted, the notebook-level ``model`` is used.

    Returns:
        The best-scoring option, or ``None`` if any of ``wordA``/``wordB``/
        ``wordC`` has no vector or no option has a vector.
    """
    # Prefer an explicitly supplied lookup; fall back to the global for
    # existing four-argument callers.
    embeddings = model if vectors is None else vectors

    try:
        vector_B = embeddings[wordB]
        vector_A = embeddings[wordA]
        vector_C = embeddings[wordC]
    except KeyError as e:
        print(f"Warning: One or more words not found in vocabulary: {e}")
        return None

    analogy_vector = vector_B - vector_A + vector_C

    best_option = None
    # Start below every possible cosine value: starting at -1 with a strict
    # ">" would reject an option whose similarity is exactly -1.0.
    best_similarity = float("-inf")

    for option in options:
        try:
            option_vector = embeddings[option]
        except KeyError:
            print(f"Warning: Option word '{option}' not found in vocabulary. Skipping.")
            continue

        similarity = cosine_similarity(analogy_vector, option_vector)
        if similarity > best_similarity:
            best_similarity = similarity
            best_option = option

    return best_option


def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))``. If either norm is
    zero the direction is undefined, and ``0.0`` is returned instead of
    dividing by zero.
    """
    magnitude_1 = np.linalg.norm(v1)
    magnitude_2 = np.linalg.norm(v2)
    if magnitude_1 != 0 and magnitude_2 != 0:
        return np.dot(v1, v2) / (magnitude_1 * magnitude_2)
    return 0.0

In [8]:
file_path = "../data/glove-twitter-25.txt"

# Bind `model` before the check so a missing file leaves it as None rather than
# unbound (fresh kernel) or still pointing at a model loaded by an earlier run
# of this cell. None is also what load_glove_model returns on failure.
model = None
if os.path.exists(file_path):
    model = load_glove_model(file_path)
else:
    print(f"Error: File not found at {file_path}")

Attempting to load model with encoding: utf-8
Successfully loaded model with encoding: utf-8


In [12]:
# Show the first 10 components of each word's vector, or a not-found message.
for word in ("king", "qwerty"):
    word_vector = getWordVector(model, word)
    if word_vector is None:
        print(f"Word '{word}' not found in the vocabulary.")
    else:
        print(f"Vector for '{word}': {word_vector[:10]}... (truncated)")

# Each case is (wordA, wordB, wordC, candidate answers) for
# "wordA is to wordB as wordC is to ?".
analogy_cases = [
    ("berlin", "germany", "paris", ["germany", "france", "italy", "spain"]),
    ("water", "swim", "air", ["swim", "run", "fly", "jump"]),
    ("man", "king", "woman", ["queen", "princess", "waitress", "doctor", "qwerty"]),
]
for first, second, third, options in analogy_cases:
    result = completeAnalogyWithOptionList(first, second, third, options)
    print(f"Analogy '{first}' is to '{second}' as '{third}' is to: {result}")

Vector for 'king': [-0.74501 -0.11992  0.37329  0.36847 -0.4472  -0.2288   0.70118  0.82872
  0.39486 -0.58347]... (truncated)
Vector for 'qwerty': [ 0.28914  -0.28524   0.21839   0.088964  1.2627   -0.22651  -0.10775
 -0.29433   1.4517    0.3187  ]... (truncated)
Analogy 'berlin' is to 'germany' as 'paris' is to: italy
Analogy 'water' is to 'swim' as 'air' is to: fly
Analogy 'man' is to 'king' as 'woman' is to: queen
