# GloVe (Gensim)

In [24]:
import numpy as np

In [25]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')  #search on the google
model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

# Testing

In [26]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [27]:
# File paths for semantic and syntactic data
semantic_file_path = "test/semantic_capital_country.txt"
syntatic_file_path = "test/syntatic_past_tense.txt"

# Function to read content from a file
def open_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

# Load semantic and syntactic data
semantic = [line.strip() for line in open_file(semantic_file_path) if line.strip()]
syntatic = [line.strip() for line in open_file(syntatic_file_path) if line.strip()]

current_test = semantic 
for sent in semantic + syntatic:  
    if sent.startswith(":"):  
        current_test = syntatic
        continue
    current_test.append(sent.strip())
    
    current_test.append(sent.strip())

# Syntactic Accuracy

In [28]:
syn_total = len(syntatic)
syn_correct = 0

for sent in syntatic:
    sent = sent.lower()
    words = sent.split(" ")

    try:
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except:
        result = "<UNK>"

    if result == words[3]:
        syn_correct += 1

In [29]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.4f}")

Syntatic accuracy: 0.5545


# Semantic Accuracy

In [30]:
sem_total = len(semantic)
sem_correct = 0

for sent in semantic:
    sent = sent.lower()
    words = sent.split(" ")

    try:
        result = model.most_similar(positive=[words[1], words[2]], negative=[words[0]])[0][0]
    except:
        result = "<UNK>"

    if result == words[3]:
        sem_correct += 1

In [31]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.4f}")

Semantic accuracy: 0.5470


# Similarity Accuracy

In [32]:
file_path = "test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [33]:
default_vector = np.zeros(model.vector_size)
try:
    result = model.get_vector('111222')
except:
    result = default_vector

In [34]:
def compute_similarity(model, test_data):
    words = test_data.lower().split("\t")

    default_vector = np.zeros(model.vector_size)
    try:
        embed0 = model.get_vector(words[0].strip())
        embed1 = model.get_vector(words[1].strip())
    except:
        embed0 = default_vector
        embed1 = default_vector


    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [35]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [36]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is 0.54.


In [37]:
import pickle

# Save the model
pickle.dump(model,open('./app/models/gensim.model','wb'))

In [38]:
load_model = pickle.load(open('./app/models/gensim.model', 'rb'))
load_model.most_similar('father')

[('son', 0.9239585399627686),
 ('brother', 0.9224588871002197),
 ('grandfather', 0.8827932476997375),
 ('mother', 0.8656661510467529),
 ('uncle', 0.8646855354309082),
 ('wife', 0.8440755009651184),
 ('husband', 0.8430608510971069),
 ('daughter', 0.8396723866462708),
 ('friend', 0.8364216685295105),
 ('cousin', 0.8158136010169983)]