# GloVE

Let's work on implementation of GloVE.

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

## 1. Load data

In [2]:
from nltk.corpus import brown

brown.categories()
corpus = brown.sents(categories="news")
corpus

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [3]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['chien',
 "conference's",
 'Well-wishers',
 'While',
 'patrons',
 'Navigation',
 'motels',
 'testing',
 'petty',
 '$90',
 'Given',
 'Sanger-Harris',
 'scoreless',
 'Outside',
 'Atty.',
 'spongy',
 'Are',
 'negotiations',
 'Hord',
 'washed-out',
 'courtroom',
 'contrasting',
 'Gotham',
 'Charitable',
 'Antone',
 'meter',
 'maker',
 'SMU',
 'book-selection',
 'instructed',
 'Seidel',
 'ball-hawking',
 'fearing',
 '108',
 'picking',
 'U-2',
 'self-sacrifice',
 "'52",
 'alternatives',
 "isn't",
 'queen',
 'area',
 'Hall',
 "he'll",
 'mall',
 'combination',
 'Red',
 'honeymoon',
 'masterful',
 'feted',
 'Librarians',
 'highway',
 'Frick',
 'clocking',
 'exploring',
 'Anaconda',
 'consequences',
 'hood',
 'valuation',
 '375',
 'staggered',
 'Shelby',
 'Shipman',
 'Italy',
 'Vegas',
 'undergone',
 'enterprisingly',
 'closing',
 'colonies',
 'best',
 'streak',
 'terminated',
 'machine',
 'Players',
 'presiding',
 'walloping',
 'dwindled',
 'warless',
 'obtained',
 'missed',
 "Tennessee's",
 '

In [4]:
#numericalization
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)



In [5]:
#vocab size
voc_size = len(vocab)
print(voc_size)

14394


In [6]:
#append UNK
vocab.append('<UNK>')

In [7]:
vocab

['chien',
 "conference's",
 'Well-wishers',
 'While',
 'patrons',
 'Navigation',
 'motels',
 'testing',
 'petty',
 '$90',
 'Given',
 'Sanger-Harris',
 'scoreless',
 'Outside',
 'Atty.',
 'spongy',
 'Are',
 'negotiations',
 'Hord',
 'washed-out',
 'courtroom',
 'contrasting',
 'Gotham',
 'Charitable',
 'Antone',
 'meter',
 'maker',
 'SMU',
 'book-selection',
 'instructed',
 'Seidel',
 'ball-hawking',
 'fearing',
 '108',
 'picking',
 'U-2',
 'self-sacrifice',
 "'52",
 'alternatives',
 "isn't",
 'queen',
 'area',
 'Hall',
 "he'll",
 'mall',
 'combination',
 'Red',
 'honeymoon',
 'masterful',
 'feted',
 'Librarians',
 'highway',
 'Frick',
 'clocking',
 'exploring',
 'Anaconda',
 'consequences',
 'hood',
 'valuation',
 '375',
 'staggered',
 'Shelby',
 'Shipman',
 'Italy',
 'Vegas',
 'undergone',
 'enterprisingly',
 'closing',
 'colonies',
 'best',
 'streak',
 'terminated',
 'machine',
 'Players',
 'presiding',
 'walloping',
 'dwindled',
 'warless',
 'obtained',
 'missed',
 "Tennessee's",
 '

In [8]:
word2index['<UNK>'] = 0

In [9]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

## 2. Build Co-occurence Matrix X

Here, we need to count the co-occurence of two words given some window size.  We gonna use window size of 1.

In [10]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'The': 806,
         'Fulton': 14,
         'County': 35,
         'Grand': 6,
         'Jury': 2,
         'said': 402,
         'Friday': 41,
         'an': 300,
         'investigation': 9,
         'of': 2849,
         "Atlanta's": 4,
         'recent': 20,
         'primary': 17,
         'election': 38,
         'produced': 6,
         '``': 732,
         'no': 109,
         'evidence': 17,
         "''": 702,
         'that': 802,
         'any': 90,
         'irregularities': 3,
         'took': 47,
         'place': 25,
         '.': 4030,
         'jury': 44,
         'further': 16,
         'in': 1893,
         'term-end': 1,
         'presentments': 1,
         'the': 5580,
         'City': 44,
         'Executive': 6,
         'Committee': 37,
         ',': 5188,
         'which': 244,
         'had': 279,
         'over-all': 2,
         'charge': 17,
         'deserves': 3,
         'praise': 2,
         'and': 2146,
         'thanks': 6,
         'Atlanta': 14,

In [11]:
window_size = 2

skip_grams = []

#loop each corpus
for doc in corpus:
    #look from the 2nd word until second last word
    for i in range(window_size, len(doc)-window_size):
        #center word
        center = doc[i]
        #outside words = 2 words

        outside = []
        for j in range(window_size):
            outside.append(doc[i+(j+1)])
            outside.append(doc[i-(j+1)])

        #for each of these two outside words, we gonna append to a list
        for each_out in outside:
            skip_grams.append((center, each_out))
            #center, outside1;   center, outside2

skip_grams

[('County', 'Grand'),
 ('County', 'Fulton'),
 ('County', 'Jury'),
 ('County', 'The'),
 ('Grand', 'Jury'),
 ('Grand', 'County'),
 ('Grand', 'said'),
 ('Grand', 'Fulton'),
 ('Jury', 'said'),
 ('Jury', 'Grand'),
 ('Jury', 'Friday'),
 ('Jury', 'County'),
 ('said', 'Friday'),
 ('said', 'Jury'),
 ('said', 'an'),
 ('said', 'Grand'),
 ('Friday', 'an'),
 ('Friday', 'said'),
 ('Friday', 'investigation'),
 ('Friday', 'Jury'),
 ('an', 'investigation'),
 ('an', 'Friday'),
 ('an', 'of'),
 ('an', 'said'),
 ('investigation', 'of'),
 ('investigation', 'an'),
 ('investigation', "Atlanta's"),
 ('investigation', 'Friday'),
 ('of', "Atlanta's"),
 ('of', 'investigation'),
 ('of', 'recent'),
 ('of', 'an'),
 ("Atlanta's", 'recent'),
 ("Atlanta's", 'of'),
 ("Atlanta's", 'primary'),
 ("Atlanta's", 'investigation'),
 ('recent', 'primary'),
 ('recent', "Atlanta's"),
 ('recent', 'election'),
 ('recent', 'of'),
 ('primary', 'election'),
 ('primary', 'recent'),
 ('primary', 'produced'),
 ('primary', "Atlanta's"),
 (

In [12]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({('County', 'Grand'): 1,
         ('County', 'Fulton'): 6,
         ('County', 'Jury'): 1,
         ('County', 'The'): 1,
         ('Grand', 'Jury'): 1,
         ('Grand', 'County'): 1,
         ('Grand', 'said'): 1,
         ('Grand', 'Fulton'): 1,
         ('Jury', 'said'): 1,
         ('Jury', 'Grand'): 2,
         ('Jury', 'Friday'): 1,
         ('Jury', 'County'): 1,
         ('said', 'Friday'): 5,
         ('said', 'Jury'): 1,
         ('said', 'an'): 2,
         ('said', 'Grand'): 1,
         ('Friday', 'an'): 1,
         ('Friday', 'said'): 3,
         ('Friday', 'investigation'): 1,
         ('Friday', 'Jury'): 1,
         ('an', 'investigation'): 3,
         ('an', 'Friday'): 1,
         ('an', 'of'): 43,
         ('an', 'said'): 2,
         ('investigation', 'of'): 4,
         ('investigation', 'an'): 4,
         ('investigation', "Atlanta's"): 1,
         ('investigation', 'Friday'): 1,
         ('of', "Atlanta's"): 1,
         ('of', 'investigation'): 5,
         (

### Weighting function

GloVe includes a weighting function to scale down too frequent words.

<img src = "../figures/glove_weighting_func.png" width=400>

In [13]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [14]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [15]:
import math

def random_batch(batch_size, word_sequence, skip_grams_id, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

### Testing the method

In [16]:
weighting_dic

{('chien', 'chien'): 0.03162277660168379,
 ('chien', "conference's"): 0.03162277660168379,
 ("conference's", 'chien'): 0.03162277660168379,
 ('chien', 'Well-wishers'): 0.03162277660168379,
 ('Well-wishers', 'chien'): 0.03162277660168379,
 ('chien', 'While'): 0.03162277660168379,
 ('While', 'chien'): 0.03162277660168379,
 ('chien', 'patrons'): 0.03162277660168379,
 ('patrons', 'chien'): 0.03162277660168379,
 ('chien', 'Navigation'): 0.03162277660168379,
 ('Navigation', 'chien'): 0.03162277660168379,
 ('chien', 'motels'): 0.03162277660168379,
 ('motels', 'chien'): 0.03162277660168379,
 ('chien', 'testing'): 0.03162277660168379,
 ('testing', 'chien'): 0.03162277660168379,
 ('chien', 'petty'): 0.03162277660168379,
 ('petty', 'chien'): 0.03162277660168379,
 ('chien', '$90'): 0.03162277660168379,
 ('$90', 'chien'): 0.03162277660168379,
 ('chien', 'Given'): 0.03162277660168379,
 ('Given', 'chien'): 0.03162277660168379,
 ('chien', 'Sanger-Harris'): 0.03162277660168379,
 ('Sanger-Harris', 'chie

In [17]:
batch_size = 2
#convert our skipgrams to id
skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams_id, X_ik, weighting_dic)

In [18]:
type(x[0][0])

numpy.int64

In [19]:
y

array([[11106],
       [  156]])

In [20]:
cooc

array([[1.60943791],
       [7.27517232]])

In [21]:
weighting

array([[0.10573713],
       [1.        ]])

## 4. Model

<img src ="../figures/glove.png" width=400>

In [22]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)

        self.word2index = word2index


    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    #let's write a function to get embedding given a word
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.center_embedding(word)
        embed_o = self.outside_embedding(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

In [23]:
#test our system
voc_size = len(vocab)
emb_size = 2
model = Glove(voc_size, emb_size, word2index)

In [24]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [25]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [26]:
loss

tensor(45.4687, grad_fn=<SumBackward0>)

## 5. Training

In [27]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size, word2index)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [29]:
import time

# Training
num_epochs = 1000

total_start = time.time()
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams_id, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")
        start = end

total_end = time.time()
training_time_total = total_end - total_start
min, sec = epoch_time(total_start, total_end)

print(f"\nComplete: \nTotal Loss: {loss:2.2f} | Time Taken: {min} minutes and {sec} seconds")

Epoch: 100 | cost: 8.663046 | time: 0m 4s
Epoch: 200 | cost: 3.549200 | time: 0m 4s
Epoch: 300 | cost: 69.170227 | time: 0m 4s
Epoch: 400 | cost: 7.458392 | time: 0m 4s
Epoch: 500 | cost: 18.753832 | time: 0m 4s
Epoch: 600 | cost: 8.309074 | time: 0m 4s
Epoch: 700 | cost: 4.043787 | time: 0m 4s
Epoch: 800 | cost: 135.150360 | time: 0m 4s
Epoch: 900 | cost: 5.158092 | time: 0m 4s
Epoch: 1000 | cost: 44.368946 | time: 0m 4s

Complete: 
Total Loss: 44.37 | Time Taken: 0 minutes and 42 seconds


# 6. Testing

In [30]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [31]:
file_path = "test/word-test.v1.min.txt"

content = open_file(file_path)

semantic = []
syntatic = []

current_test = semantic
for sent in content:
    if sent[0] == ':':
        current_test = syntatic
        continue
    
    current_test.append(sent.strip())

In [32]:
vector_space = []

for word in vocab:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [33]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [34]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

## Semantic Accuracy

In [35]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [36]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


## Syntactic Accuracy

In [37]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [38]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


## Similarity Accuracy

In [39]:
file_path = "test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [40]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [41]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [42]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is -0.02.


# 7. Save the model and meta data

In [44]:
import pickle

torch.save(model.state_dict(), '../app/models/glove.model')

glove_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}

pickle.dump(glove_args, open('../app/models/glove.args', 'wb'))

In [45]:
glove_args = pickle.load(open('../app/models/glove.args', 'rb'))
model_glove = Glove(**glove_args)
model_glove.load_state_dict(torch.load('../app/models/glove.model'))

# Test the model
model_glove.get_embed('sad')

(0.2447531521320343, -0.15323737263679504)