# GloVe from Scratch

In [72]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib

In [73]:
device = torch.device('cuda:0')
device 

device(type='cuda', index=0)

## 1. Load data

In [74]:
import nltk
from nltk.corpus import reuters

In [75]:
nltk.download('reuters')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\st124\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [76]:
corpus = list(reuters.sents())
corpus

[['ASIAN',
  'EXPORTERS',
  'FEAR',
  'DAMAGE',
  'FROM',
  'U',
  '.',
  'S',
  '.-',
  'JAPAN',
  'RIFT',
  'Mounting',
  'trade',
  'friction',
  'between',
  'the',
  'U',
  '.',
  'S',
  '.',
  'And',
  'Japan',
  'has',
  'raised',
  'fears',
  'among',
  'many',
  'of',
  'Asia',
  "'",
  's',
  'exporting',
  'nations',
  'that',
  'the',
  'row',
  'could',
  'inflict',
  'far',
  '-',
  'reaching',
  'economic',
  'damage',
  ',',
  'businessmen',
  'and',
  'officials',
  'said',
  '.'],
 ['They',
  'told',
  'Reuter',
  'correspondents',
  'in',
  'Asian',
  'capitals',
  'a',
  'U',
  '.',
  'S',
  '.',
  'Move',
  'against',
  'Japan',
  'might',
  'boost',
  'protectionist',
  'sentiment',
  'in',
  'the',
  'U',
  '.',
  'S',
  '.',
  'And',
  'lead',
  'to',
  'curbs',
  'on',
  'American',
  'imports',
  'of',
  'their',
  'products',
  '.'],
 ['But',
  'some',
  'exporters',
  'said',
  'that',
  'while',
  'the',
  'conflict',
  'would',
  'hurt',
  'them',
  'in',


In [77]:
#2. numeralization
#find unique words
flatten = lambda l: [item for sublist in l for item in sublist]

#set vocab limit to the first 10,000 words

words = list(set(flatten(corpus))) #all the words we have in the system - <UNK>
vocabs = words[:10000] #limit the vocab size to 10,000

In [78]:
#create handy mapping between integer and word
word2index = {v:idx for idx, v in enumerate(vocabs)}

In [79]:
last_vocab_idx = len(vocabs)
last_vocab_idx

10000

In [80]:
vocabs.append('<UNK>')
word2index['<UNK>'] = last_vocab_idx

In [81]:
index2word = {v:k for k, v in word2index.items()}
index2word[5]

'Wednesbury'

In [82]:
vocabs

['Hassenberg',
 'MODEST',
 '571',
 'ACQUISTION',
 'PARTIES',
 'Wednesbury',
 'PRODUCTION',
 '1975',
 'Planning',
 'JEM',
 'retracted',
 'properites',
 'Yom',
 'insulin',
 'broadened',
 'reasonable',
 'Burma',
 'QUALIFY',
 'ACLE',
 'wagons',
 'overstressed',
 'nonrelated',
 'INAI',
 'amazing',
 'accusation',
 'ORBIS',
 'outstripping',
 'mlnm',
 'crit',
 'LAWSUIT',
 'leisure',
 'commonstock',
 'Norcros',
 'POLYESTER',
 'SPIN',
 'GEOX',
 'handled',
 'Weizsaecker',
 'realises',
 'OMCO',
 'Eckenfelder',
 'LIVED',
 'Sealy',
 'DUFFOUR',
 'BUFF',
 'HOGS',
 'CAKE',
 'accruing',
 'synthetic',
 'El',
 'TWA',
 'contends',
 'pulping',
 '23rd',
 'alternations',
 'CONTAINMENT',
 'KRLZ',
 '934',
 'AEROTECH',
 'MABS',
 'hardship',
 'designee',
 'pursue',
 'HECLA',
 'Wesbanco',
 'EPSI',
 'UNDERUSED',
 'editiion',
 'TRILLIUM',
 'Continentale',
 'Humbrol',
 'SAFEGUARD',
 'difficulty',
 'CRITICIZED',
 'GOES',
 'SHS',
 'alliance',
 'introduce',
 'ATP259P',
 'deprived',
 'offier',
 'rebuilding',
 'Dover',
 '

# 2. Build Co-occurence Matrix X

In [83]:
from collections import Counter

X_i = Counter(flatten(corpus))
X_i

Counter({'.': 94703,
         ',': 72360,
         'the': 58251,
         'of': 35979,
         'to': 34035,
         'in': 26478,
         'said': 25224,
         'and': 25043,
         'a': 23492,
         'mln': 18037,
         'vs': 14120,
         '-': 13705,
         'for': 12785,
         'dlrs': 11730,
         "'": 11273,
         'The': 10968,
         '000': 10277,
         '1': 9977,
         's': 9298,
         'pct': 9093,
         'it': 8842,
         ';': 8762,
         '&': 8698,
         'lt': 8694,
         'on': 8556,
         'from': 7986,
         'cts': 7953,
         'is': 7580,
         '>': 7449,
         'that': 7377,
         'its': 7265,
         'by': 6872,
         '"': 6819,
         'at': 6537,
         '2': 6528,
         'U': 6388,
         'S': 6382,
         'year': 6310,
         'be': 6288,
         'with': 5945,
         'will': 5856,
         'was': 5787,
         'billion': 5652,
         '3': 5091,
         '5': 4683,
         'has': 4679,
   

In [84]:

window_size = 2

skip_grams = []

#loop each corpus
for doc in corpus:
    #look from the 2nd word until second last word
    for i in range(window_size, len(doc)-window_size):
        #center word
        center = doc[i]
        #outside words = 2 words

        outside = []
        for j in range(window_size):
            outside.append(doc[i+(j+1)])
            outside.append(doc[i-(j+1)])

        #for each of these two outside words, we gonna append to a list
        for each_out in outside:
            skip_grams.append((center, each_out))
            #center, outside1;   center, outside2

skip_grams

[('FEAR', 'DAMAGE'),
 ('FEAR', 'EXPORTERS'),
 ('FEAR', 'FROM'),
 ('FEAR', 'ASIAN'),
 ('DAMAGE', 'FROM'),
 ('DAMAGE', 'FEAR'),
 ('DAMAGE', 'U'),
 ('DAMAGE', 'EXPORTERS'),
 ('FROM', 'U'),
 ('FROM', 'DAMAGE'),
 ('FROM', '.'),
 ('FROM', 'FEAR'),
 ('U', '.'),
 ('U', 'FROM'),
 ('U', 'S'),
 ('U', 'DAMAGE'),
 ('.', 'S'),
 ('.', 'U'),
 ('.', '.-'),
 ('.', 'FROM'),
 ('S', '.-'),
 ('S', '.'),
 ('S', 'JAPAN'),
 ('S', 'U'),
 ('.-', 'JAPAN'),
 ('.-', 'S'),
 ('.-', 'RIFT'),
 ('.-', '.'),
 ('JAPAN', 'RIFT'),
 ('JAPAN', '.-'),
 ('JAPAN', 'Mounting'),
 ('JAPAN', 'S'),
 ('RIFT', 'Mounting'),
 ('RIFT', 'JAPAN'),
 ('RIFT', 'trade'),
 ('RIFT', '.-'),
 ('Mounting', 'trade'),
 ('Mounting', 'RIFT'),
 ('Mounting', 'friction'),
 ('Mounting', 'JAPAN'),
 ('trade', 'friction'),
 ('trade', 'Mounting'),
 ('trade', 'between'),
 ('trade', 'RIFT'),
 ('friction', 'between'),
 ('friction', 'trade'),
 ('friction', 'the'),
 ('friction', 'Mounting'),
 ('between', 'the'),
 ('between', 'friction'),
 ('between', 'U'),
 ('betwee

In [85]:
X_ik_skipgrams = Counter(skip_grams)
X_ik_skipgrams

Counter({(',', ','): 18444,
         ('the', 'of'): 13176,
         ('of', 'the'): 13075,
         ('mln', '.'): 12869,
         ('.', 'mln'): 12486,
         ('.', '.'): 11887,
         (',', '000'): 10762,
         ('000', ','): 10708,
         ('S', '.'): 10100,
         ('.', 'S'): 10024,
         ('s', "'"): 9215,
         (';', '&'): 8698,
         (';', 'lt'): 8693,
         ("'", 's'): 8649,
         ('the', ','): 8447,
         ('lt', ';'): 8380,
         ('lt', '&'): 8375,
         (',', 'the'): 8373,
         ('vs', ','): 7754,
         (',', 'vs'): 7705,
         ('the', 'in'): 7497,
         ('&', ';'): 7437,
         ('&', 'lt'): 7428,
         ('in', 'the'): 7407,
         ('.', '1'): 6418,
         ('1', '.'): 6406,
         ('the', 'to'): 6278,
         ('to', 'the'): 6219,
         ('the', 'said'): 5545,
         ('.', 'U'): 5522,
         (',', 'said'): 5513,
         ('U', '.'): 5289,
         ('vs', '.'): 5263,
         ('vs', 'cts'): 5258,
         ('cts', 'vs'): 

## Weight function

In [86]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurences between w_i and w_j is available
    try:
        x_ij = X_ik[(w_i, w_j)]
        #if not exist, then set to 1 "laplace smoothing"
    except:
        x_ij = 1
        
    #set xmax
    x_max = 100
    #set alpha
    alpha = 0.75
    
    #if co-ocurrence does not exceeed xmax, then just multiply with some alpha
    if x_ij < x_max:
        result = (x_ij / x_max)**alpha
    #otherwise, set to 1
    else:
        result = 1
    
    return result

In [87]:
from itertools import combinations_with_replacement

X_ik = {} #keeping the co-occurences
weighting_dic = {} #already scale the co-occurences using the weighting function

for bigram in combinations_with_replacement(vocabs, 2):
    if X_ik_skipgrams.get(bigram):  #if the pair exists in our corpus
        co = X_ik_skipgrams[bigram]
        X_ik[bigram] = co + 1 #for stability
        X_ik[(bigram[1], bigram[0])] = co + 1 #basically apple, banana = banana, apple
    else:
        pass
    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

## 3. Prepare train data

In [88]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    random_inputs, random_labels, random_coocs, random_weightings = [], [], [], []
    
    #convert our skipgrams to id
    skip_grams_id = [(word2index.get(skip_gram[0], word2index['<UNK>']), word2index.get(skip_gram[1], word2index['<UNK>'])) for skip_gram in skip_grams]
    #randomly choose indexes based on batch size
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False)
    
    #get the random input and labels
    for index in random_index:
        random_inputs.append([skip_grams_id[index][0]])
        random_labels.append([skip_grams_id[index][1]])
        #coocs
        pair = skip_grams[index] #e.g., ('banana', 'fruit')
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
    
        #weightings
        weighting = weighting_dic.get(pair,0)
        random_weightings.append([weighting])
        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

In [89]:
batch_size = 2
x, y, cooc, weighting = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

print("Inputs:", x)
print("Labels:", y)
print("Co-occurrences:", cooc)
print("Weightings:", weighting)


Inputs: [[10000]
 [ 4465]]
Labels: [[10000]
 [10000]]
Co-occurrences: [[0.]
 [0.]]
Weightings: [[0]
 [0]]


# 4. Modeling

In [90]:
class Glove(nn.Module):
    
    def __init__(self, voc_size, emb_size, word2index):
        super(Glove, self).__init__()
        self.center_embedding  = nn.Embedding(voc_size, emb_size)
        self.outside_embedding = nn.Embedding(voc_size, emb_size)
        
        self.center_bias       = nn.Embedding(voc_size, 1) 
        self.outside_bias      = nn.Embedding(voc_size, 1)

        self.word2index = word2index


    def forward(self, center, outside, coocs, weighting):
        center_embeds  = self.center_embedding(center) #(batch_size, 1, emb_size)
        outside_embeds = self.outside_embedding(outside) #(batch_size, 1, emb_size)
        
        center_bias    = self.center_bias(center).squeeze(1)
        target_bias    = self.outside_bias(outside).squeeze(1)
        
        inner_product  = outside_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #(batch_size, 1, emb_size) @ (batch_size, emb_size, 1) = (batch_size, 1, 1) = (batch_size, 1)
        
        loss = weighting * torch.pow(inner_product + center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)
    
    def get_embed(self, word):
        word2index = self.word2index
        
        try:
            index = word2index[word]
        except:
            index = word2index['<UNK>']
            
        word = torch.LongTensor([index])
        
        embed_c = self.center_embedding(word)
        embed_o = self.outside_embedding(word)
        embed   = (embed_c + embed_o) / 2
        
        return embed[0][0].item(), embed[0][1].item()

In [91]:
#test our system
voc_size = len(vocabs)
emb_size = 2
model = Glove(voc_size, emb_size,word2index)

In [92]:
x_tensor = torch.LongTensor(x)
y_tensor = torch.LongTensor(y)
cooc_tensor = torch.FloatTensor(cooc)
weighting_tensor = torch.FloatTensor(weighting)

In [93]:
loss = model(x_tensor, y_tensor, cooc_tensor, weighting_tensor)

In [94]:
loss

tensor(0., grad_fn=<SumBackward0>)

# 5. Training

In [95]:
batch_size     = 2 # mini-batch size
embedding_size = 2 #so we can later plot
model          = Glove(voc_size, embedding_size,word2index)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [96]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [97]:
import time

# Training
num_epochs = 100
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 10 == 0:
        print(f"Epoch: {epoch + 1} | loss: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")


Epoch: 10 | loss: 0.000000 | time: 0m 1s
Epoch: 20 | loss: 0.000000 | time: 0m 1s
Epoch: 30 | loss: 0.000000 | time: 0m 2s
Epoch: 40 | loss: 0.000000 | time: 0m 2s
Epoch: 50 | loss: 0.000000 | time: 0m 1s
Epoch: 60 | loss: 0.000000 | time: 0m 1s
Epoch: 70 | loss: 0.463153 | time: 0m 1s
Epoch: 80 | loss: 162.456757 | time: 0m 1s
Epoch: 90 | loss: 0.000000 | time: 0m 1s
Epoch: 100 | loss: 0.000000 | time: 0m 1s


# 6. Testing

In [98]:
def open_file(path_to_file):
    # Open the file in read mode
    try:
        with open(path_to_file, 'r') as file:
            content = file.readlines()
    except FileNotFoundError:
        print(f"The file {path_to_file} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return content

In [99]:
# File paths for semantic and syntactic data
semantic_file_path = "test/semantic_capital_country.txt"
syntatic_file_path = "test/syntatic_past_tense.txt"

# Function to read content from a file
def open_file(file_path):
    with open(file_path, "r") as file:
        return file.readlines()

# Load semantic and syntactic data
semantic = [line.strip() for line in open_file(semantic_file_path) if line.strip()]
syntatic = [line.strip() for line in open_file(syntatic_file_path) if line.strip()]

current_test = semantic 
for sent in semantic + syntatic:  
    if sent.startswith(":"):  
        current_test = syntatic
        continue
    current_test.append(sent.strip())
    
    current_test.append(sent.strip())

In [100]:
vector_space = []

for word in vocabs:
    vector_space.append(model.get_embed(word))

vector_space = np.array(vector_space)

In [101]:
#scipy version
from scipy import spatial

def cos_sim(a, b):
    cos_sim = 1 - spatial.distance.cosine(a, b)  #distance = 1 - similarlity, because scipy only gives distance
    return cos_sim

def cos_sim_scores(vector_space, target_vector):
    scores = []
    for each_vect in vector_space:
        each_vect = tuple(each_vect)
        target_vector=tuple(target_vector)
        scores.append(cos_sim(target_vector, each_vect))

    return np.array(scores)

In [102]:
def similarity(model, test_data):
    words = test_data.split(" ")

    embed0 = np.array(model.get_embed(words[0]))
    embed1 = np.array(model.get_embed(words[1]))
    embed2 = np.array(model.get_embed(words[2]))

    similar_vector = embed1 - embed0 + embed2

    similarity_scores = cos_sim_scores(vector_space, similar_vector)
    max_score_idx = np.argmax(similarity_scores)
    similar_word = index2word[max_score_idx]

    result = False
    if similar_word == words[3]:
        result = True

    return result

# Syntactic Accuracy

In [103]:
syn_total = len(syntatic)
syn_correct = 0
for sent in syntatic:
    if similarity(model, sent):
        syn_correct += 1

In [104]:
syn_accuracy = syn_correct / syn_total
print(f"Syntatic accuracy: {syn_accuracy:2.2f}")

Syntatic accuracy: 0.00


# Semantic Accuracy

In [105]:
sem_total = len(semantic)
sem_correct = 0
for sent in semantic:
    if similarity(model, sent):
        sem_correct += 1

In [106]:
sem_accuracy = sem_correct / sem_total
print(f"Semantic accuracy: {sem_accuracy:2.2f}")

Semantic accuracy: 0.00


# Similarity Accuracy

In [107]:
file_path = "test/wordsim_similarity_goldstandard.txt"

content = open_file(file_path)

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [108]:
def compute_similarity(model, test_data):
    words = test_data.split("\t")

    embed0 = np.array(model.get_embed(words[0].strip()))
    embed1 = np.array(model.get_embed(words[1].strip()))

    similarity_model = embed1 @ embed0.T
    similarity_provided = float(words[2].strip())

    return similarity_provided, similarity_model

In [109]:
ds_scores = []
model_scores = []
for sent in sim_data:
    ds_score, model_score = compute_similarity(model, sent)

    ds_scores.append(ds_score)
    model_scores.append(model_score)

In [110]:
from scipy.stats import spearmanr

corr = spearmanr(ds_scores, model_scores)[0]

print(f"Correlation between the dataset metrics and model scores is {corr:2.2f}.")

Correlation between the dataset metrics and model scores is -0.01.


# 7. Save the model and data

In [111]:
import pickle
import os

In [112]:
model_dir = './app/models'
os.makedirs(model_dir, exist_ok=True)

torch.save(model.state_dict(), os.path.join(model_dir, 'glove.model'))

skipgram_args = {
    'voc_size': voc_size,
    'emb_size': emb_size,
    'word2index': word2index,
}
with open(os.path.join(model_dir, 'glove.args'), 'wb') as f:
    pickle.dump(skipgram_args, f)


In [113]:
skg_args = pickle.load(open('./app/models/glove.args', 'rb'))
load_model = Glove(**skg_args)
load_model.load_state_dict(torch.load('./app/models/glove.model'))

  load_model.load_state_dict(torch.load('./app/models/glove.model'))


<All keys matched successfully>

In [114]:
load_model.get_embed('Heritage')

(0.7706090807914734, -0.39769792556762695)

In [115]:
load_model.get_embed('apple')

(-0.4270056188106537, 0.49586576223373413)

In [116]:
load_model.get_embed('banana')

(0.12882021069526672, -0.2197781503200531)

In [117]:
def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

vec_apple = [-0.2982, -0.2973]
vec_banana = [0.3717, -0.4830]

similarity = cosine_similarity(vec_apple, vec_banana)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.12763203458681888
