Comparison between syntactic and semantic accuracy for Glove

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

In [2]:
#Load data
corpus=["lion tiger carnivore", "tiger lion carnivore", "tiger carnivore lion ", "giraffe zebra herbivore", "zebra giraffe herbivore", "giraffe herbivore zebra", "honda toyota car", "toyota honda car"," honda car toyota","chicken beef meat","beef chicken meat","chicken meat beef", "potato cucumber vegetable","cucumber potato vegetable ", "potato vegetable cucumber", "bee ant insect","ant bee insect","bee insect ant","tennis cricket game","cricket tennis game","tennis game cricket","python java program","java python program","python program java","english hindi language","hindi english language","english language hindi","full time part time student","part time full time student","full time student part time","cocacola pepsi soda","pepsi cocacola soda","cocacola soda pepsi","asus macbook laptop","macbook asus laptop","asus laptop macbook","india brazil country","brazil india country","india country brazil","shirt pant dress","pant shirt dress","shirt dress pant","summer autumn season","autumn summer season","summer season autumn","solid liquid state","liquid solid state","solid state liquid","physics chemistry subject","chemistry physics subject","physiscs subject chemistry"]


In [3]:
corpus = [sent.split(" ") for sent in corpus]
corpus

[['lion', 'tiger', 'carnivore'],
 ['tiger', 'lion', 'carnivore'],
 ['tiger', 'carnivore', 'lion', ''],
 ['giraffe', 'zebra', 'herbivore'],
 ['zebra', 'giraffe', 'herbivore'],
 ['giraffe', 'herbivore', 'zebra'],
 ['honda', 'toyota', 'car'],
 ['toyota', 'honda', 'car'],
 ['', 'honda', 'car', 'toyota'],
 ['chicken', 'beef', 'meat'],
 ['beef', 'chicken', 'meat'],
 ['chicken', 'meat', 'beef'],
 ['potato', 'cucumber', 'vegetable'],
 ['cucumber', 'potato', 'vegetable', ''],
 ['potato', 'vegetable', 'cucumber'],
 ['bee', 'ant', 'insect'],
 ['ant', 'bee', 'insect'],
 ['bee', 'insect', 'ant'],
 ['tennis', 'cricket', 'game'],
 ['cricket', 'tennis', 'game'],
 ['tennis', 'game', 'cricket'],
 ['python', 'java', 'program'],
 ['java', 'python', 'program'],
 ['python', 'program', 'java'],
 ['english', 'hindi', 'language'],
 ['hindi', 'english', 'language'],
 ['english', 'language', 'hindi'],
 ['full', 'time', 'part', 'time', 'student'],
 ['part', 'time', 'full', 'time', 'student'],
 ['full', 'time', 's

In [4]:
#get word sequences and unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocab = list(set(flatten(corpus)))
vocab

['',
 'python',
 'honda',
 'tiger',
 'program',
 'asus',
 'brazil',
 'india',
 'hindi',
 'season',
 'pant',
 'beef',
 'soda',
 'language',
 'cocacola',
 'meat',
 'bee',
 'carnivore',
 'cricket',
 'pepsi',
 'summer',
 'country',
 'game',
 'lion',
 'time',
 'macbook',
 'ant',
 'english',
 'physics',
 'cucumber',
 'dress',
 'shirt',
 'autumn',
 'vegetable',
 'full',
 'chemistry',
 'physiscs',
 'java',
 'student',
 'giraffe',
 'subject',
 'liquid',
 'car',
 'tennis',
 'laptop',
 'herbivore',
 'potato',
 'toyota',
 'insect',
 'zebra',
 'part',
 'state',
 'chicken',
 'solid']

Now, we will do numericalization.

In [5]:
word2index = {w: i for i, w in enumerate(vocab)}
print(word2index)

{'': 0, 'python': 1, 'honda': 2, 'tiger': 3, 'program': 4, 'asus': 5, 'brazil': 6, 'india': 7, 'hindi': 8, 'season': 9, 'pant': 10, 'beef': 11, 'soda': 12, 'language': 13, 'cocacola': 14, 'meat': 15, 'bee': 16, 'carnivore': 17, 'cricket': 18, 'pepsi': 19, 'summer': 20, 'country': 21, 'game': 22, 'lion': 23, 'time': 24, 'macbook': 25, 'ant': 26, 'english': 27, 'physics': 28, 'cucumber': 29, 'dress': 30, 'shirt': 31, 'autumn': 32, 'vegetable': 33, 'full': 34, 'chemistry': 35, 'physiscs': 36, 'java': 37, 'student': 38, 'giraffe': 39, 'subject': 40, 'liquid': 41, 'car': 42, 'tennis': 43, 'laptop': 44, 'herbivore': 45, 'potato': 46, 'toyota': 47, 'insect': 48, 'zebra': 49, 'part': 50, 'state': 51, 'chicken': 52, 'solid': 53}


In [6]:
#vocab size
voc_size = len(vocab)
print(voc_size)

54


In [7]:
#append UNK
vocab.append('<UNK>')

In [8]:
vocab

['',
 'python',
 'honda',
 'tiger',
 'program',
 'asus',
 'brazil',
 'india',
 'hindi',
 'season',
 'pant',
 'beef',
 'soda',
 'language',
 'cocacola',
 'meat',
 'bee',
 'carnivore',
 'cricket',
 'pepsi',
 'summer',
 'country',
 'game',
 'lion',
 'time',
 'macbook',
 'ant',
 'english',
 'physics',
 'cucumber',
 'dress',
 'shirt',
 'autumn',
 'vegetable',
 'full',
 'chemistry',
 'physiscs',
 'java',
 'student',
 'giraffe',
 'subject',
 'liquid',
 'car',
 'tennis',
 'laptop',
 'herbivore',
 'potato',
 'toyota',
 'insect',
 'zebra',
 'part',
 'state',
 'chicken',
 'solid',
 '<UNK>']

In [9]:
word2index['<UNK>'] = 0


In [10]:
#just in case we need to use
index2word = {v:k for k, v in word2index.items()} 

Now, we will build co-occurence matrix

In [11]:
from collections import Counter

X_i = Counter(flatten(corpus)) # X_i
X_i

Counter({'lion': 3,
         'tiger': 3,
         'carnivore': 3,
         '': 3,
         'giraffe': 3,
         'zebra': 3,
         'herbivore': 3,
         'honda': 3,
         'toyota': 3,
         'car': 3,
         'chicken': 3,
         'beef': 3,
         'meat': 3,
         'potato': 3,
         'cucumber': 3,
         'vegetable': 3,
         'bee': 3,
         'ant': 3,
         'insect': 3,
         'tennis': 3,
         'cricket': 3,
         'game': 3,
         'python': 3,
         'java': 3,
         'program': 3,
         'english': 3,
         'hindi': 3,
         'language': 3,
         'full': 3,
         'time': 6,
         'part': 3,
         'student': 3,
         'cocacola': 3,
         'pepsi': 3,
         'soda': 3,
         'asus': 3,
         'macbook': 3,
         'laptop': 3,
         'india': 3,
         'brazil': 3,
         'country': 3,
         'shirt': 3,
         'pant': 3,
         'dress': 3,
         'summer': 3,
         'autumn': 3,
         '

In [12]:
# Make skip gram of two size window
skip_grams = []
# loop each word sequence
# we starts from 1 because 0 has no context
# we stop at second last for the same reason
for sent in corpus:
    for i in range(2, len(sent) - 2):
        target = sent[i]
        context = [sent[i - 2], sent[i + 2]]
        for w in context:
            skip_grams.append((target, w))

skip_grams

[('part', 'full'),
 ('part', 'student'),
 ('full', 'part'),
 ('full', 'student'),
 ('student', 'full'),
 ('student', 'time')]

In [15]:
X_ik_skipgram = Counter(skip_grams) # Co-occurece in window size 1
X_ik_skipgram

Counter({('part', 'full'): 1,
         ('part', 'student'): 1,
         ('full', 'part'): 1,
         ('full', 'student'): 1,
         ('student', 'full'): 1,
         ('student', 'time'): 1})

Weighing Function

In [16]:
def weighting(w_i, w_j, X_ik):
        
    #check whether the co-occurrences exist between these two words
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #if does not exist, set it to 1
                
    x_max = 100 #100 # fixed in paper  #cannot exceed 100 counts
    alpha = 0.75
    
    #if co-occurrence does not exceed 100, scale it based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max)**alpha  #scale it
    else:
        result = 1  #if is greater than max, set it to 1 maximum
    
    return result

In [17]:
from itertools import combinations_with_replacement

X_ik = {}  #for keeping the co-occurences
weighting_dic = {} #scaling the percentage of sampling

for bigram in combinations_with_replacement(vocab, 2):
    if X_ik_skipgram.get(bigram) is not None:  #matches 
        co_occer = X_ik_skipgram[bigram]  #get the count from what we already counted
        X_ik[bigram] = co_occer + 1 # + 1 for stability issue
        X_ik[(bigram[1],bigram[0])] = co_occer+1   #count also for the opposite
    else:
        pass
        
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

print(f"{X_ik=}")
print(f"{weighting_dic=}")

X_ik={('full', 'student'): 2, ('student', 'full'): 2, ('full', 'part'): 2, ('part', 'full'): 2}
weighting_dic={('', ''): 0.03162277660168379, ('', 'python'): 0.03162277660168379, ('python', ''): 0.03162277660168379, ('', 'honda'): 0.03162277660168379, ('honda', ''): 0.03162277660168379, ('', 'tiger'): 0.03162277660168379, ('tiger', ''): 0.03162277660168379, ('', 'program'): 0.03162277660168379, ('program', ''): 0.03162277660168379, ('', 'asus'): 0.03162277660168379, ('asus', ''): 0.03162277660168379, ('', 'brazil'): 0.03162277660168379, ('brazil', ''): 0.03162277660168379, ('', 'india'): 0.03162277660168379, ('india', ''): 0.03162277660168379, ('', 'hindi'): 0.03162277660168379, ('hindi', ''): 0.03162277660168379, ('', 'season'): 0.03162277660168379, ('season', ''): 0.03162277660168379, ('', 'pant'): 0.03162277660168379, ('pant', ''): 0.03162277660168379, ('', 'beef'): 0.03162277660168379, ('beef', ''): 0.03162277660168379, ('', 'soda'): 0.03162277660168379, ('soda', ''): 0.03162277660

Prepare train data

In [18]:
for c in corpus:
    print(c)

['lion', 'tiger', 'carnivore']
['tiger', 'lion', 'carnivore']
['tiger', 'carnivore', 'lion', '']
['giraffe', 'zebra', 'herbivore']
['zebra', 'giraffe', 'herbivore']
['giraffe', 'herbivore', 'zebra']
['honda', 'toyota', 'car']
['toyota', 'honda', 'car']
['', 'honda', 'car', 'toyota']
['chicken', 'beef', 'meat']
['beef', 'chicken', 'meat']
['chicken', 'meat', 'beef']
['potato', 'cucumber', 'vegetable']
['cucumber', 'potato', 'vegetable', '']
['potato', 'vegetable', 'cucumber']
['bee', 'ant', 'insect']
['ant', 'bee', 'insect']
['bee', 'insect', 'ant']
['tennis', 'cricket', 'game']
['cricket', 'tennis', 'game']
['tennis', 'game', 'cricket']
['python', 'java', 'program']
['java', 'python', 'program']
['python', 'program', 'java']
['english', 'hindi', 'language']
['hindi', 'english', 'language']
['english', 'language', 'hindi']
['full', 'time', 'part', 'time', 'student']
['part', 'time', 'full', 'time', 'student']
['full', 'time', 'student', 'part', 'time']
['cocacola', 'pepsi', 'soda']
['pe

In [19]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #convert to id since our skip_grams is word, not yet id
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    random_inputs = []
    random_labels = []
    random_coocs  = []
    random_weightings = []
    random_index = np.random.choice(range(len(skip_grams_id)), batch_size, replace=False) #randomly pick without replacement
        
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])  # target, e.g., 2
        random_labels.append([skip_grams_id[i][1]])  # context word, e.g., 3
        
        #get cooc
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1
        random_coocs.append([math.log(cooc)])
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append([weighting])
                    
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)

Testing the method

In [20]:
#testing the method
batch_size = 2 # mini-batch size
input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)

print("Input: ", input_batch)
print("Target: ", target_batch)
print("Cooc: ", cooc_batch)
print("Weighting: ", weighting_batch)

Input:  [[34]
 [50]]
Target:  [[50]
 [38]]
Cooc:  [[0.69314718]
 [0.        ]]
Weighting:  [[0.05318296]
 [0.03162278]]


Model

In [21]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

Training the model

In [22]:
batch_size     = 10 # mini-batch size
embedding_size = 2 #so we can later plot
model          = GloVe(voc_size, embedding_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [23]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [26]:
import time

# Training
num_epochs = 20
for epoch in range(num_epochs):
    
    start = time.time()
    
    input_batch, target_batch, cooc_batch, weighting_batch = random_batch(batch_size, corpus, skip_grams, X_ik, weighting_dic)
    input_batch  = torch.LongTensor(input_batch)         #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch)        #[batch_size, 1]
    cooc_batch   = torch.FloatTensor(cooc_batch)         #[batch_size, 1]
    weighting_batch = torch.FloatTensor(weighting_batch) #[batch_size, 1]
    
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()
    
    end = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start, end)

    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

ValueError: ignored

Evaluating syntactic and semantic embeddings

In [27]:
def get_embed(word, model):
    word = torch.LongTensor([index])
    embed_center = model.embedding_embed_center_word(word)
    embed_outside = model.embedding_embed_outside_word(word)
    embed = (embed_center+ embed_outside)/2
    return embed


In [28]:
def glove_embed(word, model):
    id_tensor = torch.LongTensor([word2index[word]])
    v_embed = model.embedding_v(id_tensor)
    u_embed = model.embedding_u(id_tensor) 
    word_embed = (v_embed + u_embed) / 2 
    return word_embed