# 19 Jan - GloVe

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

### 1. Loading Data

In [2]:
#define the training corpus
#I use the Inaugural corpus from NLTK
import nltk
from nltk.corpus import inaugural
corpus = nltk.corpus.inaugural.sents()

In [3]:
#the corpus is already tokenized
print(corpus[:3])

[['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':'], ['Among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ['On', 'the', 'one', 'hand', ',', 'I', 'was', 'summoned', 'by', 'my', 'Country', ',', 'whose', 'voice', 'I', 'can', 'never', 'hear', 'but', 'with', 'veneration', 'and', 'love', ',', 'from', 'a', 'retreat', 'which', 'I', 'had', 'chosen', 'with', 'the', 'fondest', 'predilection', ',', 'and', ',', 'in', 'my', 'flattering', 'hopes', ',', 'with', 'an', 'immutable', 'decision', ',', 'as', 'the', 'asylum', 'of', 'my', 'declining', 'years', '--', 'a', 'retreat', 'which', 'was', 'rendered', 'every', 'day', 'more', 'necessary', 'as', 'well', 'as', 'more', 'dear',

In [4]:
#convert the words in the corpus into lower case
corpus_tokenized = [[]] * len(corpus)
for i in range(len(corpus)):
    corpus_tokenized[i] = [word.lower() for word in corpus[i]]

print(corpus_tokenized[:3])

[['fellow', '-', 'citizens', 'of', 'the', 'senate', 'and', 'of', 'the', 'house', 'of', 'representatives', ':'], ['among', 'the', 'vicissitudes', 'incident', 'to', 'life', 'no', 'event', 'could', 'have', 'filled', 'me', 'with', 'greater', 'anxieties', 'than', 'that', 'of', 'which', 'the', 'notification', 'was', 'transmitted', 'by', 'your', 'order', ',', 'and', 'received', 'on', 'the', '14th', 'day', 'of', 'the', 'present', 'month', '.'], ['on', 'the', 'one', 'hand', ',', 'i', 'was', 'summoned', 'by', 'my', 'country', ',', 'whose', 'voice', 'i', 'can', 'never', 'hear', 'but', 'with', 'veneration', 'and', 'love', ',', 'from', 'a', 'retreat', 'which', 'i', 'had', 'chosen', 'with', 'the', 'fondest', 'predilection', ',', 'and', ',', 'in', 'my', 'flattering', 'hopes', ',', 'with', 'an', 'immutable', 'decision', ',', 'as', 'the', 'asylum', 'of', 'my', 'declining', 'years', '--', 'a', 'retreat', 'which', 'was', 'rendered', 'every', 'day', 'more', 'necessary', 'as', 'well', 'as', 'more', 'dear',

In [5]:
#remove stop words with SpaCy
import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

for sentence in corpus_tokenized:
    for word in sentence[:]:
        if word in stopwords:
            sentence.remove(word)

print(corpus_tokenized[:3])

[['fellow', '-', 'citizens', 'senate', 'house', 'representatives', ':'], ['vicissitudes', 'incident', 'life', 'event', 'filled', 'greater', 'anxieties', 'notification', 'transmitted', 'order', ',', 'received', '14th', 'day', 'present', 'month', '.'], ['hand', ',', 'summoned', 'country', ',', 'voice', 'hear', 'veneration', 'love', ',', 'retreat', 'chosen', 'fondest', 'predilection', ',', ',', 'flattering', 'hopes', ',', 'immutable', 'decision', ',', 'asylum', 'declining', 'years', '--', 'retreat', 'rendered', 'day', 'necessary', 'dear', 'addition', 'habit', 'inclination', ',', 'frequent', 'interruptions', 'health', 'gradual', 'waste', 'committed', 'time', '.']]


In [6]:
#remove punctutations with String
import string
punctutations = string.punctuation

for sentence in corpus_tokenized:
    for word in sentence[:]:
        if word in punctutations:
            sentence.remove(word)

print(corpus_tokenized[:3])

[['fellow', 'citizens', 'senate', 'house', 'representatives'], ['vicissitudes', 'incident', 'life', 'event', 'filled', 'greater', 'anxieties', 'notification', 'transmitted', 'order', 'received', '14th', 'day', 'present', 'month'], ['hand', 'summoned', 'country', 'voice', 'hear', 'veneration', 'love', 'retreat', 'chosen', 'fondest', 'predilection', 'flattering', 'hopes', 'immutable', 'decision', 'asylum', 'declining', 'years', '--', 'retreat', 'rendered', 'day', 'necessary', 'dear', 'addition', 'habit', 'inclination', 'frequent', 'interruptions', 'health', 'gradual', 'waste', 'committed', 'time']]


In [7]:
#remove '--'
for sentence in corpus_tokenized:
    for word in sentence[:]:
        if word == '--':
            sentence.remove(word)

print(corpus_tokenized[:3])

[['fellow', 'citizens', 'senate', 'house', 'representatives'], ['vicissitudes', 'incident', 'life', 'event', 'filled', 'greater', 'anxieties', 'notification', 'transmitted', 'order', 'received', '14th', 'day', 'present', 'month'], ['hand', 'summoned', 'country', 'voice', 'hear', 'veneration', 'love', 'retreat', 'chosen', 'fondest', 'predilection', 'flattering', 'hopes', 'immutable', 'decision', 'asylum', 'declining', 'years', 'retreat', 'rendered', 'day', 'necessary', 'dear', 'addition', 'habit', 'inclination', 'frequent', 'interruptions', 'health', 'gradual', 'waste', 'committed', 'time']]


In [8]:
#get unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus_tokenized)))

In [9]:
#numericalize the vocabs
word2index = {w: i for i, w in enumerate(vocabs)}

In [10]:
#append <UNK>
vocabs.append('<UNK>')
word2index['<UNK>'] = len(word2index)

### 2. GloVe

#### 2.1. Co-occurence Matrix

In [39]:
#count the frequency of each word
from collections import Counter

X_i = Counter(flatten(corpus_tokenized))

X_i['fellow']

158

In [40]:
#generate skipgrams with a generic window size
def generate_skip_gram(window_size): 
    skip_grams = []
    for sentence in corpus_tokenized:
        #I include the first and last words
        #so that every word could be a center word
        for i in range(len(sentence)):
            center = sentence[i]
            context = []
            for j in range(window_size):
                if (i - window_size + j) >= 0:
                    context.append(sentence[i - window_size + j])
            for k in range(1, window_size + 1):
                if (i + k) < len(sentence):
                    context.append(sentence[i + k])
            for w in context:
                skip_grams.append((center, w))
        
    return skip_grams

In [41]:
#prepare skipgrams with window size of 2
skip_grams = generate_skip_gram(2)

skip_grams[:5]

[('fellow', 'citizens'),
 ('fellow', 'senate'),
 ('citizens', 'fellow'),
 ('citizens', 'senate'),
 ('citizens', 'house')]

In [42]:
#count co-occurences in the skipgrams
X_ik_skipgram = Counter(skip_grams)

In [43]:
print(X_ik_skipgram[('fellow', 'citizens')])
print(X_ik_skipgram[('fellow', 'communists')])

117
0


#### 2.2. Weighting Function

In [44]:
#define the weighting function
def weighting(w_i, w_j, X_ik):
    
    x_ij = X_ik[(w_i, w_j)]

    #label smoothing if there is no co-occurence (i.e., x_ij is 0)
    if x_ij == 0:
        x_ij = 1
        
    #maximum co-occurrences is 100 according to the paper
    x_max = 100
    alpha = 0.75
    
    #if the co-occurrences does not exceed x_max, scale it down based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max) ** alpha
    else:
        result = 1 #the maximum probability
        
    return result

In [45]:
#test the weighting function
w_i  = 'fellow'
w_j  = 'citizens'

print(weighting(w_i, w_j, X_ik_skipgram))

1


In [46]:
w_i  = 'fellow'
w_j  = 'communists'

print(weighting(w_i, w_j, X_ik_skipgram))
print((1 / 100) ** 0.75)

0.03162277660168379
0.03162277660168379


In [19]:
#apply this weighting to all possible pairs
from itertools import combinations_with_replacement

X_ik = {} #for keeping the co-occurrences
weighting_dic = {} #for keeping all the probabilities after passing through the weighting function

for bigram in combinations_with_replacement(vocabs, 2):  #we need to also think its reverse
    #if this bigram exists in X_ik_skipgrams
    #we gonna add this to our co-occurence matrix
    if X_ik_skipgram.get(bigram) is not None:
        cooc = X_ik_skipgram[bigram]  #get the co-occurrence
        X_ik[bigram] = cooc + 1 #this is again basically label smoothing
        X_ik[(bigram[1], bigram[0])] = cooc + 1  #trick to get all pairs
    else: #otherwise, put 0
        X_ik[bigram] = 0
        X_ik[(bigram[1], bigram[0])] = 0

    #apply the weighting function using this co-occurrence matrix thingy    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)

In [20]:
#test the weighting function
print(X_ik_skipgram[('senate', 'house')])
print(X_ik_skipgram[('house', 'senate')])

print(X_ik[('senate', 'house')])
print(X_ik[('house', 'senate')])

print(weighting_dic[('senate', 'house')])
print(weighting_dic[('house', 'senate')])

print((5 / 100) ** 0.75)

4
4
5
5
0.10573712634405642
0.10573712634405642
0.10573712634405642


In [21]:
print(X_ik_skipgram[('communists', 'communists')])
print(X_ik[('communists', 'communists')])
print(weighting_dic[('communists', 'communists')])
print((1 / 100) ** 0.75)

0
0
0.03162277660168379
0.03162277660168379


#### 2.3. Preparing Training Data

In [22]:
#random batch for GloVe with generic batch size, corpus and skipgrams
import math

def random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic):
    
    #change words in the skipgrams to idices
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly pick "batch_size" indices
    number_of_choices = len(skip_grams_id)
    random_index = np.random.choice(number_of_choices, batch_size, replace=False) #no repeating indexes among these random indexes
    
    random_inputs = [] #xi, wi (in batches)
    random_labels = [] #xj, wj (in batches)
    random_coocs  = [] #Xij (in batches)
    random_weightings = [] #weighting_dic(Xij) (in batches)
    #for each of the sample in these indexes
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]])
        random_labels.append([skip_grams_id[i][1]])
        
        #get cooc
        #first check whether it exists...
        pair = skip_grams[i]
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1 #label smoothing
            
        random_coocs.append([math.log(cooc)])
        #log according to the cost function equation
        #bracket because neural network requires size ( , 1)
        
        #get weighting
        weighting = weighting_dic[pair]
        random_weightings.append(weighting)

        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)
    

In [23]:
#test the method
batch_size = 2
inputs, targets, coocs, weightings = random_batch_glove(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)

inputs, targets, coocs, weightings

(array([[4651],
        [2868]]),
 array([[7867],
        [3419]]),
 array([[0.69314718],
        [3.76120012]]),
 array([0.05318296, 0.53100834]))

#### 2.4. Model

In [24]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # context embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, context_words, coocs, weightings):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        context_bias = self.u_bias(context_words).squeeze(1)
        
        inner_product = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs is already log
        loss = weightings * torch.pow(inner_product + center_bias + context_bias - coocs, 2)
        
        return torch.sum(loss)

#### 2.5. Training

In [25]:
#set training parameters
batch_size  = 10
vocab_size  = len(vocabs)
emb_size    = 50
model       = GloVe(vocab_size, emb_size)
optimizer   = optim.Adam(model.parameters(), lr=0.001)

In [26]:
#calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time // 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [27]:
#train the model
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    inputs, targets, coocs, weightings = random_batch_glove(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)
    input_batch     = torch.LongTensor(inputs)
    target_batch    = torch.LongTensor(targets)
    cooc_batch      = torch.FloatTensor(coocs)
    weighting_batch = torch.FloatTensor(weightings)

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | cost: 188.933212 | time: 0m 9s
Epoch: 200 | cost: 295.503082 | time: 0m 19s
Epoch: 300 | cost: 203.059021 | time: 0m 29s
Epoch: 400 | cost: 431.108551 | time: 0m 38s
Epoch: 500 | cost: 476.008148 | time: 0m 47s


#### 2.6. Saving Model

In [28]:
#save the GloVe model with pickle
import pickle

pickle.dump(model, open('GloVe.pkl', 'wb'))

### 3. CBOW

In [11]:
#random batch for CBOW model with default window size and batch size of 1 each
def random_batch_cbow(window_size=1, batch_size=1): 
    cbow = []
    for sentence in corpus_tokenized:
        #I include the first and last words
        #so that every word could be a center word
        for i in range(len(sentence)):
            center = word2index[sentence[i]]
            context = []
            context_idx = []
            #group the indices of the context words
            for j in range(window_size):
                context_idx.append(i - window_size + j)
            for k in range(1, window_size + 1):
                context_idx.append(i + k)
            #append the context words based on their indices
            #append <UNK> if there is no word at an index
            for idx in context_idx:
                if idx < 0:
                    context.append(word2index['<UNK>'])
                elif idx >= len(sentence):
                    context.append(word2index['<UNK>'])
                else:
                    context.append(word2index[sentence[idx]])
            cbow.append([context, center])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(cbow)), batch_size, replace=False)
    for i in random_index:
        random_inputs.append(cbow[i][0])
        random_labels.append([cbow[i][1]])
            
    return np.array(random_inputs), np.array(random_labels)

In [12]:
#test the CBOW method
input_batch, target_batch = random_batch_cbow(2, 10)

print("Input: ", input_batch)
print("Target: ", target_batch)

Input:  [[4210 3418 8289 6532]
 [8140 1099 5047 2694]
 [3691 5801 2886 6777]
 [9019 2365 4130 6921]
 [9019 6789 6584 2886]
 [4372  865 6559 8434]
 [3845 8316 8807 1210]
 [7217 1296 5234 5276]
 [9019 9019  425 4281]
 [6251 2233 9019 9019]]
Target:  [[4749]
 [2548]
 [4988]
 [3097]
 [7087]
 [4742]
 [7149]
 [8181]
 [3418]
 [8904]]


In [13]:
#CBOW model
class CBOW(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(CBOW,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, context_words, center_words, all_vocabs):
        center_embeds = self.embedding_v(center_words)  #[batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words)  #[batch_size, window_size, emb_size]
        all_embeds    = self.embedding_v(all_vocabs)    #[batch_size, vocab_size, emb_size]
        
        scores      = center_embeds.bmm(context_embeds.transpose(1, 2)).squeeze(2)

        norm_scores = all_embeds.bmm(context_embeds.transpose(1, 2)).squeeze(2)

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1)))
            
        return nll # negative log likelihood

In [14]:
#set training parameters
window_size = 2
batch_size = 10
vocab_size = len(vocabs)
emb_size = 50
model = CBOW(vocab_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

#convert all vocabs to tensors
def prepare_sequence(vocabs, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], vocabs))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

#calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time // 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#train the model
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch_cbow(window_size, batch_size)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | cost: 24.164333 | time: 0m 36s
Epoch: 200 | cost: 25.088284 | time: 1m 13s
Epoch: 300 | cost: 22.174625 | time: 1m 51s
Epoch: 400 | cost: 22.098713 | time: 2m 29s
Epoch: 500 | cost: 21.277281 | time: 3m 6s


In [17]:
#save the CBOW model
pickle.dump(model, open('CBOW.pkl', 'wb'))

### 4. Skip-gram

In [18]:
#random batch for skip-gram model with default window size and batch size of 1 each
def random_batch_skip_gram(window_size=1, batch_size=1): 
    skip_grams = []
    for sentence in corpus_tokenized:
        #I include the first and last words
        #so that every word could be a center word
        for i in range(len(sentence)):
            center = word2index[sentence[i]]
            context = []
            for j in range(window_size):
                if (i - window_size + j) >= 0:
                    context.append(word2index[sentence[i - window_size + j]])
            for k in range(1, window_size + 1):
                if (i + k) < len(sentence):
                    context.append(word2index[sentence[i + k]])
            for w in context:
                skip_grams.append([center, w])
    
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
    for i in random_index:
        random_inputs.append([skip_grams[i][0]])
        random_labels.append([skip_grams[i][1]])
            
    return np.array(random_inputs), np.array(random_labels)

In [19]:
#test the skip-gram method
input_batch, target_batch = random_batch_skip_gram(2, 10)

print("Input: ", input_batch)
print("Target: ", target_batch)

Input:  [[   5]
 [ 301]
 [8579]
 [6921]
 [2760]
 [1147]
 [6559]
 [6218]
 [3865]
 [2070]]
Target:  [[2430]
 [4110]
 [6971]
 [2886]
 [ 890]
 [ 497]
 [6330]
 [1788]
 [4703]
 [7538]]


In [20]:
#Skip-gram model
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
    
    def forward(self, center_words, context_words, all_vocabs):
        center_embeds = self.embedding_v(center_words)  #[batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words)  #[batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs)    #[batch_size, vocab_size, emb_size]
        
        scores      = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, vocab_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, vocab_size, 1] = [batch_size, vocab_size]

        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        # scalar (loss must be scalar)    
            
        return nll # negative log likelihood

In [21]:
#set training parameters
window_size = 2
batch_size = 10
vocab_size = len(vocabs)
emb_size = 50
model = Skipgram(vocab_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

#convert all vocabs to tensors
def prepare_sequence(vocabs, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], vocabs))
    return torch.LongTensor(idxs)

all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

#calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time // 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#train the model
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    input_batch, target_batch = random_batch_skip_gram(window_size, batch_size)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | cost: 25.881420 | time: 0m 48s
Epoch: 200 | cost: 26.217566 | time: 1m 37s
Epoch: 300 | cost: 23.113102 | time: 2m 26s
Epoch: 400 | cost: 24.003254 | time: 3m 14s
Epoch: 500 | cost: 29.626434 | time: 4m 4s


In [22]:
#save the skip-gram model
pickle.dump(model, open('Skipgram.pkl', 'wb'))

### 5. Skip-gram with Negative Sampling

In [23]:
#count all the occurrences of vocabs
from collections import Counter
word_count = Counter(flatten(corpus_tokenized))

#count the number of total words
num_total_words = sum([c for w, c in word_count.items()])

#create the scaled-up unigram distribution table for vocabs
z = 0.001 #the scaler
unigram_table = []
for v in vocabs:
    unigram_table.extend([v] * int(((word_count[v]/num_total_words)**0.75)/z))

In [24]:
#convert word indices to tensors
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return torch.LongTensor(idxs)

#generate random negative samples
import random

def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k:
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

In [25]:
#test the negative sampling method
input_batch, target_batch = random_batch_skip_gram(2, 10)
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

num_neg = 5 #number of negative samples for each target word

neg_samples = negative_sampling(target_batch, unigram_table, num_neg)

neg_samples

tensor([[5630, 5219, 2602, 8259, 2688],
        [6113,  703, 4486, 1417, 2741],
        [4723, 4997, 8459, 8681, 5108],
        [6701, 5953, 1590,  497,  925],
        [1211, 4928,  782, 3051, 8198],
        [2287, 2855, 2776, 5155, 6376],
        [5710,  603, 5553, 7308, 2859],
        [2013, 8716, 4389, 3051, 2741],
        [4052, 5832,  938, 5106, 6535],
        [7618, 2631, 4742, 1485, 3807]])

In [26]:
#Skip-gram with negative sampling model
class SkipgramNegSampling(nn.Module):
    
    def __init__(self, vocab_size, emb_size):
        super(SkipgramNegSampling, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
        self.logsigmoid = nn.LogSigmoid()
                    
    def forward(self, center_words, context_words, neg_samples):
        center_embeds  = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words) # [batch_size, 1, emb_size]
        neg_embeds     = self.embedding_u(neg_samples) # [batch_size, num_neg, emb_size]
        
        positive_score = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        negative_score = -neg_embeds.bmm(center_embeds.transpose(1, 2))
        #[batch_size, k, emb_size] @ [batch_size, emb_size, 1] = [batch_size, num_neg, 1]
        
        loss = -torch.mean(self.logsigmoid(positive_score) + torch.sum(self.logsigmoid(negative_score), 1))
                
        return loss

In [27]:
#set parameters
window_size = 2
batch_size  = 10
vocab_size  = len(vocabs)
emb_size    = 50
model       = SkipgramNegSampling(vocab_size, emb_size)
num_neg     = 10
optimizer   = optim.Adam(model.parameters(), lr=0.001)

#calculate epoch time
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time // 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

#train the model
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):    
    input_batch, target_batch = random_batch_skip_gram(window_size, batch_size)
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)
    neg_samples = negative_sampling(target_batch, unigram_table, num_neg)
    
    optimizer.zero_grad()   
    loss = model(input_batch, target_batch, neg_samples)
    
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

Epoch: 100 | cost: 31.318594 | time: 0m 45s
Epoch: 200 | cost: 22.963648 | time: 1m 32s
Epoch: 300 | cost: 27.809229 | time: 2m 21s
Epoch: 400 | cost: 37.649330 | time: 3m 12s
Epoch: 500 | cost: 34.258484 | time: 3m 59s


In [28]:
#save the skip-gram with negative sampling model
pickle.dump(model, open('SkipgramNegSampling.pkl', 'wb'))

### 6. Testing

#### 6.1. Preparing Test Data

In [29]:
#load the dataset for testing
file_path = "data/questions-words.txt"

with open(file_path, 'r') as f:
    contents = f.read()
    data = contents.split('\n')

data[:5]

[': capital-common-countries',
 'Athens Greece Baghdad Iraq',
 'Athens Greece Bangkok Thailand',
 'Athens Greece Beijing China',
 'Athens Greece Berlin Germany']

In [30]:
#explore the dataset
for idx, sent in enumerate(data):
    if sent[0] == ':':
        print(idx, sent)

0 : capital-common-countries
507 : capital-world
5032 : currency
5899 : city-in-state
8367 : family
8874 : gram1-adjective-to-adverb
9867 : gram2-opposite
10680 : gram3-comparative
12013 : gram4-superlative
13136 : gram5-present-participle
14193 : gram6-nationality-adjective
15793 : gram7-past-tense
17354 : gram8-plural
18687 : gram9-plural-verbs


In [31]:
#create the corpora for testing
family = data[8368:8874]
family_tokenized = [sent.split(' ') for sent in family]
print(family_tokenized[:5])

plural = data[17355:18687]
plural_tokenized = [sent.split(' ') for sent in plural]
print(plural_tokenized[:5])


[['boy', 'girl', 'brother', 'sister'], ['boy', 'girl', 'brothers', 'sisters'], ['boy', 'girl', 'dad', 'mom'], ['boy', 'girl', 'father', 'mother'], ['boy', 'girl', 'grandfather', 'grandmother']]
[['banana', 'bananas', 'bird', 'birds'], ['banana', 'bananas', 'bottle', 'bottles'], ['banana', 'bananas', 'building', 'buildings'], ['banana', 'bananas', 'car', 'cars'], ['banana', 'bananas', 'cat', 'cats']]


In [32]:
#get unique words
flatten = lambda l: [item for sublist in l for item in sublist]
family_vocabs  = list(set(flatten(family_tokenized)))
plural_vocabs  = list(set(flatten(plural_tokenized)))

In [33]:
#numericalize the vocabs
family_word2index = {w: i for i, w in enumerate(family_vocabs)}
plural_word2index = {w: i for i, w in enumerate(plural_vocabs)}

In [34]:
#append <UNK>
family_vocabs.append('<UNK>')
family_word2index['<UNK>'] = len(family_word2index)

plural_vocabs.append('<UNK>')
plural_word2index['<UNK>'] = len(plural_word2index)

In [35]:
#prepare index2word
family_index2word = {i:w for w, i in family_word2index.items()}
plural_index2word = {i:w for w, i in plural_word2index.items()}

#### 6.1. Syntactic Test

##### The 'plural' corpus will be used for syntactic analogies.

In [36]:
#get embedding
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
    
    word = torch.LongTensor([index])

    center_embed  = model.embedding_v(word)
    context_embed = model.embedding_u(word)
    
    embed = (center_embed + context_embed) / 2
    
    return  embed[0].detach().numpy()

In [37]:
#prepare word2index and index2word
word2index = plural_word2index
index2word = {i:w for w, i in word2index.items()}

In [38]:
#get embeddings for GloVe
model = pickle.load(open('GloVe.pkl', 'rb'))
GloVe_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    GloVe_embeds.update({word: embed})

AttributeError: Can't get attribute 'GloVe' on <module '__main__'>

In [None]:
#get embeddings for CBOW
model = pickle.load(open('CBOW.pkl', 'rb'))
CBOW_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    CBOW_embeds.update({word: embed})

In [None]:
#get embeddings for Skipgram
model = pickle.load(open('Skipgram.pkl', 'rb'))
Skipgram_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    Skipgram_embeds.update({word: embed})

In [None]:
#get embeddings for SkipgramNegSampling
model = pickle.load(open('SkipgramNegSampling.pkl', 'rb'))
SkipgramNegSampling_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    SkipgramNegSampling_embeds.update({word: embed})

In [None]:
#cosine similarity function
from numpy import dot
from numpy.linalg import norm

def cos_sim(a, b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

In [None]:
#analogy function
def analogy(a, b, c, embeds, vocabs):
    d_vector = embeds[c] - embeds[a] + embeds[b]

    similarity = -1
    for vocab in vocabs:
        if vocab not in [a, b, c]:
            if cos_sim(d_vector, embeds[vocab]) > similarity:
                similarity = cos_sim(d_vector, embeds[vocab])
                d = (vocab, similarity)
    
    return d

In [None]:
analogy('car', 'cars', 'lion', GloVe_embeds, plural_vocabs)

In [None]:
#accuracy function
def accuracy(label, pred):
    if pred == label:
        True
    else:
        False

In [None]:
#count the syntactic accuracies of model embeddings
models = ['GloVe', 'CBOW', 'Skipgram', 'SkipgramNegSampling']
embeds = [GloVe_embeds, CBOW_embeds, Skipgram_embeds, SkipgramNegSampling_embeds]

for i, model in enumerate(models):
    embed = embeds[i]
    accuracy_count = 0
    for sent in plural_tokenized:
        label = sent[-1]
        a, b, c = sent[:-1]
        pred = analogy(a, b, c, embed, plural_vocabs)[0]
        if accuracy(label, pred) is True:
            accuracy_count += 1
    
    print(f'The accuracy of {model} is {accuracy_count/len(plural_tokenized)}%.')

#### 6.2. Semantic Test

The 'family' corpus will be used for semantic analogies.

In [None]:
#prepare word2index and index2word
word2index = family_word2index
index2word = {i:w for w, i in word2index.items()}

In [None]:
#get embeddings for GloVe
model = pickle.load(open('GloVe.pkl', 'rb'))
GloVe_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    GloVe_embeds.update({word: embed})

In [None]:
#get embeddings for CBOW
model = pickle.load(open('CBOW.pkl', 'rb'))
CBOW_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    CBOW_embeds.update({word: embed})

In [None]:
#get embeddings for Skipgram
model = pickle.load(open('Skipgram.pkl', 'rb'))
Skipgram_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    Skipgram_embeds.update({word: embed})

In [None]:
#get embeddings for SkipgramNegSampling
model = pickle.load(open('SkipgramNegSampling.pkl', 'rb'))
SkipgramNegSampling_embeds = {}

for i in range(len(word2index)):
    word = index2word[i]
    embed = get_embed(word)
    SkipgramNegSampling_embeds.update({word: embed})

In [None]:
#count the semantic accuracies of model embeddings
models = ['GloVe', 'CBOW', 'Skipgram', 'SkipgramNegSampling']
embeds = [GloVe_embeds, CBOW_embeds, Skipgram_embeds, SkipgramNegSampling_embeds]

for i, model in enumerate(models):
    embed = embeds[i]
    accuracy_count = 0
    for sent in family_tokenized:
        label = sent[-1]
        a, b, c = sent[:-1]
        pred = analogy(a, b, c, embed, family_vocabs)[0]
        if accuracy(label, pred) is True:
            accuracy_count += 1
    
    print(f'The accuracy of {model} is {accuracy_count/len(plural_tokenized)}%.')

#### 6.4. Findings and Conclusion

**The 4 embedding models were trained with the following parameters:**
- window size = 2
- batch size = 10
- embedding size = 50
- number of negative samples = 10
- optimizer = Adam
- learning rate = 0.001

**Then, the embeddings were tested on syntactic and semantic analogies. The test results are as follows:**

| Model | Syntactic Accuracy | Semantic Accuracy  |
| --- | --- | ---  |
| GloVe | 0.0 | 0.0  |
| CBOW | 0.0 | 0.0  |
| Skip-gram | 0.0 | 0.0  |
| Skip-gram (Neg) | 0.0 | 0.0  |

**All models achieved 0% accuracy. It may be due to the limited size of the corpus.**

### 7. Word Similarity

In [None]:
#load data
import pandas as pd
file_path = "data/wordsim_similarity_goldstandard.txt"
df = pd.read_table(file_path, header=None)
df.head()

In [None]:
#extract features and labels
x1 = df.iloc[:, 0]
x2 = df.iloc[:, 1]
y  = df.iloc[:, 2]

In [None]:
#recall the Inaugural corpus
print(corpus_tokenized[:3])

In [None]:
#prepare word2index
vocabs = list(set(flatten(corpus_tokenized)))
word2index = {w: i for i, w in enumerate(vocabs)}

vocabs.append('<UNK>')
word2index['<UNK>'] = len(word2index)

In [None]:
#get embedding
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
    
    word = torch.LongTensor([index])

    center_embed  = model.embedding_v(word)
    context_embed = model.embedding_u(word)
    
    embed = (center_embed + context_embed) / 2
    
    return  embed[0].detach().numpy()

In [None]:
#calculate Spearman's correlation
from scipy import stats

def correlation(x1, x2):
    x1_embed = get_embed(x1)
    x2_embed = get_embed(x2)
    return stats.spearmanr(x1_embed, x2_embed)[0]

In [None]:
#test
model = pickle.load(open('GloVe.pkl', 'rb'))

print(correlation('tiger', 'cat'))
print(correlation('tiger', 'tiger'))
print(correlation('plane', 'car'))
print(correlation('train', 'car'))
print(correlation('television', 'radio'))

In [None]:
model = pickle.load(open('CBOW.pkl', 'rb'))

print(correlation('tiger', 'cat'))
print(correlation('tiger', 'tiger'))
print(correlation('plane', 'car'))
print(correlation('train', 'car'))
print(correlation('television', 'radio'))

In [None]:
#calcuate word similarity correlations of model embeddings
models = ['GloVe.pkl', 'CBOW.pkl', 'Skipgram.pkl', 'SkipgramNegSampling.pkl']
yhats = [[]] * len(models)

for m in range(len(models)):
    model = pickle.load(open(models[m], 'rb'))

    yhat_list = []
    for i in range(len(y)):
        yhat_list.append(correlation(x1[i], x2[i])*10)

    yhats[m] = yhat_list
    
    print(yhats[m][:5])

In [None]:
#evaluate word similarity correlations of model embeddings
from sklearn.metrics import mean_squared_error as mse

models = ['GloVe', 'CBOW', 'Skipgram', 'SkipgramNegSampling']

for m in range(len(models)):
    yhat = yhats[m]
    loss = mse(y, yhat)
    print(f'MSE of the word similarity correlations of {models[m]} is {loss:.2f}.')

**Although CBOW gets the lowest error, all models perform more or less the same in terms of word similarity based on the WS353 dataset. However, as many words from the dataset are unknown in the corpus, the performance scores are not reliable.**