# 19 Jan - GloVe

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

### 1. Loading Data

In [2]:
#load the training corpus: the sentences of the Australian Broadcasting
#Commission (ABC) corpus that ships with NLTK
import nltk
from nltk.corpus import abc
corpus = abc.sents()

In [3]:
#the corpus is already tokenized
print(corpus[:3])

[['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', '.'], ['Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released', 'by', 'the', 'Cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['In', 'one', 'of', 'the', 'letters', 'Mr', 'Howard', 'asks', 'AWB', 'managing', 'director', 'Andrew', 'Lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'Government', 'on', 'Iraq', 'wheat', 'sales', '.']]


In [4]:
#lower-case every token, building one new list per sentence
corpus_tokenized = [[token.lower() for token in sentence] for sentence in corpus]

print(corpus_tokenized[:3])

[['pm', 'denies', 'knowledge', 'of', 'awb', 'kickbacks', 'the', 'prime', 'minister', 'has', 'denied', 'he', 'knew', 'awb', 'was', 'paying', 'kickbacks', 'to', 'iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'iraq', 'wheat', 'sales', '.'], ['letters', 'from', 'john', 'howard', 'and', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'to', 'awb', 'have', 'been', 'released', 'by', 'the', 'cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['in', 'one', 'of', 'the', 'letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'government', 'on', 'iraq', 'wheat', 'sales', '.']]


In [5]:
#remove stop words with SpaCy
import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

#filter every sentence in place (slice assignment keeps the same list objects)
for sentence in corpus_tokenized:
    sentence[:] = [word for word in sentence if word not in stopwords]

print(corpus_tokenized[:3])

[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales', '.'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program', '.'], ['letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales', '.']]


In [6]:
#remove punctuation tokens
import string
punctutations = string.punctuation  #name kept as-is in case other cells refer to it
punctuation_chars = set(punctutations)

#`word in punctutations` was a substring test against one long string, so
#tokens such as '--' or '...' slipped through. Drop every token that consists
#solely of punctuation characters instead (empty tokens are dropped as before).
for sentence in corpus_tokenized:
    sentence[:] = [word for word in sentence
                   if not all(char in punctuation_chars for char in word)]

print(corpus_tokenized[:3])

[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program'], ['letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales']]


In [7]:
#get unique words
def flatten(l):
    """Concatenate the items of every sub-list of `l` into one flat list."""
    return [item for sublist in l for item in sublist]

#sorted() gives every word the same position on every run; the iteration order
#of a set of strings can change between interpreter sessions (hash randomisation),
#which would silently change the word indices
vocabs = sorted(set(flatten(corpus_tokenized)))

In [8]:
#numericalize the vocabs: every word maps to its position in `vocabs`
word2index = dict(zip(vocabs, range(len(vocabs))))

In [9]:
#append <UNK> once; the guard keeps this cell idempotent, so re-running it
#neither adds a second '<UNK>' to `vocabs` nor moves its index
if '<UNK>' not in word2index:
    vocabs.append('<UNK>')
    word2index['<UNK>'] = len(word2index)

### 2. GloVe

#### 2.1. Co-occurrence Matrix

In [10]:
#count how often each word occurs in the corpus
from collections import Counter

X_i = Counter(word for sentence in corpus_tokenized for word in sentence)

X_i['minister']

337

In [11]:
#generate skipgrams with a generic window size
def generate_skip_gram(window_size, corpus=None):
    """Build (center, context) word pairs for every token of a corpus.

    Args:
        window_size: number of neighbours taken on each side of the center word.
        corpus: list of tokenized sentences (lists of words). Defaults to the
            notebook-level `corpus_tokenized`, so existing calls keep working.

    Returns:
        List of (center, context) tuples. For each center word the left
        neighbours come first, then the right neighbours, both in sentence order.
    """
    if corpus is None:
        corpus = corpus_tokenized

    #a non-positive window yields no context words at all
    span = max(window_size, 0)

    skip_grams = []
    for sentence in corpus:
        for i, center in enumerate(sentence):
            #slices are clamped at the sentence edges, so the first and last
            #words can be center words too
            left = sentence[max(0, i - span):i]
            right = sentence[i + 1:i + 1 + span]
            skip_grams.extend((center, w) for w in left + right)

    return skip_grams

In [12]:
#prepare skipgrams with window size of 2
#(up to two neighbours on each side of every center word)
skip_grams = generate_skip_gram(2)

#peek at the first five (center, context) pairs
skip_grams[:5]

[('pm', 'denies'),
 ('pm', 'knowledge'),
 ('denies', 'pm'),
 ('denies', 'knowledge'),
 ('denies', 'awb')]

In [13]:
#count co-occurrences: how many times each (center, context) pair
#appears in the skip-gram list
X_ik_skipgram = Counter(skip_grams)

#display the whole counter
X_ik_skipgram

Counter({('pm', 'denies'): 1,
         ('pm', 'knowledge'): 1,
         ('denies', 'pm'): 1,
         ('denies', 'knowledge'): 1,
         ('denies', 'awb'): 1,
         ('knowledge', 'pm'): 1,
         ('knowledge', 'denies'): 1,
         ('knowledge', 'awb'): 2,
         ('knowledge', 'kickbacks'): 1,
         ('awb', 'denies'): 1,
         ('awb', 'knowledge'): 2,
         ('awb', 'kickbacks'): 9,
         ('awb', 'prime'): 1,
         ('kickbacks', 'knowledge'): 1,
         ('kickbacks', 'awb'): 9,
         ('kickbacks', 'prime'): 1,
         ('kickbacks', 'minister'): 1,
         ('prime', 'awb'): 1,
         ('prime', 'kickbacks'): 1,
         ('prime', 'minister'): 92,
         ('prime', 'denied'): 1,
         ('minister', 'kickbacks'): 1,
         ('minister', 'prime'): 92,
         ('minister', 'denied'): 1,
         ('minister', 'knew'): 2,
         ('denied', 'prime'): 1,
         ('denied', 'minister'): 1,
         ('denied', 'knew'): 1,
         ('denied', 'awb'): 2,
     

In [14]:
print(X_ik_skipgram[('prime', 'minister')])
print(X_ik_skipgram[('prime', 'director')])

92
0


#### 2.2. Weighting Function

In [15]:
#define the weighting function
def weighting(w_i, w_j, X_ik, x_max=100, alpha=0.75):
    """GloVe weighting function applied to the co-occurrence count of a word pair.

    Args:
        w_i, w_j: the two words of the pair.
        X_ik: mapping from (w_i, w_j) tuples to co-occurrence counts.
        x_max: count at which the weight saturates at 1 (100 according to the paper).
        alpha: exponent applied to counts below `x_max`.

    Returns:
        (x_ij / x_max) ** alpha when x_ij < x_max, otherwise 1.
    """
    #.get() makes a plain dict with a missing pair behave like a Counter
    #instead of raising KeyError
    x_ij = X_ik.get((w_i, w_j), 0)

    #label smoothing if there is no co-occurrence (i.e., x_ij is 0)
    if x_ij == 0:
        x_ij = 1

    #below x_max the count is scaled down; at or above it the weight is capped
    if x_ij < x_max:
        return (x_ij / x_max) ** alpha
    return 1 #the maximum weight

In [16]:
#test the weighting function
w_i  = 'prime'
w_j  = 'minister'

print(weighting(w_i, w_j, X_ik_skipgram))

0.9393790503782488


In [17]:
w_i  = 'prime'
w_j  = 'director'

print(weighting(w_i, w_j, X_ik_skipgram))
print((1 / 100) ** 0.75)

0.03162277660168379
0.03162277660168379


In [18]:
#apply this weighting to the word pairs
from collections import Counter
from itertools import combinations_with_replacement  #no longer used in this cell; import kept

#Enumerating every pair of vocabulary words grows quadratically with the
#vocabulary in both time and memory. Only pairs that were actually observed
#are stored; a pair that never co-occurred falls back to the values the dense
#version assigned to it: a count of 0 and the weight of a smoothed count of 1.

class _LazyWeightingDict(dict):
    """Weights of the observed pairs; any other pair is computed on lookup."""

    def __missing__(self, pair):
        #X_ik reads 0 for an unseen pair, which weighting() smooths to 1
        return weighting(pair[0], pair[1], X_ik)

X_ik = Counter() #co-occurrences; a Counter reads 0 for pairs it does not hold
for (first, second), cooc in X_ik_skipgram.items():
    X_ik[(first, second)] = cooc + 1 #this is again basically label smoothing
    X_ik.setdefault((second, first), cooc + 1) #make sure the reverse pair exists too

#weights after passing the smoothed counts through the weighting function
weighting_dic = _LazyWeightingDict(
    (pair, weighting(pair[0], pair[1], X_ik)) for pair in X_ik
)

In [None]:
#test the weighting function
print(X_ik_skipgram[('managing', 'director')])
print(X_ik_skipgram[('director', 'managing')])

print(X_ik[('managing', 'director')])
print(X_ik[('director', 'managing')])

print(weighting_dic[('managing', 'director')])
print(weighting_dic[('director', 'managing')])

print((5 / 100) ** 0.75)

In [None]:
print(X_ik_skipgram[('prime', 'director')])
print(X_ik[('prime', 'director')])
print(weighting_dic[('prime', 'director')])
print((1 / 100) ** 0.75)

#### 2.3. Preparing Training Data

In [None]:
#random batch for GloVe with generic batch size, corpus and skipgrams
import math

def random_batch_glove(batch_size, corpus, skip_grams, X_ik, weighting_dic):
    """Draw a random mini-batch of skip-gram pairs for GloVe training.

    Args:
        batch_size: number of pairs to draw (without replacement).
        corpus: not used; kept so that existing calls keep working.
        skip_grams: list of (center, context) word pairs.
        X_ik: mapping from word pairs to co-occurrence counts.
        weighting_dic: mapping from word pairs to their weighting value.

    Returns:
        Tuple of numpy arrays: center indices (batch_size, 1), context indices
        (batch_size, 1), log co-occurrences (batch_size, 1) and weightings
        (batch_size,).
    """
    #randomly pick "batch_size" distinct positions in the skip-gram list
    random_index = np.random.choice(len(skip_grams), batch_size, replace=False)

    random_inputs = [] #xi, wi (in batches)
    random_labels = [] #xj, wj (in batches)
    random_coocs  = [] #log Xij (in batches)
    random_weightings = [] #weighting of Xij (in batches)
    for i in random_index:
        pair = skip_grams[i]

        #only the sampled pairs are turned into indices; converting the whole
        #skip-gram list on every call was wasted work
        random_inputs.append([word2index[pair[0]]])
        random_labels.append([word2index[pair[1]]])

        #a missing or zero count is smoothed to 1 so that the log is defined
        #(replaces a bare `except` that hid every kind of error)
        cooc = X_ik.get(pair, 0)
        if cooc <= 0:
            cooc = 1

        #log according to the cost function equation
        #bracket because the model expects size ( , 1)
        random_coocs.append([math.log(cooc)])

        random_weightings.append(weighting_dic[pair])

    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weightings)
    

In [None]:
#test the method
batch_size = 2
inputs, targets, coocs, weightings = random_batch_glove(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)

inputs, targets, coocs, weightings

#### 2.4. Model

In [None]:
class GloVe(nn.Module):
    """GloVe model with separate center/context embeddings and per-word biases."""

    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # context embedding

        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)

    def forward(self, center_words, context_words, coocs, weightings):
        """Return the summed weighted squared error of a batch.

        Args:
            center_words: LongTensor of center word indices, [batch_size, 1].
            context_words: LongTensor of context word indices, [batch_size, 1].
            coocs: log co-occurrence counts, [batch_size, 1].
            weightings: one weight per sample, [batch_size] or [batch_size, 1].

        Returns:
            Scalar tensor: sum over the batch of
            weight * (u . v + b_v + b_u - log X) ** 2.
        """
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words) # [batch_size, 1, emb_size]

        center_bias = self.v_bias(center_words).squeeze(1) # [batch_size, 1]
        context_bias = self.u_bias(context_words).squeeze(1) # [batch_size, 1]

        inner_product = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        #note that coocs is already log
        squared_error = torch.pow(inner_product + center_bias + context_bias - coocs, 2)

        #weightings arrive as [batch_size]; multiplied as-is with the
        #[batch_size, 1] errors they would broadcast to [batch_size, batch_size]
        #and pair every weight with every sample's error
        weightings = weightings.reshape(squared_error.shape)

        return torch.sum(weightings * squared_error)

#### 2.5. Training

In [None]:
#set training parameters
batch_size  = 10 #skip-gram pairs drawn per training step
vocab_size  = len(vocabs) #counts the '<UNK>' entry appended to vocabs as well
emb_size    = 50 #dimension of every embedding vector
model       = GloVe(vocab_size, emb_size)
optimizer   = optim.Adam(model.parameters(), lr=0.001)

In [None]:
#calculate epoch time
def epoch_time(start_time, end_time):
    """Split the seconds elapsed between two timestamps into whole minutes and seconds."""
    minutes, seconds = divmod(end_time - start_time, 60)
    return int(minutes), int(seconds)

In [None]:
#train the model
#note: every "epoch" below is a single optimisation step on one random mini-batch
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    #draw a fresh random batch and convert the numpy arrays to tensors
    inputs, targets, coocs, weightings = random_batch_glove(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)
    input_batch     = torch.LongTensor(inputs)
    target_batch    = torch.LongTensor(targets)
    cooc_batch      = torch.FloatTensor(coocs)
    weighting_batch = torch.FloatTensor(weightings)

    #clear old gradients, compute the loss, backpropagate and update
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, cooc_batch, weighting_batch)
    
    loss.backward()
    optimizer.step()

    #report the loss of the current batch and the time since `start` every 100 steps
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

#### 2.6. Saving Model

In [None]:
#save the GloVe model with pickle
import pickle

#the with-block closes the file even if pickling fails
with open('GloVe.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### 3. CBOW

In [None]:
#random batch for CBOW model with default window size and batch size of 1 each
def random_batch_cbow(window_size=1, batch_size=1):
    """Draw a random CBOW batch from the notebook-level `corpus_tokenized`.

    Every word of every sentence is a center word. Its context is the
    `window_size` positions on each side; positions that fall outside the
    sentence are filled with the index of '<UNK>'.

    Returns:
        Tuple of numpy arrays: the context indices of each sample and the
        center index of each sample (wrapped in a one-element list).
    """
    #positions of the context words relative to the center word
    offsets = [*range(-window_size, 0), *range(1, window_size + 1)]

    cbow = []
    for sentence in corpus_tokenized:
        length = len(sentence)
        for i, center_word in enumerate(sentence):
            center = word2index[center_word]
            context = [
                word2index[sentence[i + offset]] if 0 <= i + offset < length
                else word2index['<UNK>']
                for offset in offsets
            ]
            cbow.append([context, center])

    #sample distinct examples
    random_index = np.random.choice(range(len(cbow)), batch_size, replace=False)
    random_inputs = [cbow[i][0] for i in random_index]
    random_labels = [[cbow[i][1]] for i in random_index]

    return np.array(random_inputs), np.array(random_labels)

In [None]:
#test the CBOW method
input_batch, target_batch = random_batch_cbow(2, 10)

print("Input: ", input_batch)
print("Target: ", target_batch)

In [None]:
#CBOW model
class CBOW(nn.Module):
    """CBOW-style model: each context word is scored against the center word."""

    def __init__(self, vocab_size, emb_size):
        super(CBOW,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)

    def forward(self, context_words, center_words, all_vocabs):
        """Return the mean negative log-likelihood of the center words.

        Args:
            context_words: LongTensor of context indices, [batch_size, n_context].
            center_words: LongTensor of center indices, [batch_size, 1].
            all_vocabs: LongTensor of vocabulary indices, [batch_size, vocab_size].

        Returns:
            Scalar tensor: minus the mean, over samples and context words, of the
            log-softmax score of the true center word.
        """
        center_embeds = self.embedding_v(center_words)  #[batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words)  #[batch_size, n_context, emb_size]
        all_embeds    = self.embedding_v(all_vocabs)    #[batch_size, vocab_size, emb_size]

        scores      = center_embeds.bmm(context_embeds.transpose(1, 2)).squeeze(2)

        norm_scores = all_embeds.bmm(context_embeds.transpose(1, 2)).squeeze(2)

        #log(exp(s) / sum(exp(n))) == s - logsumexp(n); the logsumexp form does not
        #overflow to inf/nan when the scores are large
        log_normalizer = torch.logsumexp(norm_scores, dim=1).unsqueeze(1)
        nll = -torch.mean(scores - log_normalizer)

        return nll # negative log likelihood

In [None]:
#set training parameters
window_size = 2 #context positions taken on each side of the center word
batch_size = 10 #samples drawn per training step
vocab_size = len(vocabs) #counts the '<UNK>' entry appended to vocabs as well
emb_size = 50 #dimension of every embedding vector
model = CBOW(vocab_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

#convert all vocabs to tensors
def prepare_sequence(vocabs, word2index):
    """Return a LongTensor with the index of every word; unknown words get the "<UNK>" index."""
    idxs = []
    for w in vocabs:
        idx = word2index.get(w)
        if idx is None:
            idx = word2index["<UNK>"]
        idxs.append(idx)
    return torch.LongTensor(idxs)

#one row of all vocabulary indices, expanded to shape [batch_size, vocab_size]
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

#calculate epoch time
def epoch_time(start_time, end_time):
    """Split the seconds elapsed between two timestamps into whole minutes and seconds."""
    minutes, seconds = divmod(end_time - start_time, 60)
    return int(minutes), int(seconds)

#train the model
#note: every "epoch" below is a single optimisation step on one random mini-batch
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    #draw a fresh random batch and convert the numpy arrays to tensors
    input_batch, target_batch = random_batch_cbow(window_size, batch_size)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 2 * window_size] context indices
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1] center indices

    #clear old gradients, compute the loss, backpropagate and update
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()

    #report the loss of the current batch and the time since `start` every 100 steps
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

In [None]:
#save the CBOW model
#the with-block closes the file even if pickling fails
with open('CBOW.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### 4. Skip-gram

In [None]:
#random batch for skip-gram model with default window size and batch size of 1 each
def random_batch_skip_gram(window_size=1, batch_size=1):
    """Draw a random batch of (center, context) index pairs from `corpus_tokenized`.

    Every word of every sentence is a center word; its context words are the
    up to `window_size` neighbours on each side that exist in the sentence.

    Returns:
        Tuple of numpy arrays of shape (batch_size, 1): center indices and
        context indices.
    """
    #a non-positive window yields no context words at all
    span = max(window_size, 0)

    skip_grams = []
    for sentence in corpus_tokenized:
        ids = [word2index[word] for word in sentence]
        for i, center in enumerate(ids):
            #slices are clamped at the sentence edges
            neighbours = ids[max(0, i - span):i] + ids[i + 1:i + 1 + span]
            skip_grams.extend([center, w] for w in neighbours)

    #sample distinct pairs
    random_index = np.random.choice(range(len(skip_grams)), batch_size, replace=False)
    random_inputs = [[skip_grams[i][0]] for i in random_index]
    random_labels = [[skip_grams[i][1]] for i in random_index]

    return np.array(random_inputs), np.array(random_labels)

In [None]:
#test the skip-gram method
input_batch, target_batch = random_batch_skip_gram(2, 10)

print("Input: ", input_batch)
print("Target: ", target_batch)

In [None]:
#Skip-gram model
class Skipgram(nn.Module):
    """Skip-gram model with a full softmax over the vocabulary."""

    def __init__(self, vocab_size, emb_size):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)

    def forward(self, center_words, context_words, all_vocabs):
        """Return the mean negative log-likelihood of the context words.

        Args:
            center_words: LongTensor of center indices, [batch_size, 1].
            context_words: LongTensor of context indices, [batch_size, 1].
            all_vocabs: LongTensor of vocabulary indices, [batch_size, vocab_size].

        Returns:
            Scalar tensor: minus the batch mean of the log-softmax score of the
            true context word given the center word.
        """
        center_embeds = self.embedding_v(center_words)  #[batch_size, 1, emb_size]
        context_embeds = self.embedding_u(context_words)  #[batch_size, 1, emb_size]
        all_embeds    = self.embedding_u(all_vocabs)    #[batch_size, vocab_size, emb_size]

        scores      = context_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]

        norm_scores = all_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, vocab_size, emb_size] @ [batch_size, emb_size, 1] = [batch_size, vocab_size, 1] = [batch_size, vocab_size]

        #log(exp(s) / sum(exp(n))) == s - logsumexp(n); the logsumexp form does not
        #overflow to inf/nan when the scores are large
        log_normalizer = torch.logsumexp(norm_scores, dim=1).unsqueeze(1) #[batch_size, 1]
        nll = -torch.mean(scores - log_normalizer)
        # scalar (loss must be scalar)

        return nll # negative log likelihood

In [None]:
#set training parameters
window_size = 2 #neighbours taken on each side of the center word
batch_size = 10 #pairs drawn per training step
vocab_size = len(vocabs) #counts the '<UNK>' entry appended to vocabs as well
emb_size = 50 #dimension of every embedding vector
model = Skipgram(vocab_size, emb_size)
optimizer = optim.Adam(model.parameters(), lr=0.001)

#convert all vocabs to tensors
def prepare_sequence(vocabs, word2index):
    """Return a LongTensor with the index of every word; unknown words get the "<UNK>" index."""
    idxs = []
    for w in vocabs:
        idx = word2index.get(w)
        if idx is None:
            idx = word2index["<UNK>"]
        idxs.append(idx)
    return torch.LongTensor(idxs)

#one row of all vocabulary indices, expanded to shape [batch_size, vocab_size]
all_vocabs = prepare_sequence(list(vocabs), word2index).expand(batch_size, vocab_size)

#calculate epoch time
def epoch_time(start_time, end_time):
    """Split the seconds elapsed between two timestamps into whole minutes and seconds."""
    minutes, seconds = divmod(end_time - start_time, 60)
    return int(minutes), int(seconds)

#train the model
#note: every "epoch" below is a single optimisation step on one random mini-batch
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):
    
    #draw a fresh random batch and convert the numpy arrays to tensors
    input_batch, target_batch = random_batch_skip_gram(window_size, batch_size)
    input_batch  = torch.LongTensor(input_batch)  #[batch_size, 1]
    target_batch = torch.LongTensor(target_batch) #[batch_size, 1]

    #clear old gradients, compute the loss, backpropagate and update
    optimizer.zero_grad()
    loss = model(input_batch, target_batch, all_vocabs)
    
    loss.backward()
    optimizer.step()

    #report the loss of the current batch and the time since `start` every 100 steps
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)

        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

In [None]:
#save the skip-gram model
#the with-block closes the file even if pickling fails
with open('Skipgram.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### 5. Skip-gram with Negative Sampling

In [None]:
#count all the occurrences of vocabs
from collections import Counter
word_count = Counter(flatten(corpus_tokenized))

#count the number of total words
num_total_words = sum(word_count.values())

#create the scaled-up unigram distribution table for vocabs:
#every word is repeated in proportion to (relative frequency ** 0.75) / z
z = 0.001 #the scaler
unigram_table = []
for v in vocabs:
    repeats = int(((word_count[v]/num_total_words)**0.75)/z)
    unigram_table.extend([v] * repeats)

In [None]:
#convert word indices to tensors
def prepare_sequence(seq, word2index):
    """Return a LongTensor with the index of every word; unknown words get the "<UNK>" index."""
    idxs = []
    for w in seq:
        idx = word2index.get(w)
        if idx is None:
            idx = word2index["<UNK>"]
        idxs.append(idx)
    return torch.LongTensor(idxs)

#generate random negative samples
import random

def negative_sampling(targets, unigram_table, k):
    """Draw `k` negative words per target from the unigram table.

    Args:
        targets: tensor holding one target word index per row.
        unigram_table: list of words to sample from.
        k: number of negative samples per target.

    Returns:
        LongTensor with one row of `k` word indices per target; a drawn word
        whose index equals the row's target is rejected and redrawn.
    """
    rows = []
    for row in range(targets.size(0)):
        forbidden = targets[row].item()
        drawn = []
        while len(drawn) < k:
            candidate = random.choice(unigram_table)
            if word2index[candidate] != forbidden:
                drawn.append(candidate)
        rows.append(prepare_sequence(drawn, word2index).view(1, -1))

    return torch.cat(rows)

In [None]:
#test the negative sampling method
input_batch, target_batch = random_batch_skip_gram(2, 10)
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

num_neg = 5 #number of negative samples for each target word

neg_samples = negative_sampling(target_batch, unigram_table, num_neg)

neg_samples

In [None]:
#Skip-gram with negative sampling model
class SkipgramNegSampling(nn.Module):
    """Skip-gram model trained with negative sampling instead of a full softmax."""

    def __init__(self, vocab_size, emb_size):
        super().__init__()
        self.embedding_v = nn.Embedding(vocab_size, emb_size)
        self.embedding_u = nn.Embedding(vocab_size, emb_size)
        self.logsigmoid = nn.LogSigmoid()

    def forward(self, center_words, context_words, neg_samples):
        """Return the mean negative-sampling loss of a batch.

        Args:
            center_words: LongTensor of center indices, [batch_size, 1].
            context_words: LongTensor of true context indices, [batch_size, 1].
            neg_samples: LongTensor of negative indices, [batch_size, num_neg].
        """
        center_column  = self.embedding_v(center_words).transpose(1, 2) # [batch_size, emb_size, 1]
        context_embeds = self.embedding_u(context_words) # [batch_size, 1, emb_size]
        neg_embeds     = self.embedding_u(neg_samples) # [batch_size, num_neg, emb_size]

        # [batch_size, 1, emb_size] @ [batch_size, emb_size, 1] -> [batch_size, 1]
        positive_score = torch.bmm(context_embeds, center_column).squeeze(2)

        # [batch_size, num_neg, emb_size] @ [batch_size, emb_size, 1] -> [batch_size, num_neg, 1]
        negative_score = torch.bmm(neg_embeds, center_column).neg()

        positive_term = self.logsigmoid(positive_score)
        negative_term = self.logsigmoid(negative_score).sum(dim=1) # [batch_size, 1]

        return -(positive_term + negative_term).mean()

In [None]:
#set parameters
window_size = 2 #neighbours taken on each side of the center word
batch_size  = 10 #pairs drawn per training step
vocab_size  = len(vocabs) #counts the '<UNK>' entry appended to vocabs as well
emb_size    = 50 #dimension of every embedding vector
model       = SkipgramNegSampling(vocab_size, emb_size)
num_neg     = 10 #negative samples drawn for every target word
optimizer   = optim.Adam(model.parameters(), lr=0.001)

#calculate epoch time
def epoch_time(start_time, end_time):
    """Split the seconds elapsed between two timestamps into whole minutes and seconds."""
    minutes, seconds = divmod(end_time - start_time, 60)
    return int(minutes), int(seconds)

#train the model
#note: every "epoch" below is a single optimisation step on one random mini-batch
import time

num_epochs = 500
start = time.time()
for epoch in range(num_epochs):    
    #draw a fresh random batch plus num_neg negative samples per target
    input_batch, target_batch = random_batch_skip_gram(window_size, batch_size)
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)
    neg_samples = negative_sampling(target_batch, unigram_table, num_neg)
    
    #clear old gradients, compute the loss, backpropagate and update
    optimizer.zero_grad()   
    loss = model(input_batch, target_batch, neg_samples)
    
    loss.backward()
    optimizer.step()

    #report the loss of the current batch and the time since `start` every 100 steps
    if (epoch + 1) % 100 == 0:
        end = time.time()
        epoch_mins, epoch_secs = epoch_time(start, end)
        print(f"Epoch: {epoch + 1} | cost: {loss:.6f} | time: {epoch_mins}m {epoch_secs}s")

In [None]:
#save the skip-gram with negative sampling model
#the with-block closes the file even if pickling fails
with open('SkipgramNegSampling.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

### 6. Testing

#### 6.1. Preparing Test Data

In [None]:
#load the dataset for testing
#NOTE(review): absolute, machine-specific path — adjust it before running on another machine
file_path = "C:/Users/MARC/Downloads/Datasets/questions-words.txt"

with open(file_path, 'r') as f:
    contents = f.read()
    #one entry per line; if the file ends with a newline the last entry is an empty string
    data = contents.split('\n')

#peek at the first five lines
data[:5]

In [None]:
#explore the dataset: list the lines that start with ':' together with their position
for idx, sent in enumerate(data):
    #startswith() is safe on empty lines (e.g. the entry left after a final
    #newline), where sent[0] raises IndexError
    if sent.startswith(':'):
        print(idx, sent)

In [None]:
#create the corpora for testing
#NOTE(review): the slice bounds are hard-coded line positions — presumably taken
#from the section listing printed by the previous cell; confirm before reuse
family = data[8368:8874]
family_tokenized = []
for line in family:
    family_tokenized.append(line.split(' '))
print(family_tokenized[:5])

plural = data[17355:18687]
plural_tokenized = []
for line in plural:
    plural_tokenized.append(line.split(' '))
print(plural_tokenized[:5])


In [None]:
#get unique words
def flatten(l):
    """Concatenate the items of every sub-list of `l` into one flat list."""
    return [item for sublist in l for item in sublist]

#sorted() gives every word the same position on every run; the iteration order
#of a set of strings can change between interpreter sessions (hash randomisation)
family_vocabs  = sorted(set(flatten(family_tokenized)))
plural_vocabs  = sorted(set(flatten(plural_tokenized)))

In [None]:
#numericalize the vocabs: every word maps to its position in its vocabulary list
family_word2index = dict(zip(family_vocabs, range(len(family_vocabs))))
plural_word2index = dict(zip(plural_vocabs, range(len(plural_vocabs))))

In [None]:
#append <UNK> once per vocabulary; the guards keep this cell idempotent, so
#re-running it neither adds a second '<UNK>' nor moves its index
if '<UNK>' not in family_word2index:
    family_vocabs.append('<UNK>')
    family_word2index['<UNK>'] = len(family_word2index)

if '<UNK>' not in plural_word2index:
    plural_vocabs.append('<UNK>')
    plural_word2index['<UNK>'] = len(plural_word2index)

In [None]:
#prepare index2word: the inverse lookup, from index back to word
family_index2word = dict(zip(family_word2index.values(), family_word2index.keys()))
plural_index2word = dict(zip(plural_word2index.values(), plural_word2index.keys()))

#### 6.2. Syntactic Test

The 'plural' corpus will be used for syntactic analogies.

#### 6.3. Semantic Test

The 'family' corpus will be used for semantic analogies.

#### 6.4. Similarity Test