# 19 Jan - GloVe

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim

### 1. Loading Data

In [2]:
#define the training corpus
#I use the Australian Broadcasting Comission 2006 Corpus from NLTK
import nltk
from nltk.corpus import abc
corpus = nltk.corpus.abc.sents()

In [3]:
#the corpus is already tokenized
corpus

[['PM', 'denies', 'knowledge', 'of', 'AWB', 'kickbacks', 'The', 'Prime', 'Minister', 'has', 'denied', 'he', 'knew', 'AWB', 'was', 'paying', 'kickbacks', 'to', 'Iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'Iraq', 'wheat', 'sales', '.'], ['Letters', 'from', 'John', 'Howard', 'and', 'Deputy', 'Prime', 'Minister', 'Mark', 'Vaile', 'to', 'AWB', 'have', 'been', 'released', 'by', 'the', 'Cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ...]

In [4]:
#convert the words in the corpus into lower case
corpus_tokenized = [[]] * len(corpus)
for i in range(len(corpus)):
    corpus_tokenized[i] = [word.lower() for word in corpus[i]]

print(corpus_tokenized[:5])

[['pm', 'denies', 'knowledge', 'of', 'awb', 'kickbacks', 'the', 'prime', 'minister', 'has', 'denied', 'he', 'knew', 'awb', 'was', 'paying', 'kickbacks', 'to', 'iraq', 'despite', 'writing', 'to', 'the', 'wheat', 'exporter', 'asking', 'to', 'be', 'kept', 'fully', 'informed', 'on', 'iraq', 'wheat', 'sales', '.'], ['letters', 'from', 'john', 'howard', 'and', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'to', 'awb', 'have', 'been', 'released', 'by', 'the', 'cole', 'inquiry', 'into', 'the', 'oil', 'for', 'food', 'program', '.'], ['in', 'one', 'of', 'the', 'letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'to', 'remain', 'in', 'close', 'contact', 'with', 'the', 'government', 'on', 'iraq', 'wheat', 'sales', '.'], ['the', 'opposition', "'", 's', 'gavan', 'o', "'", 'connor', 'says', 'the', 'letter', 'was', 'sent', 'in', '2002', ',', 'the', 'same', 'time', 'awb', 'was', 'paying', 'kickbacks', 'to', 'iraq', 'though', 'a', 'jordanian', 'trucking', 'company',

In [5]:
#remove stop words with SpaCy
import spacy
nlp = spacy.load("en_core_web_sm")
stopwords = nlp.Defaults.stop_words

for sentence in corpus_tokenized:
    for word in sentence[:]:
        if word in stopwords:
            sentence.remove(word)

print(corpus_tokenized[:5])

[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales', '.'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program', '.'], ['letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales', '.'], ['opposition', "'", 's', 'gavan', 'o', "'", 'connor', 'says', 'letter', 'sent', '2002', ',', 'time', 'awb', 'paying', 'kickbacks', 'iraq', 'jordanian', 'trucking', 'company', '.'], ['says', 'government', 'longer', 'wipe', 'hands', 'illicit', 'payments', ',', 'totalled', '$', '290', 'million', '.']]


In [6]:
#remove punctutations with String
import string
punctutations = string.punctuation

for sentence in corpus_tokenized:
    for word in sentence[:]:
        if word in punctutations:
            sentence.remove(word)

print(corpus_tokenized[:5])

[['pm', 'denies', 'knowledge', 'awb', 'kickbacks', 'prime', 'minister', 'denied', 'knew', 'awb', 'paying', 'kickbacks', 'iraq', 'despite', 'writing', 'wheat', 'exporter', 'asking', 'kept', 'fully', 'informed', 'iraq', 'wheat', 'sales'], ['letters', 'john', 'howard', 'deputy', 'prime', 'minister', 'mark', 'vaile', 'awb', 'released', 'cole', 'inquiry', 'oil', 'food', 'program'], ['letters', 'mr', 'howard', 'asks', 'awb', 'managing', 'director', 'andrew', 'lindberg', 'remain', 'close', 'contact', 'government', 'iraq', 'wheat', 'sales'], ['opposition', 's', 'gavan', 'o', 'connor', 'says', 'letter', 'sent', '2002', 'time', 'awb', 'paying', 'kickbacks', 'iraq', 'jordanian', 'trucking', 'company'], ['says', 'government', 'longer', 'wipe', 'hands', 'illicit', 'payments', 'totalled', '290', 'million']]


In [7]:
#get unique words
flatten = lambda l: [item for sublist in l for item in sublist]
vocabs = list(set(flatten(corpus_tokenized)))

print(len(vocabs))

27444


In [8]:
#numericalize the vocabs
word2index = {w: i for i, w in enumerate(vocabs)}

word2index

{'pace': 0,
 'damaged': 1,
 'offloading': 2,
 'squib': 3,
 'shacks': 4,
 'campfire': 5,
 'households': 6,
 'matscape': 7,
 'tongues': 8,
 'indefinite': 9,
 '1990s': 10,
 'absorbs': 11,
 'comte': 12,
 'buoys': 13,
 'hooks': 14,
 'companion': 15,
 'tas2r38': 16,
 'imaged': 17,
 'adobe': 18,
 'skeletal': 19,
 'trent': 20,
 'flooding': 21,
 'inaccurate': 22,
 'lemurs': 23,
 'mandil': 24,
 'relief': 25,
 'seeney': 26,
 'bucked': 27,
 'preschool': 28,
 'chilli': 29,
 'collecting': 30,
 'emperors': 31,
 'sounding': 32,
 'prosthetics': 33,
 'ngc': 34,
 'toll': 35,
 'fare': 36,
 '4395': 37,
 'reproductive': 38,
 'collaborated': 39,
 'folded': 40,
 'plagiarism': 41,
 'empathise': 42,
 'daydreams': 43,
 'partnership': 44,
 'rose': 45,
 'globe': 46,
 'hail': 47,
 'provider': 48,
 'clings': 49,
 'shakeup': 50,
 'ladines': 51,
 'susceptibilities': 52,
 'recorded': 53,
 'affirms': 54,
 'snacks': 55,
 'providence': 56,
 'retirement': 57,
 'nott': 58,
 'pais': 59,
 'serpins': 60,
 'whinnies': 61,
 'ura

In [9]:
#append <UNK>
vocabs.append('<UNK>')
word2index['<UNK>'] = 27444

len(word2index)

27445

### 2. Co-occurrence matrix

In [10]:
#count the frequency of each word
from collections import Counter

X_i = Counter(flatten(corpus_tokenized))

X_i

Counter({'pm': 19,
         'denies': 22,
         'knowledge': 54,
         'awb': 546,
         'kickbacks': 26,
         'prime': 109,
         'minister': 337,
         'denied': 13,
         'knew': 29,
         'paying': 37,
         'iraq': 107,
         'despite': 191,
         'writing': 37,
         'wheat': 676,
         'exporter': 78,
         'asking': 47,
         'kept': 47,
         'fully': 25,
         'informed': 16,
         'sales': 142,
         'letters': 41,
         'john': 216,
         'howard': 70,
         'deputy': 46,
         'mark': 119,
         'vaile': 68,
         'released': 128,
         'cole': 144,
         'inquiry': 230,
         'oil': 284,
         'food': 435,
         'program': 203,
         'mr': 411,
         'asks': 12,
         'managing': 88,
         'director': 216,
         'andrew': 86,
         'lindberg': 17,
         'remain': 74,
         'close': 138,
         'contact': 49,
         'government': 945,
         'opposition'

In [None]:
#define a skipgram of window size 1
skip_grams = []

for sent in corpus_tokenized:
    for i in range(len(sent)):
        target  = sent[i]
        context = [sent[i+1], sent[i-1]]
        for c in context:
            skip_grams.append((target, c))

In [None]:
skip_grams

In [None]:
#make the co-occurrence matrix
X_ik_skipgram = Counter(skip_grams)
X_ik_skipgram

### 3. Weighting function f

In [None]:
def weighting(w_i, w_j, X_ik):
    
    #check whether the co-occurrences between these two word exists???
    try:
        x_ij = X_ik[(w_i, w_j)]
    except:
        x_ij = 1  #why one, so that the probability thingy won't break...(label smoothing)
        
    #maximum co-occurrences; we follow the paper
    x_max = 100
    alpha = 0.75
    
    #if the co-occurrences does not exceed x_max, scale it down based on some alpha
    if x_ij < x_max:
        result = (x_ij/x_max) ** alpha
    else:
        result = 1 #this is the maximum probability you can have
        
    return result

In [None]:
w_i  = 'democracy'
w_j  = 'freedom'

print(weighting(w_i, w_j, X_ik_skipgram))


In [None]:
#now apply this weighting to all possible pairs
from itertools import combinations_with_replacement

X_ik = {} #for keeping the co-occurrences
weighting_dic = {} #for keeping all the probability after passing through the weighting function

for bigram in combinations_with_replacement(vocabs, 2):  #we need to also think its reverse
    #if this bigram exists in X_ik_skipgrams
    #we gonna add this to our co-occurence matrix
    if X_ik_skipgram.get(bigram) is not None:
        cooc = X_ik_skipgram[bigram]  #get the co-occurrence
        X_ik[bigram] = cooc + 1 #this is again basically label smoothing....(stability issues (especially when divide something))
        X_ik[(bigram[1], bigram[0])] = cooc + 1  #trick to get all pairs
    else: #otherwise, do nothing
        pass
    
    #apply the weighting function using this co-occurrence matrix thingy    
    weighting_dic[bigram] = weighting(bigram[0], bigram[1], X_ik)
    weighting_dic[(bigram[1], bigram[0])] = weighting(bigram[1], bigram[0], X_ik)
    

In [None]:
len(X_ik_skipgram)

In [None]:
X_ik

In [None]:
# weighting_dic  #give small probability to never-occurred is called "label smoothing"

## 4. Prepare train data
You move the window along, and create those tuples as we said in class

In [None]:
for c in corpus_tokenized:
    print(c)

In [None]:
skip_grams

In [None]:
import math

def random_batch(batch_size, word_sequence, skip_grams, X_ik, weighting_dic):
    
    #loop through this skipgram, and change it id  because when sending model, it must number
    skip_grams_id = [(word2index[skip_gram[0]], word2index[skip_gram[1]]) for skip_gram in skip_grams]
    
    #randomly pick "batch_size" indexes
    number_of_choices = len(skip_grams_id)
    random_index = np.random.choice(number_of_choices, batch_size, replace=False) #no repeating indexes among these random indexes
    
    random_inputs = [] #xi, wi (in batches)
    random_labels = [] #xj, wj (in batches)
    random_coocs  = [] #Xij (in batches)
    random_weighting = [] #f(Xij) (in batches)
    #for each of the sample in these indexes
    for i in random_index:
        random_inputs.append([skip_grams_id[i][0]]) #same reason why i put bracket here....
        random_labels.append([skip_grams_id[i][1]])
        
        #get cooc
        #first check whether it exists...
        pair = skip_grams[i]  #e.g., ('banana', 'fruit)
        try:
            cooc = X_ik[pair]
        except:
            cooc = 1 #label smoothing
            
        random_coocs.append([math.log(cooc)])  #1. why log, #2, why bracket -> size ==> (, 1)  #my neural network expects (, 1)
        
        #get weighting
        weighting = weighting_dic[pair]  #why not use try....maybe it does not exist....
        random_weighting.append(weighting)

        
    return np.array(random_inputs), np.array(random_labels), np.array(random_coocs), np.array(random_weighting)
    

In [None]:
batch_size = 2
input, target, cooc, weightin = random_batch(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)

In [None]:
input, target, cooc, weightin

## 4. Model

<img src ="../figures/glove.png">

In [None]:
class GloVe(nn.Module):
    
    def __init__(self, vocab_size,embed_size):
        super(GloVe,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embed_size) # center embedding
        self.embedding_u = nn.Embedding(vocab_size, embed_size) # out embedding
        
        self.v_bias = nn.Embedding(vocab_size, 1)
        self.u_bias = nn.Embedding(vocab_size, 1)
        
    def forward(self, center_words, target_words, coocs, weighting):
        center_embeds = self.embedding_v(center_words) # [batch_size, 1, emb_size]
        target_embeds = self.embedding_u(target_words) # [batch_size, 1, emb_size]
        
        center_bias = self.v_bias(center_words).squeeze(1)
        target_bias = self.u_bias(target_words).squeeze(1)
        
        inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)
        #[batch_size, 1, emb_size] @ [batch_size, emb_size, 1] = [batch_size, 1, 1] = [batch_size, 1]
        
        #note that coocs already got log
        loss = weighting*torch.pow(inner_product +center_bias + target_bias - coocs, 2)
        
        return torch.sum(loss)

## 4. Training

In [None]:
voc_size   = len(vocabs)
batch_size = 2 #why?  no reason; 
emb_size   = 2 #why?  no reason; usually 50, 100, 300, but 2 so we can plot (50 can also plot, but need PCA)
model      = GloVe(voc_size, emb_size)

optimizer  = optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 5000
#for epoch
for epoch in range(num_epochs):

    #get random batch
    input, target, cooc, weightin = random_batch(batch_size, corpus_tokenized, skip_grams, X_ik, weighting_dic)
    input_batch    = torch.LongTensor(input)
    target_batch   = torch.LongTensor(target)
    cooc_batch     = torch.FloatTensor(cooc)
    weightin_batch = torch.FloatTensor(weightin)
    
    
    # print(input_batch.shape, label_batch.shape, cooc_batch.shape, weightin_batch)
    
    #loss = model
    loss = model(input_batch, target_batch, cooc_batch, weightin_batch)
    
    #backpropagate
    loss.backward()
    
    #update alpha
    optimizer.step()
    
    #print epoch loss
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch {epoch+1} | Loss: {loss:.6f} | Time: ??")

## 5. Plot the embeddings

Is really the related stuff are close to each other, and vice versa?

The most fun part:  Will "banana" closer to "fruit" than "cat"?

In [None]:
vocabs

In [None]:
banana = torch.LongTensor([word2index['banana']])
banana

In [None]:
banana_center_embed = model.embedding_v(banana)
banana_outisde_embed = model.embedding_u(banana)

banana_embed = (banana_center_embed + banana_outisde_embed) / 2
banana_embed

In [None]:
def get_embed(word):
    try:
        index = word2index[word]
    except:
        index = word2index['<UNK>']
    
    word = torch.LongTensor([index])

    center_embed  = model.embedding_v(word)
    outside_embed = model.embedding_u(word)
    
    embed = (center_embed + outside_embed) / 2
    
    return  embed[0][0].item(), embed[0][1].item()


In [None]:
#find embedding of fruit, cat
print(get_embed('fruit'))
print(get_embed('cat'))
print(get_embed('chaky'))

In [None]:
#help me plot fruit cat banana on matplotlib
plt.figure(figsize=(6,3))
for i, word in enumerate(vocabs[:20]): #loop each unique vocab
    x, y = get_embed(word)
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), xytext=(5, 2), textcoords='offset points')
plt.show()

## 6. Cosine similarity

How do (from scratch) calculate cosine similarity?