# Global Vectors for Word Representation

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter, defaultdict

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

## Load Datasets

In [None]:
class GloveDataset:
    """Tokenised corpus together with its distance-weighted co-occurrence entries.

    The text is split into whitespace-separated tokens, truncated to the first
    ``n_words`` tokens, and every token is mapped to an integer id (id 0 is the
    most frequent token). The non-zero co-occurrence entries are stored as three
    parallel tensors: ``_i_idx`` (centre word id), ``_j_idx`` (context word id)
    and ``_xij`` (accumulated weight).
    """

    def __init__(self, text, n_words=200000, window_size=5):
        """Build the vocabulary and co-occurrence tensors from ``text``.

        Args:
            text: Raw corpus as a single string.
            n_words: Maximum number of leading tokens to keep.
            window_size: Number of tokens considered on each side of a centre word.
        """
        self._window_size = window_size
        # split() with no separator splits on any whitespace run and never
        # yields '' tokens; split(' ') would turn a leading/trailing space or a
        # double space into an empty-string "word" in the vocabulary.
        self._tokens = text.split()[:n_words]
        word_counter = Counter(self._tokens)

        # Ids follow frequency rank: most_common() lists the most frequent token first.
        self._word2idx = {w: i for i, (w, _) in enumerate(word_counter.most_common())}
        self._idx2word = {i: w for w, i in self._word2idx.items()}
        self._vocab_len = len(self._word2idx)
        self._id_tokens = [self._word2idx[w] for w in self._tokens]
        self._create_coocurrence_matrix()

        print('Total of words: {}'.format(len(self._tokens)))
        print('Vocabulary length: {}'.format(self._vocab_len))

    def _create_coocurrence_matrix(self):
        """Accumulate co-occurrence weights and store them as flat tensors.

        Each (centre, context) pair found within ``_window_size`` positions of
        each other contributes ``1 / distance`` to the pair's weight.
        """
        co_matrix = defaultdict(Counter)
        n_tokens = len(self._id_tokens)
        for i, w in enumerate(self._id_tokens):
            start_i = max(i - self._window_size, 0)
            end_i = min(i + self._window_size + 1, n_tokens)
            for j in range(start_i, end_i):
                if i != j:
                    c = self._id_tokens[j]
                    co_matrix[w][c] += 1 / abs(j - i)

        i_idx = []
        j_idx = []
        xij = []

        # Flatten the nested mapping into parallel (i, j, x_ij) lists.
        for w, co in co_matrix.items():
            for c, v in co.items():
                i_idx.append(w)
                j_idx.append(c)
                xij.append(v)

        self._i_idx = torch.LongTensor(i_idx)
        self._j_idx = torch.LongTensor(j_idx)
        self._xij = torch.FloatTensor(xij)

    def get_batches(self, batch_size):
        """Yield ``(x_ij, i_idx, j_idx)`` mini-batches in a fresh random order.

        Every stored entry is yielded exactly once per call; the last batch may
        hold fewer than ``batch_size`` entries.
        """
        # Sampling all positions without replacement is a random permutation.
        random_idx = torch.LongTensor(np.random.choice(len(self._xij), len(self._xij), replace=False))

        for p in range(0, len(random_idx), batch_size):
            batch_idx = random_idx[p:p+batch_size]
            yield self._xij[batch_idx], self._i_idx[batch_idx], self._j_idx[batch_idx]

In [None]:
# Read the corpus inside a context manager so the file handle is closed as
# soon as the text has been loaded, then keep at most 10,000,000 tokens.
with open('./datasets/text8') as corpus_file:
    datasets = GloveDataset(corpus_file.read(), 10000000)

## Set Configs

In [None]:
# --- Optimisation settings ---
N_EPOCHS = 1  # try 100 to get much better results
BATCH_SIZE = 2048  # co-occurrence entries per mini-batch
LR = 0.05  # learning rate handed to the optimizer

# --- Model size ---
EMBEDDING_DIM = 300  # width of each word vector

# --- Loss weighting: weight = min((x / X_MAX) ** ALPHA, 1) ---
X_MAX = 100
ALPHA = 0.75

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Build [Glove](https://nlp.stanford.edu/pubs/glove.pdf) Network

In [None]:
class Glove(nn.Module):
    """Two embedding tables plus per-word biases, scored by dot product.

    For a pair of word ids ``(i, j)`` the model outputs
    ``wi[i] . wj[j] + bi[i] + bj[j]``.
    """

    def __init__(self, input_size, embedding_dim):
        """Create the embedding and bias tables.

        Args:
            input_size: Vocabulary size (number of rows in every table).
            embedding_dim: Width of each word vector.
        """
        super().__init__()

        self.vocab_size = input_size

        self.wi = nn.Embedding(self.vocab_size, embedding_dim)
        self.wj = nn.Embedding(self.vocab_size, embedding_dim)
        self.bi = nn.Embedding(self.vocab_size, 1)
        self.bj = nn.Embedding(self.vocab_size, 1)

        # Vectors start uniform in [-1, 1); biases start at zero.
        nn.init.uniform_(self.wi.weight, -1, 1)
        nn.init.uniform_(self.wj.weight, -1, 1)
        nn.init.zeros_(self.bi.weight)
        nn.init.zeros_(self.bj.weight)

    def forward(self, i_indices, j_indices):
        """Return one score per ``(i, j)`` index pair.

        Args:
            i_indices: Integer tensor of centre-word ids.
            j_indices: Integer tensor of context-word ids, same shape as ``i_indices``.

        Returns:
            Tensor with the same shape as the index tensors.
        """
        w_i = self.wi(i_indices)
        w_j = self.wj(j_indices)
        # Drop only the trailing size-1 bias axis. A bare squeeze() would also
        # remove a batch axis of size 1.
        b_i = self.bi(i_indices).squeeze(-1)
        b_j = self.bj(j_indices).squeeze(-1)

        # Reduce over the embedding axis (always the last one) so scalar and
        # multi-dimensional index tensors work as well as 1-D batches.
        x = torch.sum(w_i * w_j, dim=-1) + b_i + b_j

        return x

#### Initialize Glove Network

In [None]:
# One embedding row per vocabulary entry, EMBEDDING_DIM columns each.
glove = Glove(input_size=datasets._vocab_len, embedding_dim=EMBEDDING_DIM)
# Move the parameters onto the selected device.
glove.to(device)

## Set Loss Function

In [None]:
def wmse_loss(weights, inputs, targets):
    """Return the mean of the element-wise squared error scaled by ``weights``."""
    squared_error = F.mse_loss(inputs, targets, reduction='none')
    return (weights * squared_error).mean()

In [None]:
def weight_loss(x, x_max, alpha):
    """Return ``(x / x_max) ** alpha`` element-wise, capped at 1."""
    scaled = (x / x_max) ** alpha
    return torch.clamp(scaled, max=1.0)

## Set Optimizer

In [None]:
# Adagrad over every trainable tensor of the model (both embedding tables and both bias tables).
optimizer = torch.optim.Adagrad(params=glove.parameters(), lr=LR)

## Train Glove Network

In [None]:
print_every = 100   # report progress every this many batches
train_losses = []   # one loss value per processed batch, across all epochs

# Batches per epoch, rounded up so the final partial batch is counted.
n_batches = (len(datasets._xij) + BATCH_SIZE - 1) // BATCH_SIZE

glove.train()
for epoch in range(N_EPOCHS):

    # enumerate(start=1) gives the 1-based batch number used in the progress line.
    for batch_i, (x_ij, i_idx, j_idx) in enumerate(datasets.get_batches(BATCH_SIZE), start=1):

        x_ij = x_ij.to(device)
        i_idx = i_idx.to(device)
        j_idx = j_idx.to(device)

        optimizer.zero_grad()
        outputs = glove(i_idx, j_idx)
        weights = weight_loss(x_ij, X_MAX, ALPHA)
        # The model is regressed onto log(x_ij).
        loss = wmse_loss(weights, outputs, torch.log(x_ij))

        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

        if batch_i % print_every == 0:
            # Epoch is shown 1-based so it is comparable with the N_EPOCHS total.
            # The reported loss is the mean over the 20 most recent batches.
            print('Epoch: {}/{}, '.format(epoch + 1, N_EPOCHS),
                  'Batch: {}/{}, '.format(batch_i, n_batches),
                  'Train Loss: {}'.format(np.mean(train_losses[-20:])))

    # Checkpoint after every epoch; the same file is overwritten each time.
    print('Saving model...')
    os.makedirs('./weights/', exist_ok=True)
    torch.save(glove.state_dict(), 'weights/glove_text8.pt')

In [None]:
# Plot the per-batch training loss and save the figure under ./images/.
plt.figure(figsize=(10,5))
plt.title("Train Loss of Glove Network")
plt.plot(train_losses, label="Train Loss", color="green")
# train_losses holds one value per batch, so the x-axis counts batches, not epochs.
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.legend()

os.makedirs('./images/', exist_ok=True)
plt.savefig('./images/glove_final_train_loss.png')
plt.show()

## Plot Word Vectors Using PCA

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Number of leading vocabulary rows to visualise.
top_k = 300

# Copy both embedding tables to host memory as NumPy arrays and sum them
# element-wise to get a single vector per word.
embeds_i, embeds_j = (table.weight.cpu().data.numpy() for table in (glove.wi, glove.wj))
embeds = embeds_i + embeds_j

In [None]:
# Project the first top_k word vectors onto their two principal components.
pca = PCA(n_components=2)
embeds_pca = pca.fit_transform(embeds[:top_k, :])

plt.figure(figsize=(20,20))
plt.scatter(embeds_pca[:, 0], embeds_pca[:, 1])
# Iterate over the projected rows rather than range(top_k): the slice above
# yields fewer than top_k rows when the vocabulary is smaller than top_k.
for i, (x, y) in enumerate(embeds_pca):
    plt.annotate(datasets._idx2word[i], xy=(x, y))

os.makedirs('./images/', exist_ok=True)
plt.savefig('./images/glove_pca_word_vectors.png')
plt.show()

## Plot Word Vectors Using TSNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
# Embed the first top_k word vectors in 2-D with t-SNE (cosine distance, fixed seed).
tsne = TSNE(metric='cosine', random_state=9)
embeds_tsne = tsne.fit_transform(embeds[:top_k, :])

fig, ax = plt.subplots(figsize=(20,20))
# Draw all points in one call; the original referenced an undefined
# `embedding_tsne` here instead of `embeds_tsne`.
ax.scatter(embeds_tsne[:, 0], embeds_tsne[:, 1], color='steelblue')
# Iterate over the embedded rows so a vocabulary smaller than top_k does not
# index past the end of the array.
for idx, (x, y) in enumerate(embeds_tsne):
    ax.annotate(datasets._idx2word[idx], (x, y), alpha=0.7)

# Ensure the output folder exists even if the earlier plotting cells were skipped.
os.makedirs('./images/', exist_ok=True)
fig.savefig('./images/glove_tsne_word_vectors.png')
plt.show()

---