In [0]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import torchtext
from torchtext import data
from torchtext.datasets import PennTreebank
import random

In [0]:
# Batching hyper-parameters, declared before anything that consumes them.
BATCH_SIZE = 512
groups = 2 # train on pairs

# One shared text field: sequential, lower-cased, tokenised with spaCy.
TEXT = data.Field(sequential=True, tokenize='spacy', lower=True)

# Penn Treebank train / validation / test splits, all processed through TEXT.
train, valid, test = PennTreebank.splits(TEXT)

# The vocabulary is built from the training split only, with min_freq=5.
TEXT.build_vocab(train, min_freq=5)
vocab_size = len(TEXT.vocab)

# BPTT iterators over the three splits; bptt_len=groups, so each batch
# carries `groups` consecutive time steps.
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=groups,
    repeat=False,
)

In [0]:
class GloVe(nn.Module):
    """GloVe model: two embedding tables with per-word biases and a count lookup.

    The module owns a "centre" table (``embedding_i`` / ``bias_i``) and a
    "context" table (``embedding_j`` / ``bias_j``), both sized from the
    co-occurrence matrix. ``forward`` returns the looked-up parameters together
    with the smoothed co-occurrence counts and their weights, so that a loss
    function can combine them.

    Args:
        coo_matrix: square, 2-D co-occurrence matrix indexable as
            ``coo_matrix[rows, cols]`` with integer arrays (e.g. a NumPy
            array). It is stored by reference and never modified.
        embedding_dim: size of each embedding vector.
    """

    def __init__(self, coo_matrix, embedding_dim):
        super(GloVe, self).__init__()

        # Stored as-is. Smoothing is applied at lookup time in forward(), so the
        # caller's array is not mutated and building several models from the
        # same matrix does not compound the smoothing.
        self.coo_matrix = coo_matrix

        # empirically found values from the paper
        self.alpha = 0.75
        self.cutoff = 100

        # laplacian smoothing: constant added to every looked-up count
        self.smoothing = 1

        # Size the tables from the matrix itself instead of a notebook global.
        self.vocab_size = coo_matrix.shape[0]
        self.embedding_i = nn.Embedding(self.vocab_size, embedding_dim)
        self.bias_i = nn.Embedding(self.vocab_size, 1)

        self.embedding_j = nn.Embedding(self.vocab_size, embedding_dim)
        self.bias_j = nn.Embedding(self.vocab_size, 1)

    def forward(self, word_i, word_j):
        """Look up embeddings, biases, smoothed counts and weights for word pairs.

        Args:
            word_i: long tensor of word indices, shape ``[batch_size]``.
            word_j: long tensor of word indices, same shape as ``word_i``.

        Returns:
            Tuple ``(embed_i, embed_j, bias_i, bias_j, coos, weighting)``:
            the embeddings are ``[batch_size, embedding_dim]``, the biases are
            ``[batch_size, 1]``, and ``coos`` / ``weighting`` are float32
            tensors of shape ``[batch_size]`` on the same device as ``word_i``.
        """
        # The batch size is taken from the inputs, so a final, shorter batch
        # is handled without indexing past its end.
        rows = word_i.detach().cpu().numpy()
        cols = word_j.detach().cpu().numpy()

        # Vectorised lookup of the pair counts, then smoothing. Everything stays
        # in floating point: casting to an integer dtype would truncate
        # fractional counts and turn every weight below 1.0 into 0.
        counts = np.asarray(self.coo_matrix[rows, cols], dtype=np.float32) + self.smoothing
        coos = torch.from_numpy(counts).to(word_i.device) # [batch_size]
        weighting = self._get_weighting(coos) # [batch_size]

        embed_i_out = self.embedding_i(word_i)
        bias_i_out = self.bias_i(word_i)
        embed_j_out = self.embedding_j(word_j)
        bias_j_out = self.bias_j(word_j)

        return embed_i_out, embed_j_out, bias_i_out, bias_j_out, coos, weighting

    def _get_weighting(self, occur):
        """Weight for a count: ``(occur / cutoff) ** alpha``, capped at 1.0.

        Accepts either a plain number (returns a float) or a tensor of counts
        (returns a float tensor of the same shape).
        """
        if torch.is_tensor(occur):
            ratio = occur.to(torch.float) / self.cutoff
            # Clamping the ratio at 1 before the power gives 1.0 above the cutoff.
            return torch.clamp(ratio, max=1.0) ** self.alpha
        return 1.0 if occur > self.cutoff else (occur / self.cutoff) ** self.alpha

In [0]:
def GloVeLoss(embed_i, embed_j, bias_i, bias_j, coos, weighting): # a modified version of MSELoss
    """Weighted squared error between pair scores and log co-occurrence counts.

    Computes ``sum_k weighting[k] * (embed_i[k] . embed_j[k] + bias_i[k]
    + bias_j[k] - log(coos[k])) ** 2`` over the batch.

    Args:
        embed_i: ``[batch_size, embedding_dim]`` embeddings of the first words.
        embed_j: ``[batch_size, embedding_dim]`` embeddings of the second words.
        bias_i: per-pair bias, ``[batch_size]`` or ``[batch_size, 1]``.
        bias_j: per-pair bias, ``[batch_size]`` or ``[batch_size, 1]``.
        coos: per-pair co-occurrence counts (must be positive for the log).
        weighting: per-pair weights.

    Returns:
        A scalar tensor holding the summed loss.
    """
    dot = (embed_i * embed_j).sum(1) # [batch_size]

    # Flatten the per-pair terms to the shape of `dot`. Adding a [batch_size, 1]
    # bias to a [batch_size] vector would broadcast to [batch_size, batch_size]
    # and sum errors over every cross-pair combination.
    prediction = dot + bias_i.reshape(dot.shape) + bias_j.reshape(dot.shape)
    log_counts = torch.log(coos.to(torch.float)).reshape(dot.shape)
    residual = prediction - log_counts

    weights = weighting.to(residual.dtype).reshape(dot.shape)
    return (torch.pow(residual, 2) * weights).sum()

# Co-occurrence matrix filled with uniform random values; it is not derived
# from the corpus in this cell.
coo_matrix = np.random.rand(vocab_size, vocab_size)

# Model with 100-dimensional embeddings, optimised with plain SGD.
model = GloVe(coo_matrix, 100)
optimizer = optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(1):
    for i, batch in enumerate(train_iter):
        # batch: [groups, batch_size]
        # Pair the first row of `text` with the last row of `target`.
        text = batch.text[0] # not super proud of this, but whatever
        target = batch.target[-1]

        # Standard step: clear gradients, forward pass, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(text, target)
        embed_i, embed_j, bias_i, bias_j, coos, weighting = outputs
        loss = GloVeLoss(
            embed_i,
            embed_j,
            bias_i,
            bias_j,
            coos,
            weighting.to(torch.float),
        )
        loss.backward()
        optimizer.step()
