In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import operator

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.functional as F

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test.

In [2]:
urls = ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
        'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt',
        'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt']

In [3]:
df_train = pd.read_table(urls[0], header=None)
df_valid = pd.read_table(urls[1], header=None)
df_test = pd.read_table(urls[2], header=None)

In [19]:
class DataLoader:
    def __init__(self):
        
        self.word2idx = {}
        self.idx2word = {}
        
    
    def readData(self, data, name, window_size=1, sep=None):
        corpus = [x.split(sep) for x in data]
        vocab = set([y for x in corpus for y in x])
        
        setattr(self, name + "_corpus", corpus)
        setattr(self, name + "_vocab", vocab)
        
        if name == "train":
            self.createIndices(name, vocab=vocab, vocab_size=len(vocab))
        
        self.generate_pairs(name, window_size)
    
    def createIndices(self, name, vocab, vocab_size):
        
        word2idx = {w: idx for (idx, w) in enumerate(vocab)}
        idx2word = {idx: w for (idx, w) in enumerate(vocab)}
        
        # error checks
        assert len(word2idx) == vocab_size
        assert len(idx2word) == vocab_size
        
        setattr(self, "word2idx", word2idx)
        setattr(self, "idx2word", idx2word)
        setattr(self, "vocab_size", vocab_size)
        
    def __str__(self, name):
        return "{0} sentences in {3} data, consisting of {1} unique words".format(len(getattr(self, name + "_vocab")), 
                                                                                  getattr(self, name + "_vocab_size"), name)
    
    def generate_pairs(self, name, window_size):
        corpus = getattr(self, name + "_corpus")
        
        idx_pairs = []

        for sentence in corpus:
            indices = [self.word2idx[word] for word in sentence]
            # for each word, threated as center word
            for center_word_pos in range(len(indices)):
                # for each window position
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make soure not jump out sentence
                    if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                        continue
                    context_word_idx = indices[context_word_pos]
                    idx_pairs.append((indices[center_word_pos], context_word_idx))

        setattr(self, name + "_idx_pairs",  np.array(idx_pairs))
        
#     def wordFrequencies(self, n=10, update=False):
        
#         if not hasattr(self, "sorted_counts") or update:
#             flattened = [y for x in self.corpus for y in x]
#             self.nwords = len(flattened)
#             counts = Counter(flattened)

#             self.sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
            
#             self.word2freq = {w: f/self.nwords for (w, f) in counts.items()}
        
#         print("Top {0} most common words".format(n))
#         for i in range(n):
#             print(self.sorted_counts[i])

In [20]:
dataloader = DataLoader()
dataloader.readData(data=df_train[0], name='train')

dataloader.readData(data=df_valid[0], name='valid')

dataloader.readData(data=df_test[0], name='test')

# Skip-gram

Objective of skip-gram model: figure out word representations that are useful for predicting the surrounding words in a sentence. 

Given a sequence of words for training, $w_1,w_2,w_3,...,w_T$, the objective is then to maximize the average log probability over the training context $c$:
$$\frac{1}{T}\sum_{t=1}^T\sum_{-c\leq j \leq c, j\neq0} \log p(w_{t+j}|w_t)$$

$p(w_{t+j}|w_t)$ is defined by the softmax function $$p(w_{O}|w_I)=\frac{\exp\big(v_{w_O}'^\intercal v_{w_I}\big)}{\sum_{w=1}^W\exp(v_w'^\intercal v_{w_I})}$$

Assumtions:
* context windows are symmetrical with a value $x\in \R $

In [106]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        """
        Attributes:
            vocab_size: number of vocab
            embedding_dim: embedding dimensions
        """
        super(SkipGram, self).__init__()
        
        self.in_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=True)
#         self.out_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=True)
        
        # hidden layer
#         self.hidden = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
        
        # output layer        
        self.output = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
    
    def forward(self, x, yt):
        """
        Attributes:
            x: input data
            yt: true y
        
        Return:
            out: predictions of words, dim corresponds to vocab size
        """
        
        # get embeddings for input x
        x_vec = self.in_embeddings(x)
        
        # output
        out = self.output(x_vec)
        
        # log probabilities
        out_log = F.softmax(out, dim=1)
        
        return out_log

net = SkipGram(vocab_size=dataloader.vocab_size, embedding_dim=100)
print(net)

SkipGram(
  (in_embeddings): Embedding(9999, 100, sparse=True)
  (output): Linear(in_features=100, out_features=9999, bias=True)
)


## Convenience functions

In [107]:
def accuracy(ys, ts):
    """
    Attributes:
        ys: predicted values
        ts: true values
    """
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

In [119]:
use_cuda = torch.cuda.is_available()

if use_cuda:
    print("Cuda available")
    net.cuda()

In [109]:
train = data_utils.TensorDataset(torch.from_numpy(dataloader.train_idx_pairs[:,0]), torch.from_numpy(dataloader.train_idx_pairs[:,1]))
train_loader = data_utils.DataLoader(train, batch_size=128, shuffle=True, num_workers=4)

valid = data_utils.TensorDataset(torch.from_numpy(dataloader.valid_idx_pairs[:,0]), torch.from_numpy(dataloader.valid_idx_pairs[:,1]))
valid_loader = data_utils.DataLoader(valid, batch_size=128, shuffle=True, num_workers=4)

## Criterion and optimizer

In [115]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

## Test the network quickly

In [116]:
data, target = next(iter(train_loader))

if use_cuda:
    data = data.cuda()
    target = target.cuda()

output = net(x = data.long(), yt=target.long())
loss = criterion(output, target.long())
acc = accuracy(output, target.long())

print(loss.item(), acc.item())

9.210245132446289 0.0


## Training of the network

should implement batch sizes, as this is too slow :)

In [117]:
num_epochs = 10

log_every = 2000
N = len(train_loader)

for epoch in range(num_epochs):
    
    running_loss, running_acc = 0.0, 0.0
    
    net.train()
    for i, (inputs, labels) in enumerate(train_loader):
        
        # zero parameter gradients
        optimizer.zero_grad()
        
        # check if cuda is available
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        
        # forward + backward + optimize
        output = net(inputs.long(), labels.long())
        loss = criterion(output, labels.long())
        acc = accuracy(output, labels.long())
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        running_acc += acc.item()
        
        if i % log_every == 0:
            print("Epoch {}, iteration {}/{}, loss: {:.3f}, acc: {:.3f}".format(epoch+1, i+1, N, 
                                                                                running_loss / log_every, 
                                                                                running_acc / log_every))
            
            running_loss, running_acc = 0.0, 0.0
    
    net.eval()
    val_losses, val_accs, val_lengths = 0, 0, 0
    for j, (inputs, labels) in enumerate(valid_loader):
        # check if cuda is available
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        # forward + backward
        output = net(inputs.long(), labels.long())
        loss = criterion(output, labels.long())
        
        val_losses += loss.item() * valid_loader.batch_size
        val_accs += accuracy(output, labels.long()).item() * valid_loader.batch_size
        val_lengths += valid_loader.batch_size
        
    val_losses /= val_lengths
    val_accs /= val_lengths
    
    print("\nValidation loss: {:.3f}, accuracy: {:.3f}\n".format(val_losses, val_accs))

Epoch 1, iteration 1/13211, loss: 0.005, acc: 0.000
Epoch 1, iteration 2001/13211, loss: 9.210, acc: 0.000
Epoch 1, iteration 4001/13211, loss: 9.210, acc: 0.000
Epoch 1, iteration 6001/13211, loss: 9.210, acc: 0.000


KeyboardInterrupt: 

# Notes
- should we include subsampling!? probably :)
- padding?

# Good urls
https://stackoverflow.com/questions/31847682/how-to-compute-skipgrams-in-python
https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb
https://github.com/deborausujono/word2vecpy/blob/master/word2vec.py