In [1]:
# http://pytorch.org/
# Upgrade torch and torchvision to the newest release pip can find.
# NOTE(review): versions are not pinned; the output recorded for this cell
# reports torch 0.4.1 and torchvision 0.2.1.
!pip3 install torch -U
!pip3 install torchvision -U

Requirement already up-to-date: torch in /usr/local/lib/python3.6/dist-packages (0.4.1)
Requirement already up-to-date: torchvision in /usr/local/lib/python3.6/dist-packages (0.2.1)


In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import operator

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.functional as F

# Data loading
The Penn Treebank data files are available at the URLs below, one for each of the three splits: train, validation and test.

In [2]:
# Penn Treebank text files, listed in the order train, valid, test.
_PTB_URL_TEMPLATE = 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.{}.txt'
urls = [_PTB_URL_TEMPLATE.format(split) for split in ('train', 'valid', 'test')]

In [3]:
# Read the three splits (train, valid, test) without a header row;
# later cells take the text from column 0 of each frame.
df_train, df_valid, df_test = [
    pd.read_table(split_url, header=None) for split_url in urls[:3]
]

In [4]:
class DataLoader:
    """Tokenise text splits and build skip-gram (center, context) index pairs.

    Every call to ``readData`` stores four attributes on the instance, named
    after the split: ``<name>_corpus``, ``<name>_vocab``, ``<name>_vocab_size``
    and ``<name>_idx_pairs``. The ``word2idx`` / ``idx2word`` mappings are
    (re)built only when the split is called ``"train"``; every other split is
    indexed with the mappings that exist at that moment.
    """

    def __init__(self):
        # word -> index and index -> word; filled when the "train" split is read
        self.word2idx = {}
        self.idx2word = {}

    def readData(self, data, name, window_size=1, sep=None):
        """Tokenise ``data`` and generate the index pairs for the split ``name``.

        Args:
            data: iterable of strings, one sentence per element.
            name: split name used as attribute prefix; ``"train"`` also
                rebuilds the word/index mappings.
            window_size: number of context positions taken on each side of
                the center word.
            sep: separator handed to ``str.split`` (``None`` splits on any
                whitespace run).

        Raises:
            KeyError: if a word of this split is missing from ``word2idx``.
        """
        corpus = [sentence.split(sep) for sentence in data]
        vocab = set(word for sentence in corpus for word in sentence)

        setattr(self, name + "_corpus", corpus)
        setattr(self, name + "_vocab", vocab)
        setattr(self, name + "_vocab_size", len(vocab))

        if name == "train":
            self.createIndices(name, vocab=vocab, vocab_size=len(vocab))

        self.generate_pairs(name, window_size)

    def createIndices(self, name, vocab, vocab_size):
        """Build ``word2idx`` and ``idx2word`` from ``vocab``.

        Args:
            name: split name (not used by the mapping itself).
            vocab: collection of unique words.
            vocab_size: expected number of entries in each mapping.
        """
        # Sort so every word gets the same index on every run; iterating a
        # set of strings directly gives an order that changes between runs.
        ordered_vocab = sorted(vocab)

        word2idx = {w: idx for (idx, w) in enumerate(ordered_vocab)}
        idx2word = {idx: w for (idx, w) in enumerate(ordered_vocab)}

        # error checks
        assert len(word2idx) == vocab_size
        assert len(idx2word) == vocab_size

        setattr(self, "word2idx", word2idx)
        setattr(self, "idx2word", idx2word)

    def __str__(self, name="train"):
        """Summarise one split: sentence count and number of unique words.

        ``name`` defaults to ``"train"`` so that ``str(obj)`` / ``print(obj)``
        work; pass another split name to describe that split instead.
        """
        if not hasattr(self, name + "_corpus"):
            return "no {0} data loaded".format(name)
        return "{0} sentences in {2} data, consisting of {1} unique words".format(
            len(getattr(self, name + "_corpus")),
            getattr(self, name + "_vocab_size"),
            name)

    def generate_pairs(self, name, window_size):
        """Store every (center, context) index pair of the split ``name``.

        The result is written to ``<name>_idx_pairs`` as an integer array with
        one row per pair and two columns (center index, context index). It has
        shape ``(0, 2)`` when no pair exists.

        Raises:
            KeyError: if a word of the split is missing from ``word2idx``.
        """
        corpus = getattr(self, name + "_corpus")

        idx_pairs = []

        for sentence in corpus:
            indices = [self.word2idx[word] for word in sentence]
            # each word is treated as the center word in turn
            for center_word_pos in range(len(indices)):
                # for each offset inside the window
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # stay inside the sentence and skip the center word itself
                    if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                        continue
                    context_word_idx = indices[context_word_pos]
                    idx_pairs.append((indices[center_word_pos], context_word_idx))

        # Explicit dtype and reshape keep the array two-dimensional even when
        # there are no pairs, so column slicing such as [:, 0] always works.
        setattr(self, name + "_idx_pairs", np.array(idx_pairs, dtype=np.int64).reshape(-1, 2))
        
#     def wordFrequencies(self, n=10, update=False):
        
#         if not hasattr(self, "sorted_counts") or update:
#             flattened = [y for x in self.corpus for y in x]
#             self.nwords = len(flattened)
#             counts = Counter(flattened)

#             self.sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
            
#             self.word2freq = {w: f/self.nwords for (w, f) in counts.items()}
        
#         print("Top {0} most common words".format(n))
#         for i in range(n):
#             print(self.sorted_counts[i])

In [5]:
# Read train first so the word/index mappings exist before the other splits are indexed.
dataloader = DataLoader()
for split_name, split_frame in (('train', df_train), ('valid', df_valid), ('test', df_test)):
    dataloader.readData(data=split_frame[0], name=split_name, window_size=1)

# Skip-gram

The objective of the skip-gram model is to find word representations that are useful for predicting the surrounding words in a sentence.

Given a sequence of training words $w_1,w_2,w_3,...,w_T$, the objective is to maximize the average log probability below, where $c$ is the size of the training context (the number of words considered on each side of the center word):
$$\frac{1}{T}\sum_{t=1}^T\sum_{-c\leq j \leq c, j\neq0} \log p(w_{t+j}|w_t)$$

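$p(w_{t+j}|w_t)$ is defined by the softmax function $$p(w_{O}|w_I)=\frac{\exp\big(v_{w_O}'^\intercal v_{w_I}\big)}{\sum_{w=1}^W\exp(v_w'^\intercal v_{w_I})}$$ where $v_w$ and $v_w'$ are the input and output vector representations of word $w$, and $W$ is the number of words in the vocabulary.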

Assumptions:
* Context windows are symmetric, with window size $c \in \mathbb{N}$

In [6]:
class SkipGram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear layer over the vocabulary."""

    def __init__(self, vocab_size, embedding_dim):
        """
        Args:
            vocab_size: number of words; sets the number of embedding rows and
                the number of output scores
            embedding_dim: embedding dimensions
        """
        super(SkipGram, self).__init__()

        # embeddings of the input (center) words; created with sparse=True
        self.in_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=True)

        # output layer: one score per vocabulary word
        self.output = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, x, yt=None):
        """
        Args:
            x: integer tensor of word indices, of any shape
            yt: true y; accepted for compatibility with existing callers but
                not used in the computation

        Returns:
            Log-probabilities over the vocabulary, with the shape of ``x``
            plus one trailing dimension of size ``vocab_size``.
        """
        # get embeddings for input x
        x_vec = self.in_embeddings(x)

        # unnormalised scores for every vocabulary word
        out = self.output(x_vec)

        # Normalise over the last (vocabulary) axis. This equals dim=1 for a
        # 1-D batch of indices and stays correct for other input shapes.
        return F.log_softmax(out, dim=-1)
        
        
# One embedding row per word of the training vocabulary, 300 dimensions each.
model_kwargs = dict(vocab_size=dataloader.train_vocab_size, embedding_dim=300)
net = SkipGram(**model_kwargs)
print(net)

SkipGram(
  (in_embeddings): Embedding(9999, 300, sparse=True)
  (output): Linear(in_features=300, out_features=9999, bias=True)
)


## Convenience functions

In [7]:
def accuracy(ys, ts):
    """Return the fraction of rows in `ys` whose highest-scoring column equals the target in `ts`."""
    # position of the maximum along dim 1 is the predicted class of each row
    _, predicted = ys.max(1)
    # element-wise match against the targets
    hits = predicted.eq(ts)
    # mean of the 0/1 hit mask
    return hits.float().mean()

def get_numpy(x):
    """ Get numpy array for both cuda and not.

    The device is taken from the tensor itself rather than from the global
    `use_cuda` flag, so the function works before that flag is defined.
    """
    # detach() drops the autograd history; cpu() copies only when the tensor is not already on the CPU
    return x.detach().cpu().numpy()

In [8]:
# Later cells branch on this flag to move batches to the GPU.
use_cuda = torch.cuda.is_available()

if use_cuda:
    print("Cuda available")
    # Module.cuda() moves the parameters in place and returns the same module
    net = net.cuda()

In [9]:
def _pairs_to_loader(idx_pairs, batch_size=64, shuffle=True, num_workers=4):
    """Wrap a two-column array of (center, context) indices in a dataset and a batching loader."""
    centers = torch.from_numpy(idx_pairs[:, 0])
    contexts = torch.from_numpy(idx_pairs[:, 1])
    dataset = data_utils.TensorDataset(centers, contexts)
    loader = data_utils.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers)
    return dataset, loader


train, train_loader = _pairs_to_loader(dataloader.train_idx_pairs)

valid, valid_loader = _pairs_to_loader(dataloader.valid_idx_pairs)

## Criterion and optimizer

In [10]:
# Negative log-likelihood loss; the network's forward pass already returns log-probabilities.
criterion = nn.NLLLoss()

# Plain SGD with an initial learning rate of 1.
optimizer = optim.SGD(params=net.parameters(), lr=1)

# Scheduler that reduces the learning rate when the value passed to
# scheduler.step() (the validation loss in the training loop) stops decreasing.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=2,
    threshold=1e-3,
    verbose=True,
)

## Test the network quickly

In [11]:
# Smoke test: run a single mini-batch through the untrained network.
data, target = next(iter(train_loader))

if use_cuda:
    data, target = data.cuda(), target.cuda()

# convert the targets once and reuse them for the forward pass, loss and accuracy
target_idx = target.long()

output = net(data.long(), target_idx)
loss = criterion(output, target_idx)
acc = accuracy(output, target_idx)

print(loss.item(), acc.item())

9.331087112426758 0.0


## Training of the network

In [12]:
# Show the model, the autograd node of the most recent loss, and the optimizer settings.
print(net)
print(loss.grad_fn)
print(optimizer)

num_epochs = 100

N = len(train_loader)  # number of mini-batches per training epoch
log_every = 6000       # print running metrics every `log_every` iterations

# per-epoch history of the averaged metrics
epoch_train_loss, epoch_train_acc, epoch_valid_loss, epoch_valid_acc = [], [], [], []

for epoch in range(num_epochs):

    # running sums, weighted by the number of samples actually seen
    train_loss, train_acc, train_lengths = 0.0, 0.0, 0

    net.train()
    for i, (inputs, labels) in enumerate(train_loader):
        # check if cuda is available
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()

        # convert once instead of on every use
        inputs, labels = inputs.long(), labels.long()
        # The last batch of an epoch can be smaller than loader.batch_size,
        # so weight by the real batch length to get an exact per-sample mean.
        batch_len = labels.size(0)

        # forward + backward + optimize
        output = net(inputs, labels)
        loss = criterion(output, labels)
        acc = accuracy(output, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item() * batch_len
        train_acc += acc.item() * batch_len
        train_lengths += batch_len

        if i % log_every == 0 and i > 0:
            print("Epoch {}, iteration {}/{}, loss: {:.3f}, acc: {:.3f}".format(epoch+1, i+1, N, train_loss / train_lengths, train_acc / train_lengths))

    net.eval()
    val_losses, val_accs, val_lengths = 0.0, 0.0, 0
    # No gradients are needed for evaluation; skipping them avoids building
    # an autograd graph for every validation batch.
    with torch.no_grad():
        for j, (inputs, labels) in enumerate(valid_loader):
            # check if cuda is available
            if use_cuda:
                inputs = inputs.cuda()
                labels = labels.cuda()

            inputs, labels = inputs.long(), labels.long()
            batch_len = labels.size(0)

            # forward only
            output = net(inputs, labels)
            loss = criterion(output, labels)

            val_losses += loss.item() * batch_len
            val_accs += accuracy(output, labels).item() * batch_len
            val_lengths += batch_len

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError
    val_losses /= max(val_lengths, 1)
    val_accs /= max(val_lengths, 1)

    train_loss /= max(train_lengths, 1)
    train_acc /= max(train_lengths, 1)

    epoch_train_acc.append(train_acc)
    epoch_train_loss.append(train_loss)
    epoch_valid_acc.append(val_accs)
    epoch_valid_loss.append(val_losses)

    print("\nTrain loss: {:.3f}, accuracy: {:.3f}".format(train_loss, train_acc))
    print("Validation loss: {:.3f}, accuracy: {:.3f}\n".format(val_losses, val_accs))

    # adjust learning rate if needed
    scheduler.step(val_losses)

SkipGram(
  (in_embeddings): Embedding(9999, 300, sparse=True)
  (output): Linear(in_features=300, out_features=9999, bias=True)
)
<NllLossBackward object at 0x00000235D185C080>
SGD (
Parameter Group 0
    dampening: 0
    lr: 1
    momentum: 0
    nesterov: False
    weight_decay: 0
)
Epoch 1, iteration 6001/26421, loss: 7.117, acc: 0.066
Epoch 1, iteration 12001/26421, loss: 6.909, acc: 0.071
Epoch 1, iteration 18001/26421, loss: 6.799, acc: 0.074
Epoch 1, iteration 24001/26421, loss: 6.724, acc: 0.076

Train loss: 6.700, accuracy: 0.077
Validation loss: 6.543, accuracy: 0.078

Epoch 2, iteration 6001/26421, loss: 6.142, acc: 0.085
Epoch 2, iteration 12001/26421, loss: 6.164, acc: 0.087
Epoch 2, iteration 18001/26421, loss: 6.174, acc: 0.088
Epoch 2, iteration 24001/26421, loss: 6.178, acc: 0.088

Train loss: 6.178, accuracy: 0.089
Validation loss: 6.412, accuracy: 0.089

Epoch 3, iteration 6001/26421, loss: 5.948, acc: 0.092
Epoch 3, iteration 12001/26421, loss: 5.978, acc: 0.092
Ep

Epoch 26, iteration 18001/26421, loss: 4.998, acc: 0.144
Epoch 26, iteration 24001/26421, loss: 4.995, acc: 0.145

Train loss: 4.995, accuracy: 0.145
Validation loss: 5.999, accuracy: 0.136

Epoch 27, iteration 6001/26421, loss: 4.991, acc: 0.145
Epoch 27, iteration 12001/26421, loss: 4.993, acc: 0.145
Epoch 27, iteration 18001/26421, loss: 4.994, acc: 0.145
Epoch 27, iteration 24001/26421, loss: 4.996, acc: 0.145

Train loss: 4.995, accuracy: 0.145
Validation loss: 6.000, accuracy: 0.136

Epoch    26: reducing learning rate of group 0 to 1.0000e-05.
Epoch 28, iteration 6001/26421, loss: 4.998, acc: 0.144
Epoch 28, iteration 12001/26421, loss: 4.998, acc: 0.144
Epoch 28, iteration 18001/26421, loss: 4.996, acc: 0.144
Epoch 28, iteration 24001/26421, loss: 4.995, acc: 0.145

Train loss: 4.995, accuracy: 0.145
Validation loss: 6.000, accuracy: 0.136

Epoch 29, iteration 6001/26421, loss: 4.997, acc: 0.144
Epoch 29, iteration 12001/26421, loss: 4.994, acc: 0.145
Epoch 29, iteration 18001/

KeyboardInterrupt: 

# Notes
- should we include subsampling!? probably :)
- padding?

# Good urls
https://stackoverflow.com/questions/31847682/how-to-compute-skipgrams-in-python
https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb
https://github.com/deborausujono/word2vecpy/blob/master/word2vec.py