In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import operator

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data_utils
import torch.nn.functional as F

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test.

In [2]:
urls = ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt',
        'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt',
        'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt']

In [3]:
df_train = pd.read_table(urls[0], header=None)
df_valid = pd.read_table(urls[1], header=None)
df_test = pd.read_table(urls[2], header=None)

In [4]:
class DataLoader:
    def __init__(self):
        self.corpus = []
        self.vocab = []
    
    def readData(self, data, sep=None):
        self.corpus = [x.split(sep) for x in data]
        
        self.vocab = set([y for x in self.corpus for y in x])
        
        self.vocab_size = len(self.vocab)
        
        self.createIndices()
    
    def createIndices(self):
        self.word2idx = {w: idx for (idx, w) in enumerate(self.vocab)}
        self.idx2word = {idx: w for (idx, w) in enumerate(self.vocab)}
        
        # error checks
        assert len(self.word2idx) == self.vocab_size
        assert len(self.idx2word) == self.vocab_size
        
    def __str__(self):
        return "{0} sentences in data, consisting of {1} unique words".format(len(self.corpus), self.nunique)
    
    def generate_pairs(self, window_size):
        idx_pairs = []

        for sentence in self.corpus:
            indices = [self.word2idx[word] for word in sentence]
            # for each word, threated as center word
            for center_word_pos in range(len(indices)):
                # for each window position
                for w in range(-window_size, window_size + 1):
                    context_word_pos = center_word_pos + w
                    # make soure not jump out sentence
                    if context_word_pos < 0 or context_word_pos >= len(indices) or center_word_pos == context_word_pos:
                        continue
                    context_word_idx = indices[context_word_pos]
                    idx_pairs.append((indices[center_word_pos], context_word_idx))

        self.idx_pairs = np.array(idx_pairs)  
        
    def wordFrequencies(self, n=10, update=False):
        
        if not hasattr(self, "sorted_counts") or update:
            flattened = [y for x in self.corpus for y in x]
            self.nwords = len(flattened)
            counts = Counter(flattened)

            self.sorted_counts = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
            
            self.word2freq = {w: f/self.nwords for (w, f) in counts.items()}
        
        print("Top {0} most common words".format(n))
        for i in range(n):
            print(self.sorted_counts[i])

In [5]:
train_data = DataLoader()
train_data.readData(data=df_train[0])
train_data.generate_pairs(window_size=1)

valid_data = DataLoader()
valid_data.readData(data=df_valid[0])
valid_data.generate_pairs(window_size=1)

test_data = DataLoader()
test_data.readData(data=df_test[0])
test_data.generate_pairs(window_size=1)

In [6]:
train_data.wordFrequencies(n=10)

Top 10 most common words
('the', 50770)
('<unk>', 45020)
('N', 32481)
('of', 24400)
('to', 23638)
('a', 21196)
('in', 18000)
('and', 17474)
("'s", 9784)
('that', 8931)


# Skip-gram

Objective of skip-gram model: figure out word representations that are useful for predicting the surrounding words in a sentence. 

Given a sequence of words for training, $w_1,w_2,w_3,...,w_T$, the objective is then to maximize the average log probability over the training context $c$:
$$\frac{1}{T}\sum_{t=1}^T\sum_{-c\leq j \leq c, j\neq0} \log p(w_{t+j}|w_t)$$

$p(w_{t+j}|w_t)$ is defined by the softmax function $$p(w_{O}|w_I)=\frac{\exp\big(v_{w_O}'^\intercal v_{w_I}\big)}{\sum_{w=1}^W\exp(v_w'^\intercal v_{w_I})}$$

Assumtions:
* context windows are symmetrical with a value $x\in \R $

In [153]:
class SkipGram(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        """
        Attributes:
            vocab_size: number of vocab
            embedding_dim: embedding dimensions
        """
        super(SkipGram, self).__init__()
        
        self.in_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=True)
        self.out_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, sparse=True)
        
        # hidden layer
#         self.hidden = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
        
        # output layer        
#         self.output = nn.Linear(in_features=embedding_dim, out_features=vocab_size)
    
    def forward(self, x, yt):
        """
        Attributes:
            x: input data
            yt: true y
        
        Return:
            out: predictions of words, dim corresponds to vocab size
        """
        
        # get embeddings for input x
        x_vec = self.in_embeddings(x)
        
        # get embeddings for output yt
        yt_vec = self.out_embeddings(yt)

        # calculate output
        out = torch.mul(x_vec, yt_vec)
        
        loss = torch.sum(-F.logsigmoid(out), -1)
        loss = torch.mean(loss)
        
        return out, loss
        
        
net = SkipGram(vocab_size=train_data.vocab_size, embedding_dim=300)
print(net)

SkipGram(
  (in_embeddings): Embedding(9999, 300, sparse=True)
  (out_embeddings): Embedding(9999, 300, sparse=True)
)


## Convenience functions

In [154]:
def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

In [155]:
use_cuda = torch.cuda.is_available() and args.gpu_id > -1

if use_cuda:
    torch.cuda.set_device(args.gpu_id)
    model.cuda()

In [156]:
train = data_utils.TensorDataset(torch.from_numpy(train_data.idx_pairs[:,0]), torch.from_numpy(train_data.idx_pairs[:,1]))
train_loader = data_utils.DataLoader(train, batch_size=64, shuffle=True, num_workers=4)

valid = data_utils.TensorDataset(torch.from_numpy(valid_data.idx_pairs[:,0]), torch.from_numpy(valid_data.idx_pairs[:,1]))
valid_loader = data_utils.DataLoader(valid, batch_size=64, shuffle=True, num_workers=4)

## Criterion and optimizer

In [157]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

## Test the network quickly

In [158]:
data, target = next(iter(train_loader))

if use_cuda:
    data = data.cuda()
    target = target.cuda()

output, loss = net(x = data.long(), yt=target.long())
# loss = criterion(output, target.long())
# acc = accuracy(output, target.long())

print(loss.item())#, acc.item())

238.08462524414062


## Training of the network

should implement batch sizes, as this is too slow :)

In [None]:
num_epochs = 2

log_every = 499
N = len(train_loader)

for epoch in range(num_epochs):
    
    running_loss = 0
    
    net.train()
    for i, (inputs, labels) in enumerate(train_loader):
        
        # zero parameter gradients
        optimizer.zero_grad()
        
        # check if cuda is available
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        
        # forward + backward + optimize
        output, loss = net(inputs.long(), labels.long())
#         loss = criterion(output, labels.long())
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        if i % log_every == 0:
            print("Epoch {}, iteration {}/{}, loss: {:.3f}".format(epoch+1, i+1, N, running_loss / log_every))
            
            running_loss = 0.0  
    
    net.eval()
    val_losses, val_accs, val_lengths = 0, 0, 0
    for j, (inputs, labels) in enumerate(valid_loader):
        # check if cuda is available
        if use_cuda:
            inputs = inputs.cuda()
            labels = labels.cuda()
        # forward + backward
        output, loss = net(inputs.long(), labels.long())
#         loss = criterion(output, labels.long())
        
        val_losses += loss.item() * valid_loader.batch_size
        val_accs += accuracy(output, labels.long()).item() * valid_loader.batch_size
        val_lengths += valid_loader.batch_size
        
    val_losses /= val_lengths
    val_accs /= val_lengths
    
    print("\nValidation loss: {:.3f}, accuracy: {:.3f}".format(val_losses, val_accs))

Epoch 1, iteration 1/26421, loss: 0.476
Epoch 1, iteration 500/26421, loss: 237.557
Epoch 1, iteration 999/26421, loss: 236.343
Epoch 1, iteration 1498/26421, loss: 235.263
Epoch 1, iteration 1997/26421, loss: 234.242
Epoch 1, iteration 2496/26421, loss: 233.274
Epoch 1, iteration 2995/26421, loss: 232.378
Epoch 1, iteration 3494/26421, loss: 231.922
Epoch 1, iteration 3993/26421, loss: 231.097
Epoch 1, iteration 4492/26421, loss: 230.433
Epoch 1, iteration 4991/26421, loss: 229.877
Epoch 1, iteration 5490/26421, loss: 229.292
Epoch 1, iteration 5989/26421, loss: 228.844
Epoch 1, iteration 6488/26421, loss: 228.487
Epoch 1, iteration 6987/26421, loss: 227.869
Epoch 1, iteration 7486/26421, loss: 227.640
Epoch 1, iteration 7985/26421, loss: 227.081
Epoch 1, iteration 8484/26421, loss: 226.834
Epoch 1, iteration 8983/26421, loss: 226.510
Epoch 1, iteration 9482/26421, loss: 226.218
Epoch 1, iteration 9981/26421, loss: 225.813
Epoch 1, iteration 10480/26421, loss: 225.557
Epoch 1, iterati

In [None]:
output

In [98]:
valid_loader.batch_size

64

In [15]:
num_epochs = 4

train_loss, train_accs = [],  []
valid_loss, valid_accs = [],  []

eval_every = 1

net.train()
for epoch in range(num_epochs):    
    for data, target in train_loader:        
        x = data.long()
        y_true = target.long()
        
        optimizer.zero_grad()
        
        output = net(x)
        
        loss = criterion(output['out'], y_true)
        
        loss.backward()
        optimizer.step()
        
        acc = accuracy(output['out'], y_true)
        
        train_loss.append(loss.item())
        train_accs.append(acc)
        
        
    if epoch % eval_every == 0:
        
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
        
        for i, (val_data, val_target) in enumerate(valid_loader):
            x = data.long()
            y_true = target.long()
            
            output = net(x)
            
            loss = criterion(output['out'], y_true)
            acc = accuracy(output['out'], y_true)
            
            val_losses += loss * valid_loader.batch_size
            val_accs += acc * valid_loader.batch_size
            val_lengths += valid_loader.batch_size
            
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        
        print("Epoch: {}, train loss: {:.2f}, accs: {:.2f}".format(epoch, np.mean(train_loss), np.mean(train_accs)))
        print("Epoch: {}, valid loss: {:.2f}, accs: {:.2f}".format(epoch, get_numpy(val_losses), get_numpy(val_accs)))

Epoch: 0, train loss: 9.21, accs: 0.00
Epoch: 0, valid loss: 9.21, accs: 0.00
Epoch: 1, train loss: 9.21, accs: 0.00
Epoch: 1, valid loss: 9.21, accs: 0.00
Epoch: 2, train loss: 9.21, accs: 0.00
Epoch: 2, valid loss: 9.21, accs: 0.00
Epoch: 3, train loss: 9.21, accs: 0.00
Epoch: 3, valid loss: 9.21, accs: 0.00


# Notes
- should we include subsampling!? probably :)
- padding?

# Good urls
https://stackoverflow.com/questions/31847682/how-to-compute-skipgrams-in-python
https://towardsdatascience.com/implementing-word2vec-in-pytorch-skip-gram-model-e6bae040d2fb
https://github.com/deborausujono/word2vecpy/blob/master/word2vec.py