In [1]:
seed = 42
import numpy as np
np.random.seed(seed)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch.utils.data as data_utils
torch.manual_seed(seed)

from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import sys

In [2]:
# General parameters 
pad = False  # Padding
ws = 2       # Window_size
bs = 1000    # Batch size

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test. Data was downloaded from [train](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt), [validation](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt) and [test](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt). 

In [3]:
# Data loader class
class DataLoader: 
    def __init__(self):
        self.corpus = []
    
    # Load words
    def load_corpus(self, path): 
        with open(path, 'r') as infile: 
            for line in infile: 
                line = line[:-1].split()
                self.corpus.append(line)
    
    # Make dict
    def count_corpus(self, padding=True, verbose=False): 
        # Count occurrences
        unique, counts = np.unique(np.array([item for sublist in self.corpus for item in sublist]), return_counts=True)
        self.corpus_counts = dict(zip(unique, counts))

        if verbose: 
            for v, k in sorted(zip(counts, unique), reverse=True): 
                print('Key is "{0}" with count {1}'.format(k, v))

        # Build vocabulary
        if padding: 
            indices = list(range(1,len(unique+1)))
            self.word_to_idx = dict(zip(sorted(unique, indices)))
            #self.word_to_idx = {word: i+1 for i, word in enumerate(self.corpus_counts.keys())}
            self.word_to_idx['padding'] = 0
        else: 
            indices = list(range(len(unique)))
            self.word_to_idx = dict(zip(sorted(unique), indices))
            #self.word_to_idx = {word: i for i, word in enumerate(self.corpus_counts.keys())}
        
        # Make reverse dict
        self.idx_to_word = {idx: w for idx, w in self.word_to_idx.items()}
            
    # Function to make context pairs
    def make_context_pairs(self, window_size=2, padding=True): 
        self.window_size = window_size
        
        # Run through each sample
        self.word_data = []
        for line in self.corpus: 
            if padding: 
                # Add padding corresponding to the size of the window on either side
                padding = ['padding']*window_size
                line = padding+line+padding

            # Make contexts
            for i in range(window_size, len(line) - window_size):
                context, c = [], -window_size
                while c <= window_size:
                    if c != 0: 
                        context.append(line[i+c])
                    c += 1
                self.word_data.append((context, line[i]))

    # Convert word_data to numpy array tuples
    def words_to_index(self, word2idx):
        if hasattr(self, 'window_size'): 
            # Pre-allocate
            data = np.empty((len(self.word_data), self.window_size*2), dtype=int)
            labels = np.empty((len(self.word_data)), dtype=int)
            print(data.shape)
            
            # Run through context pairs and fill arrays
            i = 0
            for d, l in self.word_data: 
                data[i, :] = np.array([word2idx[w] for w in d])
                labels[i,] = word2idx[l]
 
                i += 1
                
            # Save as tuple
            self.context_array = (data, labels)
        else: 
            print('# Make context pairs, first!')
            sys.exit(1)

## Training data

In [4]:
# Get training data
train_data = DataLoader()
train_data.load_corpus(path='data/ptb.train.txt')
train_data.count_corpus(padding=pad)

# Make context pairs for training data
train_data.make_context_pairs(window_size=ws, padding=pad)

In [None]:
# Check word contexts
word_sum = len(train_data.word_data)
print(word_sum)
for context, word in train_data.word_data[:10]: 
    print(context, word)

720533
['aer', 'banknote', 'calloway', 'centrust'] berlitz
['banknote', 'berlitz', 'centrust', 'cluett'] calloway
['berlitz', 'calloway', 'cluett', 'fromstein'] centrust
['calloway', 'centrust', 'fromstein', 'gitano'] cluett
['centrust', 'cluett', 'gitano', 'guterman'] fromstein
['cluett', 'fromstein', 'guterman', 'hydro-quebec'] gitano
['fromstein', 'gitano', 'hydro-quebec', 'ipo'] guterman
['gitano', 'guterman', 'ipo', 'kia'] hydro-quebec
['guterman', 'hydro-quebec', 'kia', 'memotec'] ipo
['hydro-quebec', 'ipo', 'memotec', 'mlx'] kia


In [None]:
# Convert to numpy
train_data.words_to_index(word2idx=train_data.word_to_idx)

(720533, 4)


## Validation data

In [None]:
# Get validation data
valid_data = DataLoader()
valid_data.load_corpus(path='data/ptb.valid.txt')

# Make context pairs for validation data
valid_data.make_context_pairs(window_size=ws, padding=pad)

# Convert to numpy
valid_data.words_to_index(word2idx=train_data.word_to_idx)

(57004, 4)


After data has been loaded it is good to check what is looks like. 

In [None]:
print('Number of training samples:\t', train_data.context_array[0].shape)
print('Number of validation samples:\t', valid_data.context_array[0].shape)

Number of training samples:	 (720533, 4)
Number of validation samples:	 (57004, 4)


# CBOW class

In [None]:
class cbow(nn.Module):

    def __init__(self, vocab_size, window_size, embedding_dim=2, n_hid=128, padding=True):
        super(cbow, self).__init__()
        # num_embeddings is the number of words in your train, val and test set
        # embedding_dim is the dimension of the word vectors you are using
        if padding: 
            self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, 
                                          padding_idx=0)
        else: 
            self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, 
                                          padding_idx=None)
        
        self.linear1 = nn.Linear(in_features=window_size * embedding_dim, out_features=n_hid, bias=True)
        self.linear2 = nn.Linear(in_features=n_hid, out_features=vocab_size, bias=False)
        self.window = window_size
        self.embed_dim = embedding_dim

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        #print(embeds.shape, embeds.view((-1, self.window*self.embed_dim)).shape)
        out = F.relu(self.linear1(embeds.view((-1, self.window*self.embed_dim))))
        out = self.linear2(out)
        probs = F.softmax(out, dim=1)
        return probs

In [None]:
# Estimate performance
def accuracy(y_true, y_pred):
    # Make y_pred for the word with max probability
    values, indices = torch.max(input=y_pred, dim=1)
    
    # Check if indices match
    check = torch.eq(indices, y_true)
    
    # Estimate accuracy
    acc = check.sum()/len(check)
    return acc.item()

# Model parameters

In [None]:
# Pytorch batch_loader
train = data_utils.TensorDataset(torch.from_numpy(train_data.context_array[0]), torch.from_numpy(train_data.context_array[1]))
load_train = data_utils.DataLoader(train, batch_size=bs, shuffle=True)

valid = data_utils.TensorDataset(torch.from_numpy(valid_data.context_array[0]), torch.from_numpy(valid_data.context_array[1]))
load_valid = data_utils.DataLoader(valid, batch_size=bs, shuffle=True)

In [None]:
# Set loss, model and optimizer
criterion = nn.CrossEntropyLoss()
net = cbow(vocab_size=len(train_data.word_to_idx), window_size=ws*2, embedding_dim=100, n_hid=100, padding=pad)
optimizer = optim.Adam(net.parameters(), lr=0.01)
#optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)

In [None]:
# Look at loaded model
net

cbow(
  (embeddings): Embedding(9999, 100)
  (linear1): Linear(in_features=400, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=9999, bias=False)
)

In [None]:
# Test the network 
data, target = next(iter(load_train))
output = net(data.long())
loss = criterion(output, target)

# Estimate accuracy
acc = accuracy(y_true=target, y_pred=output)

print('Loss:', loss.item(), '\tAccuracy:', acc)

Loss: 9.210275650024414 	Accuracy: 0


# Model training

In [None]:
# Get batch slice
def get_batch(batch_size, i):
    start_idx = i*batch_size
    end_idx = (i+1)*batch_size
    return start_idx, end_idx

def batch_to_context(batch, word_to_idx): 
    x_batch, y_batch =  [], []
    
    # Run through samples
    for context, target in batch: 
        x = [word_to_idx[w] for w in context]
        y = word_to_idx[target]
        
        # Append idx words to batch
        x_batch.append(x)
        y_batch.append(y)
    
    # Make tensors
    x_batch = torch.tensor(x_batch, dtype=torch.long)
    y_batch = torch.tensor(y_batch, dtype=torch.long)
    return x_batch, y_batch

In [None]:
losses = []
train_acc, train_loss = [], []
valid_acc, valid_loss = [], []
max_epochs = 10

# Run through epochs
for epoch in range(max_epochs):
    #print('# Epoch {0}/{1}'.format(epoch+1, max_epochs))
    
    ### Train ###
    current_loss = 0
    net.train()
    for i, (inputs, labels) in enumerate(load_train):

        # Zero gradient
        net.zero_grad()

        # Step 3. Run the forward pass, getting probabilities over next words
        probs = net(inputs)

        # Step 4. Compute your loss function.
        loss = criterion(probs, labels)

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        current_loss += loss.item()  
        
        if i % 50 == 0:
            print('Epoch {0}/{1}, Batch: {2}/{3}, Loss: {4}'.format(epoch+1, max_epochs, i+1, 
                                                                    len(load_train), current_loss/(i+1)))
        
    losses.append(current_loss)

    
    ### Evaluate training ###
    net.eval()
    train_preds, train_targs = [], []
    for i, (inputs, labels) in enumerate(load_train):
        # Get predictions
        output = net(inputs)
        preds = torch.max(inputs=output, dim=1)[1]
        
        train_targs += list(y_batch)
        train_preds += list(preds.data.numpy())
    
    ### Evaluate validation ###
    val_preds, val_targs = [], []
    for i, (inputs, labels) in enumerate(load_train):
        output = net(inputs)
        preds = torch.max(inputs=output, dim=1)[1]
        val_preds += list(preds.data.numpy())
        val_targs += list(y_batch)

    train_acc_cur = accuracy(y_true=train_targs, y_pred=train_preds)
    valid_acc_cur = accuracy(y_true=val_targs, y_pred=val_preds)
    
    train_acc.append(train_acc_cur)
    valid_acc.append(valid_acc_cur)
    
    if epoch % 10 == 0:
        print("Epoch %2i : Train Loss %f , Train acc %f, Valid acc %f\n" % (
                epoch+1, losses[-1], train_acc_cur, valid_acc_cur))

Epoch 1/10, Batch: 1/721, Loss: 9.210273742675781
Epoch 1/10, Batch: 51/721, Loss: 9.119373321533203


In [None]:
print(losses)  # The loss does not decrease that much...

In [None]:
# Plot training and validation performances
epoch = np.arange(max_epochs)
plt.figure()
plt.plot(epoch, train_acc, 'r', epoch, valid_acc, 'b')
plt.legend(['Train Acc', 'Val Acc'])
plt.xlabel('Epochs')
plt.ylabel('Acc')
plt.show()

In [None]:
### Evaluate test set
#x_batch = Variable(torch.from_numpy(x_test))
#output = net(x_batch)
#preds = torch.max(output, 1)[1]
#print("\nTest set Acc:  %f" % (accuracy_score(list(targets_test), list(preds.data.numpy()))))

# Notes 