In [1]:
seed = 42
import numpy as np
np.random.seed(seed)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(seed)

<torch._C.Generator at 0x104466bd0>

# Data loading
The Penn Treebank datafiles are given in the urls below, where we have three different datasets: train, validation and test. Data was downloaded from [train](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt), [validation](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt) and [test](https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt). 

In [2]:
def load_corpus(path): 
    corpus = []
    with open(path, 'r') as infile: 
        for line in infile: 
            line = line[:-1].split()
            corpus.append(line)
    return corpus

In [3]:
# Make corpus
def corpus_counts(path, verbose=False): 
    # Open training file 
    corpus = load_corpus(path=path)

    # Count occurrences
    unique, counts = np.unique(np.array([item for sublist in corpus for item in sublist]), return_counts=True)
    corpus_counts = dict(zip(unique, counts))
    
    if verbose: 
        for v, k in sorted(zip(counts, unique), reverse=True): 
            print('Key is "{0}" with count {1}'.format(k, v))
    
    # Build vocabulary
    vocab_size = len(corpus_counts)
    word_to_idx = {word: i+1 for i, word in enumerate(corpus_counts.keys())}
    word_to_idx['padding'] = 0
    
    return corpus, corpus_counts, word_to_idx

## Data loader

In [4]:
# Function to make context pairs
def make_context_pairs(data, word_to_idx, window_size=2): 
    # Run through each sample
    word_data = []
    for line in data: 
        # Add padding corresponding to the size of the window on either side
        padding = ['padding']*window_size
        line = padding+line+padding
        
        # Make contexts
        for i in range(window_size, len(line) - window_size):
            context, c = [], -window_size
            while c <= window_size:
                if c != 0: 
                    context.append(line[i+c])
                c += 1
            word_data.append((context, line[i]))
    
    return word_data

## Load data

In [5]:
# Make corpus
train, train_counts, word_to_idx = corpus_counts(path='data/ptb.train.txt', verbose=False)

In [6]:
# Set window size 
ws = 1

In [7]:
# Load training data
#words, word_to_idx = data_loader(path='data/ptb.train.txt', window_size=2)
train_words = make_context_pairs(data=train, word_to_idx=word_to_idx, window_size=ws)

In [8]:
# Check word contexts
word_sum = len(train_words)
print(word_sum)
for context, word in train_words[:10]: 
    #s, t = context_pair
    #print(s, t)
    print(context, word)

887521
['padding', 'banknote'] aer
['aer', 'berlitz'] banknote
['banknote', 'calloway'] berlitz
['berlitz', 'centrust'] calloway
['calloway', 'cluett'] centrust
['centrust', 'fromstein'] cluett
['cluett', 'gitano'] fromstein
['fromstein', 'guterman'] gitano
['gitano', 'hydro-quebec'] guterman
['guterman', 'ipo'] hydro-quebec


In [9]:
# Load validation data
valid = load_corpus(path='data/ptb.valid.txt')
valid_words = make_context_pairs(data=valid, word_to_idx=word_to_idx, window_size=ws)

After data has been loaded it is good to check what is looks like. 

In [10]:
print('Number of training samples:\t', len(train_words))
print('Number of validation samples:\t', len(valid_words))

Number of training samples:	 887521
Number of validation samples:	 70390


# CBOW class

In [11]:
class cbow(nn.Module):

    def __init__(self, vocab_size, embedding_dim, window_size):
        super(cbow, self).__init__()
        # num_embeddings is the number of words in your train, val and test set
        # embedding_dim is the dimension of the word vectors you are using
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim, 
                                      padding_idx=0)
        self.linear1 = nn.Linear(in_features=window_size * embedding_dim, out_features=128, bias=True)
        self.linear2 = nn.Linear(in_features=128, out_features=vocab_size, bias=False)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((1, -1))
        out = F.relu(self.linear1(embeds))
        out = self.linear2(out)
        probs = F.softmax(out, dim=1)
        return probs

# Model training

In [15]:
# Set loss, model and optimizer
loss_function = nn.CrossEntropyLoss()
model = cbow(vocab_size=len(train_counts), embedding_dim=4, window_size=ws*2)
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [16]:
# Look at loaded model
model

cbow(
  (embeddings): Embedding(9999, 4, padding_idx=0)
  (linear1): Linear(in_features=8, out_features=128, bias=True)
  (linear2): Linear(in_features=128, out_features=9999, bias=False)
)

In [19]:
losses = []
max_epochs = 10
max_samples = 100
train_batch = train_words[:max_samples]
for epoch in range(max_epochs):
    print('\n# Epoch {0}/{1}'.format(epoch+1, max_epochs))
    total_loss = 0
    i = 0
    model.train()
    for context, target in train_batch:
        #print('\tSample {0}/{1}'.format(i, max_samples))
        # Step 1. Prepare the inputs to be passed to the model (i.e, turn the words
        # into integer indices and wrap them in tensors)
        vec_context = torch.tensor([word_to_idx[w] for w in context], dtype=torch.long)
        #vec_context = make_context_vector(context=context, word_to_ix=word_to_idx)

        # Step 2. Recall that torch *accumulates* gradients. Before passing in a
        # new instance, you need to zero out the gradients from the old instance
        model.zero_grad()

        # Step 3. Run the forward pass, getting probabilities over next words
        probs = model(vec_context)

        # Step 4. Compute your loss function. (Again, Torch wants the target
        # word wrapped in a tensor)
        loss = loss_function(probs, torch.tensor([word_to_idx[target]], dtype=torch.long))

        # Step 5. Do the backward pass and update the gradient
        loss.backward()
        optimizer.step()

        # Get the Python number from a 1-element Tensor by calling tensor.item()
        total_loss += loss.item()
        
        i += 1
    losses.append(total_loss)
print(losses)  # The loss decreased every iteration over the training data!


# Epoch 1/10

# Epoch 2/10

# Epoch 3/10

# Epoch 4/10

# Epoch 5/10

# Epoch 6/10

# Epoch 7/10

# Epoch 8/10

# Epoch 9/10

# Epoch 10/10
[921.024097442627, 921.024097442627, 921.024097442627, 921.024097442627, 921.024097442627, 921.024097442627, 921.024097442627, 921.0240964889526, 921.0240964889526, 921.0240964889526]


# Notes 