In [None]:
%%time
import nltk
import pandas as pd
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

In [2]:
def get_tokens_tags_from_sents(sents):
    tokens, tags = [], []
    for sent in sents:
        sent_tokens, _, sent_tags = list(zip(*sent))
        tokens.append(sent_tokens)
        tags.append(sent_tags)
    return tokens, tags

In [3]:
%%time
train_tokens, train_tags = get_tokens_tags_from_sents(train_sents)
val_tokens, val_tags = get_tokens_tags_from_sents(test_sents)

CPU times: user 63.9 ms, sys: 6.83 ms, total: 70.7 ms
Wall time: 70.4 ms


You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:

In [4]:
idx = 0
pd.DataFrame([train_tokens[idx], train_tags[idx]])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Melbourne,(,Australia,),",",25,may,(,EFE,),.
1,B-LOC,O,B-LOC,O,O,O,O,O,B-ORG,O,O


### Prepare dictionaries

To train a neural network, we will use two mappings: 
- {token}$\to${token id}: address the row in embeddings matrix for the current token;
- {tag}$\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.

Now you need to implement the function *build_dict* which will return {token or tag}$\to${index} and vice versa. 

In [5]:
from collections import defaultdict

In [6]:
def build_dict(tokens_or_tags, special_tokens):
    # Create a dictionary with default value 0
    tok2idx = defaultdict(lambda: 0)
    ind = 0
    for t in special_tokens:
        tok2idx[t] = ind
        ind += 1
    for sam in tokens_or_tags:
        for t in sam:
            if t not in special_tokens and t not in tok2idx:
                tok2idx[t] = ind
                ind += 1
    return tok2idx, dict((v,k) for k,v in tok2idx.items())

After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens for tokens will be:
 - `<UNK>` token for out of vocabulary tokens; index = 0
 - `<PAD>` token for padding sentence to the same length when we create batches of sentences. index = 1

In [None]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries 
token2idx, idx2token = build_dict(train_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

### Generate batches

Neural Networks are usually trained with batches. It means that weight updates of the network are based on several sequences at every single time. The tricky part is that all sequences within a batch need to have the same length. So we will pad them with a special `<PAD>` token. It is also a good practice to provide RNN with sequence lengths, so it can skip computations for padding parts. We provide the batching function *batches_generator* readily available for you to save time. 

In [8]:
def prepare_sequence(seq, to_ix):
    return torch.tensor(np.vectorize(lambda t: to_ix.get(t, to_ix.get("<UNK>", to_ix['O'])))(seq), dtype=torch.long)

In [9]:
def batches_generator(batch_size, tokens, tags,
                      shuffle=True, allow_smaller_last_batch=True, seed=8):
    """Generates padded batches of tokens and tags."""
    # TODO: use DataLoader from Pytorch for this
    # tokens is a list of docs, and each docs is a list of tokens
    # SHUFFLE
    n_samples = len(tokens)
    np.random.seed(seed)
    if shuffle:
        order = np.random.permutation(n_samples)
    else:
        order = np.arange(n_samples)

    # NUMBER OF BATCHES
    n_batches = n_samples // batch_size
    # and n_samples / batch_size not integer, put the leftovers in last batch
    if allow_smaller_last_batch and n_samples % batch_size:
        n_batches += 1

    # for each batch, get the docs, labels and real lengths and yield them
    for k in range(n_batches):
        batch_start = k * batch_size
        batch_end = min((k + 1) * batch_size, n_samples)
        x, y = [], []
        batch_lengths = torch.zeros(batch_end - batch_start, dtype=torch.int32)
        # x will be a list of lists of indices (one list of indices per doc in this batch)
        for sample_in_batch_index, sample_idx in enumerate(order[batch_start: batch_end]):
            try:
                x.append(prepare_sequence(tokens[sample_idx], token2idx))
                y.append(prepare_sequence(tags[sample_idx], tag2idx))
            except Exception as marc:
                print(marc)
                import pdb; pdb.set_trace()
            batch_lengths[sample_in_batch_index] = len(tags[sample_idx])
        x = pad_sequence(x, batch_first=True, padding_value=token2idx["<PAD>"])
        y = pad_sequence(y, batch_first=True, padding_value=-1)
        batch_lengths, perm_idx = batch_lengths.sort(0, descending=True)
        # yield each batch
        yield x[perm_idx, ...], y[perm_idx, ...], batch_lengths

## Build a recurrent neural network

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.nn.utils.rnn import (
    pack_padded_sequence, pad_sequence, pad_packed_sequence
)
from torch.nn.utils import clip_grad_norm_
torch.manual_seed(1)
import numpy as np
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score

In [None]:
class LSTMTagger(nn.Module):

    def __init__(self, batch_size, embedding_dim, hidden_dim, vocab_size, tagset_size,
                padding_idx, verbose=False, bidirectional=False):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                           bidirectional=bidirectional)
        self.tagset_size = tagset_size
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear((1+bidirectional)*hidden_dim, tagset_size)
        self.verbose = verbose

    def forward(self, X, X_lens):
        # embeddings
        embeds = self.word_embeddings(X)
        if self.verbose: print(f"Embeds: {embeds.size()}")
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        embeds = pack_padded_sequence(embeds, X_lens.cpu().numpy(), batch_first=True)
        # lstm
        #lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out, _ = self.lstm(embeds)
        # undo the packing operation
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        if self.verbose: print(f"lstm_out: {lstm_out.size()}")
        # (batch_size, seq_len, hidden_dim) --> (batch_size * seq_len, hidden_dim)
        s = lstm_out.contiguous().view(-1, lstm_out.shape[2])
        # (batch_size * seq_len, hidden_dim) --> (batch_size * seq_len, tag_dim)
        tag_space = self.hidden2tag(lstm_out)
        if self.verbose: print(f"tag space: {tag_space.size()}")
        # normalize logits
        tag_scores = F.log_softmax(tag_space, dim=1)
        if self.verbose: print(f"tag scores: {tag_scores.size()}")
        return tag_scores
    
    def loss(self, Y_hat, Y):
        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        return criterion(Y_hat.view(-1, Y_hat.size()[2]), Y.view(-1))

### Evaluation helpers

In [13]:
labels_to_score = list(tag2idx.keys())
labels_to_score.remove('O')
labels_to_score

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [14]:
# group B and I results
sorted_labels = sorted(
    labels_to_score, 
    key=lambda name: (name[1:], name[0])
)

In [27]:
def predict_tags(model, batch_tokens, batch_lengths, batch_tags=None):
    """Performs predictions and transforms indices to tokens and tags."""
    
    tag_scores = model(batch_tokens, batch_lengths)
    predicted_tags = np.vectorize(idx2tag.get)(torch.argmax(tag_scores, dim=2).data.numpy())
    if batch_tags is not None:
        return predicted_tags, model.loss(tag_scores, batch_tags)
    return predicted_tags


def my_scorer(true_tags, predicted_tags):
    logger.info(flat_f1_score(true_tags, predicted_tags, average='weighted', labels=sorted_labels))
    logger.info(flat_classification_report(
        true_tags, predicted_tags, labels=sorted_labels, digits=3
    ))


def eval_model_for_set(model, tokens, tags, scoring=my_scorer):
    """Computes NER quality measures given model and a dataset"""
    model.eval()
    predicted_tags, true_tags, loss = [], [], 0
    with torch.no_grad():
        for x_batch, y_batch, lengths in batches_generator(1, tokens, tags):
            padded_predicted_tags, batch_loss = predict_tags(model, x_batch, lengths, y_batch)
            loss += batch_loss
            padded_true_tags = np.vectorize(idx2tag.get)(y_batch.data)
            for x, y, l in zip(padded_predicted_tags, padded_true_tags, lengths): 
                predicted_tags.append(x[:l])
                true_tags.append(y[:l])    
        scoring(true_tags, predicted_tags)
        return loss / len(true_tags)

In [16]:
def print_example(training_data, i, model, word2idx, idx2tag):
    # Note that element i,j of tag_scores is the score for tag j for word i.
    # Here we don't need to train, so the code is wrapped in torch.no_grad()
    with torch.no_grad():
        seq = training_data[0][i]
        labs = training_data[1][i]
        inputs = prepare_sequence(seq, word2idx)
        tag_scores = model(inputs.view(1, len(inputs)), 
                           torch.tensor([len(seq)]))
        tags = np.vectorize(idx2tag.get)(torch.argmax(tag_scores, dim=2).data.numpy())
        print(seq)
        print()
        print(tags)
        print()
        print(len(seq), tag_scores.size(), tags.shape)
        print()
        print(training_data[1][i])
        print(training_data[1][i] == tags)       
#print_example(training_data, 79, model, token2idx, idx2tag)

## Set hyperparams and train the model

In [28]:
EMBEDDING_DIM = 200
HIDDEN_DIM = 200
BATCH_SIZE = 32
EPOCHS = 50
VOCAB_SIZE = len(token2idx)
TAGSET_SIZE = len(tag2idx)
PADDING_IDX = token2idx["<PAD>"]
training_data = (train_tokens, train_tags)
model = LSTMTagger(BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE, 
                   TAGSET_SIZE, PADDING_IDX, verbose=False, bidirectional=True)
optimiser = torch.optim.Adam(model.parameters(), lr=0.005)

In [30]:
%%time
# print predictions before training
#print_example(training_data, 123, model, token2idx, idx2tag)
logger.info("START!")
train_loss, val_loss = [], []
for epoch in range(EPOCHS): 
    train_loader = batches_generator(BATCH_SIZE, train_tokens, train_tags, seed=epoch)
    epoch_loss = 0
    model.train()
    for idx_batch, batch in enumerate(train_loader):
        batch_sents, batch_tags, batch_lens = batch
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        # Step 2. Run our forward pass.
        tag_scores = model(batch_sents, batch_lens)
        # Step 3. Compute the loss, gradients, and update the parameters
        loss = model.loss(tag_scores, batch_tags)
        loss.backward()
        epoch_loss += float(loss)
        clip_grad_norm_(model.parameters(), 5)
        optimiser.step()
        if (idx_batch + 1) % 970 == 0:
            logger.info(f'Epoch [{epoch + 1}/{EPOCHS}], '
                  f"Step [{idx_batch + 1}/{len(train_tags)// BATCH_SIZE}], "
                  f"Loss: {loss:.4f}")
        
    logger.info(f"avg epoch {epoch + 1} train loss: {epoch_loss/(idx_batch + 1):.4f}")
    if ((epoch + 1) % 5) == 0:
        logger.info("**********TRAINING PERFORMANCE*********")
        loss = eval_model_for_set(model, train_tokens, train_tags)
        train_loss.append(loss)
        logger.info("**********VALIDATION PERFORMANCE*********")
        loss = eval_model_for_set(model, val_tokens, val_tags)
        val_loss.append(loss)
        logger.info(f"Loss: {loss}")

# print predictions after training
#print_example(training_data, 123, model, token2idx, idx2tag)
#print(training_data[1][123])

2019-03-08 13:56:41,937 - START!
2019-03-08 13:59:20,601 - avg epoch 1 train loss: 0.3225
2019-03-08 14:01:47,314 - avg epoch 2 train loss: 0.1231
2019-03-08 14:04:11,705 - avg epoch 3 train loss: 0.0644
2019-03-08 14:06:37,306 - avg epoch 4 train loss: 0.0391
2019-03-08 14:09:09,900 - avg epoch 5 train loss: 0.0275
2019-03-08 14:09:09,901 - **********TRAINING PERFORMANCE*********
2019-03-08 14:10:15,848 - 0.6647981460439756
2019-03-08 14:10:17,747 -               precision    recall  f1-score   support

       B-LOC      0.640     0.700     0.669      4913
       I-LOC      0.415     0.682     0.516      1891
      B-MISC      0.461     0.771     0.577      2173
      I-MISC      0.382     0.660     0.484      3212
       B-ORG      0.752     0.720     0.736      7390
       I-ORG      0.690     0.591     0.637      4992
       B-PER      0.610     0.831     0.704      4321
       I-PER      0.725     0.863     0.788      3903

   micro avg      0.601     0.724     0.657     32795
   

2019-03-08 15:17:20,553 - **********VALIDATION PERFORMANCE*********
2019-03-08 15:17:32,846 - 0.5427281004220501
2019-03-08 15:17:33,198 -               precision    recall  f1-score   support

       B-LOC      0.554     0.551     0.553      1084
       I-LOC      0.230     0.557     0.326       325
      B-MISC      0.251     0.528     0.341       339
      I-MISC      0.240     0.463     0.316       557
       B-ORG      0.691     0.611     0.648      1400
       I-ORG      0.627     0.530     0.574      1104
       B-PER      0.495     0.653     0.563       735
       I-PER      0.544     0.754     0.632       634

   micro avg      0.471     0.585     0.522      6178
   macro avg      0.454     0.581     0.494      6178
weighted avg      0.528     0.585     0.543      6178

2019-03-08 15:17:33,203 - Loss: 0.6535874605178833
2019-03-08 15:20:07,192 - avg epoch 26 train loss: 0.0077
2019-03-08 15:22:41,176 - avg epoch 27 train loss: 0.0067
2019-03-08 15:25:17,347 - avg epoch 28 trai

2019-03-08 16:15:04,631 - Loss: 0.6858983635902405
2019-03-08 16:17:44,831 - avg epoch 46 train loss: 0.0049
2019-03-08 16:20:25,213 - avg epoch 47 train loss: 0.0052
2019-03-08 16:23:06,634 - avg epoch 48 train loss: 0.0073
2019-03-08 16:25:47,544 - avg epoch 49 train loss: 0.0072
2019-03-08 16:28:28,366 - avg epoch 50 train loss: 0.0060
2019-03-08 16:28:28,366 - **********TRAINING PERFORMANCE*********
2019-03-08 16:29:46,090 - 0.7515934796303456
2019-03-08 16:29:48,019 -               precision    recall  f1-score   support

       B-LOC      0.731     0.735     0.733      4913
       I-LOC      0.501     0.864     0.634      1891
      B-MISC      0.563     0.821     0.668      2173
      I-MISC      0.451     0.788     0.574      3212
       B-ORG      0.845     0.786     0.815      7390
       I-ORG      0.766     0.799     0.782      4992
       B-PER      0.717     0.850     0.778      4321
       I-PER      0.768     0.919     0.837      3903

   micro avg      0.685     0.812 

CPU times: user 5h 8min 56s, sys: 15min 10s, total: 5h 24min 7s
Wall time: 2h 33min 19s


In [24]:
eval_model_for_set(model, train_tokens, train_tags)

2019-03-08 13:33:40,768 - 0.8425425855009366
2019-03-08 13:33:42,699 -               precision    recall  f1-score   support

       B-LOC      0.880     0.848     0.864      4913
       I-LOC      0.474     0.917     0.625      1891
      B-MISC      0.655     0.947     0.774      2173
      I-MISC      0.490     0.940     0.644      3212
       B-ORG      0.933     0.923     0.928      7390
       I-ORG      0.849     0.935     0.890      4992
       B-PER      0.817     0.902     0.858      4321
       I-PER      0.801     0.984     0.883      3903

   micro avg      0.754     0.921     0.829     32795
   macro avg      0.737     0.924     0.808     32795
weighted avg      0.793     0.921     0.843     32795



tensor(0.4153)

## From hereafter, things to experiment, e.g. Dropout