In [1]:
%%time
import nltk
import pandas as pd
import logging

CPU times: user 725 ms, sys: 272 ms, total: 997 ms
Wall time: 1.32 s


In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

## $(token, pos, tag)^N$ --> $(tokens, tags)$

In [3]:
%%time
from data_preparation import get_tokens_tags_from_sents
train_tokens, train_tags = get_tokens_tags_from_sents(train_sents)
val_tokens, val_tags = get_tokens_tags_from_sents(test_sents)

CPU times: user 205 ms, sys: 60.4 ms, total: 266 ms
Wall time: 341 ms


You should always understand what kind of data you deal with. For this purpose, you can print the data running the following cell:

In [4]:
idx = 0
pd.DataFrame([train_tokens[idx], train_tags[idx]])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Melbourne,(,Australia,),",",25,may,(,EFE,),.
1,B-LOC,O,B-LOC,O,O,O,O,O,B-ORG,O,O


### Prepare mappings

To train a neural network, we will use two mappings:
- {token}$\to${token id}: address the row in embeddings matrix for the current token;
- {tag}$\to${tag id}: one-hot ground truth probability distribution vectors for computing the loss at the output of the network.

Now you need to implement the function *build_dict* which will return {token or tag}$\to${index} and vice versa.

After implementing the function *build_dict* you can make dictionaries for tokens and tags. Special tokens for tokens will be:
 - `<UNK>` token for out of vocabulary tokens; index = 0
 - `<PAD>` token for padding sentence to the same length when we create batches of sentences. index = 1

In [5]:
special_tokens = ['<UNK>', '<PAD>']
special_tags = ['O']

# Create dictionaries
from data_preparation import build_dict
token2idx, idx2token = build_dict(train_tokens, special_tokens)
tag2idx, idx2tag = build_dict(train_tags, special_tags)

### Generate batches

Neural Networks are usually trained with batches. It means that weight
updates of the network are based on several sequences at every single time.
The tricky part is that all sequences within a batch need to have the same
length. So we will pad them with a special `<PAD>` token. It is also a good
practice to provide RNN with sequence lengths, so it can skip computations
for padding parts. We provide the batching function *batches_generator*
readily available for you to save time.

In [6]:
from data_preparation import batches_generator

### Model

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.nn.utils.rnn import (
    pack_padded_sequence, pad_packed_sequence
)
from torch.nn.utils import clip_grad_norm_
torch.manual_seed(1)

<torch._C.Generator at 0x1a1904b550>

In [8]:
class LSTMTagger(nn.Module):

    def __init__(self, batch_size, embedding_dim, hidden_dim, vocab_size,
                 tagset_size, padding_idx, verbose=False, bidirectional=False):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.word_embeddings = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.bidirectional = bidirectional
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True,
                            bidirectional=bidirectional)
        self.tagset_size = tagset_size
        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear((1+bidirectional)*hidden_dim, tagset_size)
        self.verbose = verbose

    def forward(self, X, X_lens):
        # embeddings
        embeds = self.word_embeddings(X)
        if self.verbose: print(f"Embeds: {embeds.size()}")
        # pack_padded_sequence so that padded items in the sequence won't be
        # shown to the LSTM
        embeds = pack_padded_sequence(embeds, X_lens.cpu().numpy(), batch_first=True)
        # lstm
        #lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out, _ = self.lstm(embeds)
        # undo the packing operation
        lstm_out, _ = pad_packed_sequence(lstm_out, batch_first=True)
        if self.verbose: print(f"lstm_out: {lstm_out.size()}")
        # (batch_size, seq_len, hidden_dim) --> (batch_size * seq_len, hidden_dim)
        s = lstm_out.contiguous().view(-1, lstm_out.shape[2])
        # (batch_size * seq_len, hidden_dim) --> (batch_size * seq_len, tag_dim)
        tag_space = self.hidden2tag(lstm_out)
        if self.verbose: print(f"tag space: {tag_space.size()}")
        # normalize logits
        tag_scores = F.log_softmax(tag_space, dim=1)
        if self.verbose: print(f"tag scores: {tag_scores.size()}")
        return tag_scores

    @staticmethod
    def loss(y_hat, y):
        criterion = nn.CrossEntropyLoss(ignore_index=-1)
        return criterion(y_hat.view(-1, y_hat.size()[2]), y.view(-1))

### Evaluation helpers

In [9]:
labels_to_score = list(tag2idx.keys())
labels_to_score.remove('O')
labels_to_score

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']

In [10]:
# group B and I results
sorted_labels = sorted(
    labels_to_score,
    key=lambda name: (name[1:], name[0])
)

In [11]:
from evaluation import eval_model_for_set

## Set hyperparams and train the model

In [12]:
EMBEDDING_DIM = 200
HIDDEN_DIM = 200
BATCH_SIZE = 32
EPOCHS = 5
VOCAB_SIZE = len(token2idx)
TAGSET_SIZE = len(tag2idx)
PADDING_IDX = token2idx["<PAD>"]
training_data = (train_tokens, train_tags)
model = LSTMTagger(BATCH_SIZE, EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE,
                   TAGSET_SIZE, PADDING_IDX, verbose=False, bidirectional=True)
optimiser = torch.optim.Adam(model.parameters(), lr=0.005)

In [13]:
%%time
# print predictions before training
#print_example(training_data, 123, model, token2idx, idx2tag)
logger.info("START!")
train_loss, val_loss = [], []
for epoch in range(EPOCHS):
    train_loader = batches_generator(
        BATCH_SIZE, train_tokens, train_tags, token2idx, tag2idx, seed=epoch
    )
    epoch_loss = 0
    model.train()
    for idx_batch, batch in enumerate(train_loader):
        batch_sents, batch_tags, batch_lens = batch
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        # Step 2. Run our forward pass.
        tag_scores = model(batch_sents, batch_lens)
        # Step 3. Compute the loss, gradients, and update the parameters
        loss = model.loss(tag_scores, batch_tags)
        loss.backward()
        epoch_loss += float(loss)
        clip_grad_norm_(model.parameters(), 5)
        optimiser.step()
        # disabled for now
        if (idx_batch + 1) % 970 == 0:
            logger.info(
                f'Epoch [{epoch + 1}/{EPOCHS}], '
                f"Step [{idx_batch + 1}/{len(train_tags)// BATCH_SIZE}], "
                f"Loss: {loss:.4f}"
            )

    logger.info(f"avg epoch {epoch + 1} train loss: {epoch_loss/(idx_batch + 1):.4f}")
    if ((epoch + 1) % 5) == 0:
        logger.info("**********TRAINING PERFORMANCE*********")
        train_loss.append(eval_model_for_set(
            model, train_tokens, train_tags, token2idx, tag2idx, sorted_labels
        ))
        logger.info(f"Loss: {train_loss[-1]}")
        logger.info("**********VALIDATION PERFORMANCE*********")
        val_loss.append(eval_model_for_set(
            model, val_tokens, val_tags, token2idx, tag2idx, sorted_labels
        ))
        logger.info(f"Loss: {val_loss[-1]}")

# print predictions after training
#print_example(training_data, 123, model, token2idx, idx2tag)
#print(training_data[1][123])

2019-03-10 13:05:34 - START!
2019-03-10 13:08:28 - avg epoch 1 train loss: 0.3591
2019-03-10 13:11:14 - avg epoch 2 train loss: 0.1266
2019-03-10 13:14:44 - avg epoch 3 train loss: 0.0665
2019-03-10 13:17:36 - avg epoch 4 train loss: 0.0401
2019-03-10 13:20:55 - avg epoch 5 train loss: 0.0272
2019-03-10 13:20:55 - **********TRAINING PERFORMANCE*********
2019-03-10 13:24:11 - f1 score: 0.975


              precision    recall  f1-score   support

       B-LOC      0.961     0.967     0.964      4913
       I-LOC      0.979     0.950     0.964      1891
      B-MISC      0.959     0.950     0.954      2173
      I-MISC      0.961     0.979     0.970      3212
       B-ORG      0.968     0.976     0.972      7390
       I-ORG      0.980     0.973     0.976      4992
       B-PER      0.989     0.994     0.991      4321
       I-PER      0.994     0.993     0.994      3903

   micro avg      0.974     0.976     0.975     32795
   macro avg      0.974     0.973     0.973     32795
weighted avg      0.974     0.976     0.975     32795



2019-03-10 13:24:13 - Loss: 1.587467181707325e-06
2019-03-10 13:24:13 - **********VALIDATION PERFORMANCE*********
2019-03-10 13:24:19 - f1 score: 0.719


              precision    recall  f1-score   support

       B-LOC      0.822     0.697     0.754      1084
       I-LOC      0.798     0.597     0.683       325
      B-MISC      0.625     0.546     0.583       339
      I-MISC      0.621     0.558     0.588       557
       B-ORG      0.743     0.823     0.781      1400
       I-ORG      0.691     0.832     0.755      1104
       B-PER      0.417     0.917     0.573       735
       I-PER      0.813     0.864     0.838       634

   micro avg      0.664     0.767     0.712      6178
   macro avg      0.691     0.729     0.694      6178
weighted avg      0.701     0.767     0.719      6178



2019-03-10 13:24:19 - Loss: 0.00013211334589868784


CPU times: user 31min 28s, sys: 3min 9s, total: 34min 38s
Wall time: 18min 44s


### Conclusions

Really decent, given the simplicity of the model (it's just a BiLSTM with a dense layer afterwards). Lot of overfitting

### Ideas to improve

Accuracy:
* Dropout
* Early stopping
* Fine-tunning hyperparams: learning rate (https://www.jeremyjordan.me/nn-learning-rate/), embedding and hidden dimensions
* Use trained embeddings
* CRF / CNN

Coding:
* Use `DataLoader` from Pytorch rather than `batches_generator`