In [1]:
import pandas as pd, numpy as np
import logging
%load_ext autoreload
%autoreload 2
from IPython.core.debugger import set_trace
import os, sys
from IPython.core.display import display, HTML
# Widen the notebook container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Root logging: INFO level, each message prefixed with a timestamp.
logging.basicConfig(
    format="%(asctime)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)
# Logger used by the cells below for progress messages.
logger = logging.getLogger(__name__)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### $(token, pos, tag)^N$ --> $(set\_tokens, set\_pos, set\_tags)$

In [2]:
%%time
from data_preparation import ConLL2002DataSet

CPU times: user 1.06 s, sys: 348 ms, total: 1.41 s
Wall time: 1.62 s


In [3]:
%%time
# Each split is unpacked into three parallel collections: tokens, POS tags and
# entity tags (one entry per sentence, as the preview cell below indexes them).
(train_tokens, train_pos, train_tags) = (
    ConLL2002DataSet("esp.train").get_tokens_tags_from_sents()
)
# NOTE(review): in CoNLL-2002, esp.testb is the test split and esp.testa the
# development split -- confirm that validating on esp.testb is intended.
(val_tokens, val_pos, val_tags) = (
    ConLL2002DataSet("esp.testb").get_tokens_tags_from_sents()
)

CPU times: user 1.55 s, sys: 131 ms, total: 1.68 s
Wall time: 1.7 s


Always inspect the data you are working with. Running the following cell shows the
tokens and tags of the first training sentence:

In [4]:
# Show sentence `idx` as a two-row table: row 0 = tokens, row 1 = tags.
idx = 0
pd.DataFrame([sequences[idx] for sequences in (train_tokens, train_tags)])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,Melbourne,(,Australia,),",",25,may,(,EFE,),.
1,B-LOC,O,B-LOC,O,O,O,O,O,B-ORG,O,O


### Prepare mappings

A neural network needs to work with word indices, not text. Therefore, we first need
to learn the vocabulary of tokens and tags. This is done by fitting the `Vectorizer`,
which is then used to transform the datasets into VectorizedDataset objects.

Some special tokens in the vocabulary:
 - `<PAD>` token for padding sentences to the same length when we create batches of
 sentences; index = 0
 - `<UNK>` token for out-of-vocabulary tokens; index = 1
 - `<START>`; index = 2 (not used here, because `use_start_end=False`)
 - `<END>`; index = 3 (not used here, because `use_start_end=False`; as the output
 below shows, regular tokens therefore start at index 2)

In [5]:
from data_preparation import Vectorizer

# Learn the token and tag vocabularies from the training split only, then
# encode both splits with that same fitted vectorizer.
vectorizer = Vectorizer(use_pad=True, use_start_end=False)
vectorizer.fit(train_tokens, train_tags)
train_data, val_data = (
    vectorizer.transform(tokens, tags)
    for tokens, tags in ((train_tokens, train_tags), (val_tokens, val_tags))
)

In [6]:
# Round-trip check on the first training sentence: raw tokens, their encoded
# indices, and the indices mapped back through the word vocabulary.
print(train_tokens[0], train_data.input[0], sep="\n")
vectorizer.map_sequence_back(vectorizer.word_vocab, train_data.input[0])

('Melbourne', '(', 'Australia', ')', ',', '25', 'may', '(', 'EFE', ')', '.')
tensor([ 2,  3,  4,  5,  6,  7,  8,  3,  9,  5, 10])


array(['Melbourne', '(', 'Australia', ')', ',', '25', 'may', '(', 'EFE',
       ')', '.'], dtype='<U9')

In [7]:
# Same round-trip check for the tags of the first training sentence, using
# the tag vocabulary.
print(train_tags[0], train_data.target[0], sep="\n")
vectorizer.map_sequence_back(vectorizer.tag_vocab, train_data.target[0])

('B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O')
tensor([2, 1, 2, 1, 1, 1, 1, 1, 3, 1, 1])


array(['B-LOC', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O'],
      dtype='<U5')

### Generate batches

Neural networks are usually trained with batches: each weight update of the
network is computed from several sequences at once. The tricky part is that
all sequences within a batch need to have the same length, so we pad them
with a special `<PAD>` token. It is also good practice to provide the RNN
with the sequence lengths, so it can skip computations for the padded parts.
A batching function (*batches_generator*) is provided to save time; note that
the training cell below builds its batches with a PyTorch `DataLoader`, using
`pad_and_sort_batch` as the collate function.

### Model

In [8]:
import torch
from torch import nn
from torch import optim
from torch.nn.utils import clip_grad_norm_
# Seed torch's global random number generator (presumably so that runs are
# repeatable -- the training cell still carries a TODO about seeding).
torch.manual_seed(1)
from models import CRFTagger
from evaluation import eval_model_for_set
from torch.utils.data import DataLoader
from data_preparation import pad_and_sort_batch

## Set hyperparams and train the model

In [9]:
# Optimisation settings.
EPOCHS = 5
BATCH_SIZE = 32
LEARNING_RATE = 0.005

# Reporting cadence used by the training loop: a progress line every N batches
# and an evaluation pass every N epochs.
PRINT_EVERY_NBATCHES = 100
PRINT_EVERY_NEPOCHS = 1

# Settings handed to CRFTagger; vocabulary and tag-set sizes are taken from
# the fitted vectorizer.
lstm_args = dict(
    use_lstm=True,
    embedding_dim=200,
    hidden_dim=200,
    vocab_size=len(vectorizer.word_vocab),
    tagset_size=len(vectorizer.tag_vocab),
    bidirectional=True,
)
model = CRFTagger(lstm_args)
optimiser = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [10]:
%%time
logger.info("START!")
# Per-evaluation history: one entry is appended every PRINT_EVERY_NEPOCHS epochs.
train_loss, val_loss = [], []

# A single loader is reused for all epochs: with shuffle=True the DataLoader
# draws a new permutation each time it is iterated, so it does not need to be
# rebuilt inside the epoch loop.
# TODO: review how to set the seed
train_loader = DataLoader(
    train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_and_sort_batch
)
# Actual number of steps per epoch. Unlike len(train_tags) // BATCH_SIZE this
# also counts the final, smaller batch, so the "Step [i/N]" log stays accurate.
n_batches = len(train_loader)

for epoch in range(EPOCHS):
    epoch_loss = 0.0
    model.train()
    for idx_batch, batch in enumerate(train_loader):
        batch_sents, batch_tags, batch_lens = batch
        # Step 1. PyTorch accumulates gradients, so clear them before each batch.
        model.zero_grad()
        # Step 2. Run the forward pass.
        tag_scores, mask = model(batch_sents, batch_lens)
        # Step 3. Compute the loss and back-propagate it.
        loss = model.loss(tag_scores, mask, batch_tags)
        loss.backward()
        epoch_loss += loss.item()
        # Step 4. Clip the gradient norm to 5, then update the parameters.
        clip_grad_norm_(model.parameters(), 5)
        optimiser.step()
        # Periodic progress line with the loss of the current batch.
        if (idx_batch + 1) % PRINT_EVERY_NBATCHES == 0:
            logger.info(
                f"Epoch [{epoch + 1}/{EPOCHS}], "
                f"Step [{idx_batch + 1}/{n_batches}], "
                f"Loss: {loss.item():.4f}"
            )

    # Mean of the per-batch losses; max() guards against an empty loader.
    logger.info(f"avg epoch {epoch + 1} train loss: {epoch_loss / max(n_batches, 1):.4f}")
    if ((epoch + 1) % PRINT_EVERY_NEPOCHS) == 0:
        # eval_model_for_set is a project helper; whatever it returns is
        # recorded and logged as the loss for that split.
        logger.info("**********TRAINING PERFORMANCE*********")
        train_loss.append(eval_model_for_set(model, train_data, vectorizer, True))
        logger.info(f"Loss: {train_loss[-1]}")
        logger.info("**********VALIDATION PERFORMANCE*********")
        val_loss.append(eval_model_for_set(model, val_data, vectorizer, True))
        logger.info(f"Loss: {val_loss[-1]}")

2019-05-03 11:14:05 - START!
2019-05-03 11:14:57 - Epoch [1/5], Step [100/260], Loss: 204.7820
2019-05-03 11:15:59 - Epoch [1/5], Step [200/260], Loss: 58.8843
2019-05-03 11:16:40 - avg epoch 1 train loss: 253.9890
2019-05-03 11:16:40 - **********TRAINING PERFORMANCE*********


KeyboardInterrupt: 

### Ideas to improve

Accuracy:
* Dropout
* Early stopping
* Fine-tuning hyperparams: learning rate (https://www.jeremyjordan.me/nn-learning-rate/), embedding and hidden dimensions
* Use trained embeddings / hand-crafted features
* CNN

Speed:
* _viterbi_decode_nbest vs _viterbi_decode when nbest=1

Coding:
* Clean NCRF++ implementation, probably more efficient