# Named Entity Recognition with PyTorch

## Data

In [1]:
import nltk

In [2]:
train_sents = list(nltk.corpus.conll2002.iob_sents('ned.train'))
dev_sents = list(nltk.corpus.conll2002.iob_sents('ned.testa'))
test_sents = list(nltk.corpus.conll2002.iob_sents('ned.testb'))

train_sents[:3]

[[('De', 'Art', 'O'),
  ('tekst', 'N', 'O'),
  ('van', 'Prep', 'O'),
  ('het', 'Art', 'O'),
  ('arrest', 'N', 'O'),
  ('is', 'V', 'O'),
  ('nog', 'Adv', 'O'),
  ('niet', 'Adv', 'O'),
  ('schriftelijk', 'Adj', 'O'),
  ('beschikbaar', 'Adj', 'O'),
  ('maar', 'Conj', 'O'),
  ('het', 'Art', 'O'),
  ('bericht', 'N', 'O'),
  ('werd', 'V', 'O'),
  ('alvast', 'Adv', 'O'),
  ('bekendgemaakt', 'V', 'O'),
  ('door', 'Prep', 'O'),
  ('een', 'Art', 'O'),
  ('communicatiebureau', 'N', 'O'),
  ('dat', 'Conj', 'O'),
  ('Floralux', 'N', 'B-ORG'),
  ('inhuurde', 'V', 'O'),
  ('.', 'Punc', 'O')],
 [('In', 'Prep', 'O'),
  ("'81", 'Num', 'O'),
  ('regulariseert', 'V', 'O'),
  ('de', 'Art', 'O'),
  ('toenmalige', 'Adj', 'O'),
  ('Vlaamse', 'Adj', 'B-MISC'),
  ('regering', 'N', 'O'),
  ('de', 'Art', 'O'),
  ('toestand', 'N', 'O'),
  ('met', 'Prep', 'O'),
  ('een', 'Art', 'O'),
  ('BPA', 'N', 'B-MISC'),
  ('dat', 'Pron', 'O'),
  ('het', 'Art', 'O'),
  ('bedrijf', 'N', 'O'),
  ('op', 'Prep', 'O'),
  ('eigen', 

In [3]:
from torchtext.data import Example
from torchtext.data import Field, Dataset

token_field = Field(init_token="<BOS>", eos_token="<EOS>")
label_field = Field(init_token="<BOS>", eos_token="<EOS>")

text_field = Field(sequential=True, tokenize=lambda x:x) # Default behaviour is to tokenize by splitting
label_field = Field(sequential=True, tokenize=lambda x:x, is_target=True)

def read_data(sentences):
    examples = []
    fields = {'sentence_labels': ('labels', label_field),
              'sentence_tokens': ('text', text_field)}
    
    for sentence in sentences: 
        tokens = [t[0] for t in sentence]
        labels = [t[2] for t in sentence]
        
        e = Example.fromdict({"sentence_labels": labels, "sentence_tokens": tokens},
                             fields=fields)
        examples.append(e)
    
    return Dataset(examples, fields=[('labels', label_field), ('text', text_field)])

train_data = read_data(train_sents)
dev_data = read_data(dev_sents)
test_data = read_data(test_sents)

print(train_data.fields)
print(train_data[0].text)
print(train_data[0].labels)

print("Train:", len(train_data))
print("Dev:", len(dev_data))
print("Test:", len(test_data))

{'labels': <torchtext.data.field.Field object at 0x1a16698518>, 'text': <torchtext.data.field.Field object at 0x1a16698630>}
['De', 'tekst', 'van', 'het', 'arrest', 'is', 'nog', 'niet', 'schriftelijk', 'beschikbaar', 'maar', 'het', 'bericht', 'werd', 'alvast', 'bekendgemaakt', 'door', 'een', 'communicatiebureau', 'dat', 'Floralux', 'inhuurde', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O']
Train: 15806
Dev: 2895
Test: 5195


#One index is kept for unknown words, another one for padding.

In [4]:
VOCAB_SIZE = 20000

text_field.build_vocab(train_data, max_size=VOCAB_SIZE)
label_field.build_vocab(train_data)

## Training

In [5]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The BucketIterator creates batches of similar-length examples. This minimizes the apount of padding. 

In [6]:
from torchtext.data import BucketIterator

BATCH_SIZE = 32
train_iter = BucketIterator(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
dev_iter = BucketIterator(dataset=dev_data, batch_size=BATCH_SIZE)
test_iter = BucketIterator(dataset=test_data, batch_size=BATCH_SIZE)

In [7]:
import torch.nn as nn

class BiLSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_size):
        super(BiLSTMTagger, self).__init__()
        
        # 1. Embedding Layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # 2. LSTM Layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, num_layers=2)
        
        # 3. Dense Layer
        self.hidden2tag = nn.Linear(2*hidden_dim, output_size)
        
        # Optional dropout layer
        self.dropout_layer = nn.Dropout(p=0.4)

    def forward(self, batch_text):

        embeddings = self.embeddings(batch_text)
        
        lstm_output, _ = self.lstm(embeddings)
        print("O", lstm_output.shape)
        lstm_output = self.dropout_layer(lstm_output)
        
        logits = self.hidden2tag(lstm_output)
        print("L", logits.shape)
        return logits

In [38]:
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

def train(model, train_iter, dev_iter, batch_size, num_batches):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())

    max_epochs = 20
    for epoch in range(max_epochs):

        total_loss = 0
        predictions, correct = [], []
        for batch in tqdm(train_iter, total=num_batches):
            optimizer.zero_grad()

            pred = model(batch.text.to(device)).view(batch_size*len(batch.text), NUM_CLASSES)
            gold = batch.labels.to(device).view(batch_size*len(batch.text))
            print("Pr", pred.shape)
            print("La", gold.shape)
            loss = criterion(pred, gold)
            
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

            _, pred_indices = torch.max(pred, 1)
            predictions += list(pred_indices.cpu().numpy())
            correct += list(batch.labels.cpu().numpy())

        print("=== Epoch", epoch, "===")
        print("Total training loss:", total_loss)
        print("Training performance:", precision_recall_fscore_support(correct, predictions))
        
        total_loss = 0
        predictions, correct = [], []
        for batch in dev_iter:

            pred = model(batch.text.to(device)).view(batch_size*len(batch.text), NUM_CLASSES)
            gold = batch.labels.to(device).view(batch_size*len(batch.text))
            loss = criterion(pred, gold)
            total_loss += loss.item()

            _, pred_indices = torch.max(pred, 1)
            pred_indices = list(pred_indices.cpu().numpy())
            predictions += pred_indices
            correct += list(batch.label.cpu().numpy())

        print("Total development loss:", total_loss)
        print("Development performance:", precision_recall_fscore_support(correct, predictions))

In [39]:
EMBEDDING_DIM = 300
HIDDEN_DIM = 256
NUM_CLASSES = len(label_field.vocab)
num_batches = int(len(train_data) / BATCH_SIZE)

tagger = BiLSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, VOCAB_SIZE+2, NUM_CLASSES)  

train(tagger.to(device), train_iter, dev_iter, BATCH_SIZE, num_batches)

HBox(children=(IntProgress(value=0, max=493), HTML(value='')))

O torch.Size([101, 32, 512])
L torch.Size([101, 32, 11])
Pr torch.Size([3232, 11])
La torch.Size([3232])
O torch.Size([48, 32, 512])
L torch.Size([48, 32, 11])
Pr torch.Size([1536, 11])
La torch.Size([1536])
O torch.Size([31, 32, 512])
L torch.Size([31, 32, 11])
Pr torch.Size([992, 11])
La torch.Size([992])
O torch.Size([35, 32, 512])
L torch.Size([35, 32, 11])
Pr torch.Size([1120, 11])
La torch.Size([1120])
O torch.Size([33, 32, 512])
L torch.Size([33, 32, 11])
Pr torch.Size([1056, 11])
La torch.Size([1056])
O torch.Size([53, 32, 512])
L torch.Size([53, 32, 11])
Pr torch.Size([1696, 11])
La torch.Size([1696])



KeyboardInterrupt: 