# NER  Bi-LSTM




In [1]:
from google.colab import drive
drive.mount('/content/drive')

root_folder = '/data/'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!pip install conllu
!pip install torchtext --upgrade

Collecting conllu
  Downloading https://files.pythonhosted.org/packages/a8/03/4a952eb39cdc8da80a6a2416252e71784dda6bf9d726ab98065fff2aeb73/conllu-2.3.2-py2.py3-none-any.whl
Installing collected packages: conllu
Successfully installed conllu-2.3.2
Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/f2/17/e7c588245aece7aa93f360894179374830daf60d7ed0bbb59332de3b3b61/torchtext-0.6.0-py3-none-any.whl (64kB)
[K     |████████████████████████████████| 71kB 2.5MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 9.6MB/s 
Installing collected packages: sentencepiece, torchtext
  Found existing installation: torchtext 0.3.1
    Uninstalling torchtext-0.3.1:
      Successfully uninstalled torchtext-0.3.1
Successfully installed sentencepiece-0.1.91 torchtex

In [0]:
# Import libraries
import torch
from torch import nn
from torch.utils.data import Dataset
from torchtext import data
from torchtext.vocab import Vectors

from conllu import parse as conllu_parse
from pprint import pprint
from tqdm import tqdm
from torchtext.vocab import Vocab
from collections import Counter
import random
import numpy as np
import os

In [0]:
train_path = os.path.join(root_folder, "train.tsv")
dev_path = os.path.join(root_folder, "dev.tsv")
test_path = os.path.join(root_folder, "test.tsv")

In [0]:
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# POS Tagging Architecture

# Preprocessing

In [0]:
class POSTaggingDataset(Dataset):
  def __init__(self,
               input_file:str,
               window_size:int,
               window_shift:int=-1,
               device="cuda"):
    """
    We assume that the dataset pointed by input_file is already tokenized
    and can fit in memory.
    Args:
        input_file(str): path to load the dataset
        window_size(int): max length of a sentence in terms of number of tokens.
        window_shift(int): The number of tokens we shift the window over the sentence.
        Default value is -1 meaning that the window will be shifted by window_size
        device(str): device where to put tensors(cpu or cuda).
    """

    self.input_file = input_file
    self.window_size = window_size
    self.window_shift = window_shift if window_shift > 0 else window_size
    with open(input_file) as reader:
      # read the entire file with reader.read() and parse it
      sentences = conllu_parse(reader.read())
    self.device = device
    self.data = self.create_windows(sentences)
    self.encoded_data = None

  def index_dataset(self, l_vocabulary, l_label_vocabulary):
    self.encoded_data = list()
    for i in range(len(self.data)):
      # for each wondow
      elem = self.data[i]
      encoded_elem = torch.LongTensor(self.encode_text(elem, l_vocabulary)).to(self.device)
      # for each element d in the elem window (d is a dict with the various fields from CoNLL line)
      encoded_labels = torch.LongTensor([l_label_vocabulary[d["lemma"]] if d is not None
                                         else l_label_vocabulary["<pad>"] for d in elem]).to(self.device)
      self.encoded_data.append({"inputs":encoded_elem,
                                "outputs":encoded_labels})
    
  def create_windows(self, sentences):
    """
    Args:
        sentences(list of list of dictionary,
                    where each dictionary represents a word occurence parsed from CoNLL line)
    """
    data = []
    for sentence in sentences:
      for d in sentence:
        d["form"] = d["form"].lower() if d["lemma"] == 'O' else d["form"]
      for i in range(0, len(sentence), self.window_shift):
        window = sentence[i:i+self.window_size]
        if len(window) < self.window_size:
          window = window + [None]*(self.window_size - len(window))
        assert len(window) == self.window_size
        data.append(window)
    return data

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    if self.encoded_data is None:
      raise RuntimeError("""Trying to retrieve elements but index_dataset
            has not been invoked yet! Be sure to invoce index_dataset on this object
            before trying to retrieve elements. In case you want to retrieve raw
            elements, use the method get_raw_element(idx)""")
    return self.encoded_data[idx]
  
  def get_raw_element(self, idx):
    return self.data[idx]
  
  @staticmethod
  def encode_text(sentence:list,
                  l_vocabulary:Vocab):
    """
    Args:
        sentences(list): list of OrderedDict, each carying the information about one token.
        l_vocabulary(Vocab): vocabulary with mappings from words to inidices and viceversa
    Return:
        The method returns a list indices corresponding to the input tokens.
    """
    indices = list()
    for w in sentence:
      if w is None:
        indices.append(l_vocabulary["<pad>"])
      elif w["form"] in l_vocabulary.stoi: # vocab str to int
        indices.append(l_vocabulary[w["form"]])
      else:
        indices.append(l_vocabulary.unk_index)
    return indices
  
  @staticmethod
  def decode_output(outputs:torch.Tensor,
                    l_label_vocabulary: Vocab):
    """
    Args:
        outputs(Tensor): a Tensor woth shape(batch_size, max_len, label_vocab_size)
          containing the logits outputed by the neural network.
        l_label_vocabulary(Vocab): is the vocabulary containing the mapping from
          a string label to its corresponding index and vicebversa
        Output:
          The method returns a list of batch_size length where each element is a list
          of labels, one for each input token.
    """
    max_indices = torch.argmax(outputs, -1).tolist() # shape = (batch_size, max_len)
    prediction = list()
    for indices in max_indices:
      # Vocab integer to str is used to obtain the corresponding word from the max index
      prediction.append([l_label_vocabulary.itos[i] for i in indices])
    return prediction


In [7]:
# Create simple dataset instace for testing purpose

def test_dataset_class():
  window_size, window_shift = 30, 30
  dataset = POSTaggingDataset(test_path, window_size, window_shift)

  print('Dataset test:')
  for i in range(10):
    print(' sample {}: {}'.format(i, [t["form"] + ":" + t["lemma"] for t in dataset.get_raw_element(i) if t is not None]))

test_dataset_class()

Dataset test:
 sample 0: ['however:O', ',:O', 'on:O', 'may:O', '8th:O', ',:O', '2010:O', ',:O', 'a:O', 'sighting:O', 'of:O', 'a:O', 'gray:O', 'whale:O', 'was:O', 'confirmed:O', 'off:O', 'the:O', 'coast:O', 'of:O', 'Israel:LOC', 'in:O', 'the:O', 'Mediterranean:LOC', 'Sea:LOC', '.:O', ',:O', 'leading:O', 'some:O', 'scientists:O']
 sample 1: ['to:O', 'think:O', 'they:O', 'might:O', 'be:O', 'repopulating:O', 'old:O', 'breeding:O', 'grounds:O', 'that:O', 'have:O', 'not:O', 'been:O', 'used:O', 'for:O', 'centuries:O', '.:O']
 sample 2: ['the:O', 'plot:O', 'focuses:O', 'on:O', 'a:O', 'brutal:O', 'and:O', 'cunning:O', 'group:O', 'of:O', 'recently:O', 'escaped:O', 'replicants:O', 'hiding:O', 'in:O', 'L.A.:LOC', 'and:O', 'the:O', 'semi-retired:O', 'blade:O', 'runner:O', ',:O', 'rick:O', 'deckard:O', ',:O', 'who:O', 'reluctantly:O', 'agrees:O', 'to:O', 'take:O']
 sample 3: ['on:O', 'one:O', 'more:O', 'assignment:O', 'to:O', 'hunt:O', 'them:O', 'all:O', 'down:O', ',:O', 'while:O', 'searching:O', 'f

Build Vocab and label Vocab using TorchText

In [8]:
def build_vocab (dataset, min_freq=1):
  counter = Counter()
  for i in tqdm(range(len(dataset))):
    # for each token in the sentence viewed as a dictionary of items for the CoNLL line
    for token in dataset.get_raw_element(i):
      if token is not None:
        counter[token["form"]]+=1
  # we add special tokens for handling padding and unk words at testing time
  return Vocab(counter, specials=['<pad>', '<unk>'], min_freq=min_freq)

def build_label_vocab(dataset):
  counter = Counter()
  for i in tqdm(range(len(dataset))):
    for token in dataset.get_raw_element(i):
      if token is not None:
        counter[token["lemma"]]+=1
  # no unk token for label
  return Vocab(counter, specials=['<pad>'])
  
window_size, window_shift = 100,100
dataset = POSTaggingDataset(train_path, window_size,window_shift)
vocabulary = build_vocab(dataset, min_freq=2)
label_vocabulary = build_label_vocab(dataset)
dataset.index_dataset(vocabulary, label_vocabulary)

100%|██████████| 100042/100042 [00:01<00:00, 50373.69it/s]
100%|██████████| 100042/100042 [00:01<00:00, 84269.18it/s]


# Model Defenition
## Embeeding Layer

# LSTM

In [9]:
lstm =nn.LSTM(10,5)
lstm

LSTM(10, 5)

# Linear Classifier

In [10]:
classifier = nn.Linear(5, len(label_vocabulary))
classifier

Linear(in_features=5, out_features=5, bias=True)

In [0]:
class POSBiLstmModel(nn.Module):
  # we provide hyperparameters as input
  def __init__(self, hparams):
    super(POSBiLstmModel, self).__init__()
    # Embedding layer: a matrix vocab size x_embedding_dim where each index
    # correspond to a word in the vocabulary and the i-th row corresponds to 
    # a latent representation of the i-th word in the vocab.
    pprint(params)
    self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)
    if hparams.embeddings is not None:
      print("initializing embeddings from pretrained")
      self.word_embedding.weight.data.copy_(hparams.embeddings)

    # LSTM layer: an LSTM nn that process the input text
    # (encoded wiith word embeddings) from left to right as outputs
    # a new **contextual** representation of each word that depend
    # on the preciding words.
    self.lstm = nn.LSTM(hparams.embedding_dim, hparams.hidden_dim,
                        bidirectional = hparams.bidirectional,
                        num_layers = hparams.num_layers,
                        dropout = hparams.dropout if hparams.num_layers > 1 else 0)
    # Hidden layer: transforms the input value/ scalar into
    # a hidden vector representation
    lstm_output_dim = hparams.hidden_dim if hparams.bidirectional is False else hparams.hidden_dim * 2

    # During training, randomly zeroes some of the elements of the input tensor
    # with probability hparams.dropouts using samples from a Bernoulli
    # distribution. Each channel will be zeroed out independently on every fwd call.
    # This has proven to be an effective technique for regularization and preventing
    # the co-adaption of neurons
    self.dropout = nn.Dropout(hparams.dropout)
    self.classifier = nn.Linear(lstm_output_dim, hparams.num_classes)

  def forward(self, x):
    embeddings = self.word_embedding(x)
    embeddings = self.dropout(embeddings)
    o, (h, c) = self.lstm(embeddings)
    o = self.dropout(o)
    output = self.classifier(o)
    return output


# Model Building

In [0]:
class HParams():
  vocab_size = len(vocabulary)
  hidden_dim = 128
  embedding_dim = 100
  num_classes = len(label_vocabulary) # num of different  universal POS tagging
  bidirectional = True
  num_layers = 1
  dropout = 0.0
  embeddings = None
params = HParams()

In [16]:
PosBilstm = POSBiLstmModel(params).cuda()
PosBilstm

<__main__.HParams object at 0x7f64dae74fd0>


POSBiLstmModel(
  (word_embedding): Embedding(50201, 100)
  (lstm): LSTM(100, 128, bidirectional=True)
  (dropout): Dropout(p=0.0, inplace=False)
  (classifier): Linear(in_features=256, out_features=5, bias=True)
)

In [18]:
encoded_input = [x["inputs"] for x in dataset[:10]]
logits = PosBilstm(torch.stack(encoded_input, 0).cuda())
labels = POSTaggingDataset.decode_output(logits, label_vocabulary)
for i in range(2):
  i_sentence = dataset.get_raw_element(i)
  i_labels = labels[i]
  pprint(list(zip([w["form"] for w in i_sentence if w is not None], i_labels)))


[('Burgdorf', 'ORG'),
 ('had', 'O'),
 ('brought', 'ORG'),
 ('a', 'PER'),
 ('capsule', 'ORG'),
 ('of', 'PER'),
 ('cyanide', '<pad>'),
 ('for', 'LOC'),
 ('the', 'ORG'),
 ('occasion', '<pad>'),
 ('.', 'PER')]
[('Terry', 'ORG'),
 ('Daniher', 'PER'),
 ('and', 'ORG'),
 ('his', 'O'),
 ('brother', 'PER'),
 ('Neale', 'PER'),
 ('would', 'O'),
 ('come', 'LOC'),
 ('via', 'LOC'),
 ('a', 'O'),
 ('trade', 'PER'),
 ('with', 'O'),
 ('South', 'O'),
 ('Melbourne', 'O'),
 (',', 'PER'),
 ('and', 'O'),
 ('Roger', 'PER'),
 ('Merrett', 'PER'),
 ('joined', 'PER'),
 ('soon', 'PER'),
 ('afterwards', 'LOC'),
 ('to', '<pad>'),
 ('form', 'LOC'),
 ('the', 'O'),
 ('nucleus', 'O'),
 ('of', 'PER'),
 ('what', 'PER'),
 ('would', '<pad>'),
 ('become', 'PER'),
 ('the', 'ORG'),
 ('formidable', 'LOC'),
 ('Essendon', 'LOC'),
 ('sides', '<pad>'),
 ('of', 'LOC'),
 ('the', 'ORG'),
 ('1980s', 'LOC'),
 ('.', 'PER')]


# Model Training

In [0]:
class Trainer():
  """utility class to train and evaluate a model."""

  def __init__(self,
               model: nn.Module,
               loss_function,
               optimizer,
               label_vocab: Vocab,
               log_steps: int=10_000,
               log_level:int=2):
    """
    Args:
        model: medel we want to train.
        loss_function: the loss_function to be minimized.
        optimizer: to minimize loss function.
    """
    self.model = model
    self.loss_function = loss_function
    self.optimizer = optimizer

    self.label_vocab = label_vocab
    self.log_steps = log_steps
    self.log_level = log_level
  
  def train(self, train_dataset:Dataset,
            valid_dataset:Dataset,
            epochs:int=1):
    """
    Args:
        train_dataset: a Dataset or DataLoader instance containing the training instance.
        valid_dataset: used to evaluate learning progress.
        ephocs: the number of times to iterate over train_dataset.

    Returns:
        avg_train_loss: avg training loss on train_dataset over epochs.
    """
    assert epochs > 1 and isinstance(epochs, int)
    if self.log_level > 0:
      print("Training...")
    train_loss = 0.0
    for epoch in range(epochs):
      if self.log_level > 0:
        print("Epoch {:03d}".format(epoch + 1))

      epoch_loss = 0.0
      self.model.train()

      for step, sample in enumerate(train_dataset):
        inputs = sample['inputs']
        labels = sample['outputs']
        self.optimizer.zero_grad()

        predictions = self.model(inputs)
        predictions = predictions.view(-1, predictions.shape[-1])
        labels = labels.view(-1)

        sample_loss = self.loss_function(predictions, labels)
        sample_loss.backward()
        self.optimizer.step()

        epoch_loss += sample_loss.tolist()

        if self.log_level > 1 and step % self.log_steps == self.log_steps -1:
          print('\t[E: {:2d} 0 step{}] curret avg_loss = {:0.4f}'.format(epoch, step, epoch_loss / (step + 1)))
      
      avg_epoch_loss = epoch_loss / len(train_dataset)
      train_loss += avg_epoch_loss
      if self.log_level > 0:
        print('\t[E: {:2d}] train loss = {:0.4f}'.format(epoch, avg_epoch_loss))

      valid_loss = self.evaluate(valid_dataset)
      if self.log_level > 0:
        print('  [E: {:2d}] valid loss = {:0.4f}'.format(epoch, valid_loss))
      
    if self.log_level > 0:
      print('..... Done!')

    avg_epoch_loss = train_loss / epochs
    return avg_epoch_loss
  
  def evaluate(self, valid_dataset):
    """
    Args:
        valid_dataset: dataset for evaluation.
    Returns:
        avg_valid_loss: the average validation loss over valid_dataset.
    """
    valid_loss = 0.0
    # set the dropout to 0!! Nedded when we are in inference mode.
    self.model.eval()
    with torch.no_grad():
      for sample in valid_dataset:
        inputs = sample['inputs']
        labels = sample['outputs']

        predictions = self.model(inputs)
        predictions = predictions.view(-1, predictions.shape[-1])
        labels = labels.view(-1)
        sample_loss = self.loss_function(predictions, labels)
        valid_loss += sample_loss.tolist()
    return valid_loss / len(valid_dataset)
  
  def predict(self, x):
    """
    Args: 
        x: a tensor of indices.
    Returns:
        A list containing the predicted POS tag for each token in input sentences
    """
    self.model.eval()
    with torch.no_grad():
      logits = self.model(x)
      predictions = torch.argmax(logits, -1)
      return logits, predictions

Define dataset

In [20]:
from torch.utils.data import DataLoader
import torch.optim as optim
window_size, window_shift = 100, 100
device = "cuda"
trainingset = POSTaggingDataset(train_path, window_size, window_shift, device=device)
devset = POSTaggingDataset(dev_path, window_size, window_shift, device=device)
testset = POSTaggingDataset(test_path, window_size, window_shift, device=device)

trainingset.index_dataset(vocabulary, label_vocabulary)
devset.index_dataset(vocabulary, label_vocabulary)
testset.index_dataset(vocabulary, label_vocabulary)

train_dataset = DataLoader(trainingset, batch_size = 128)
valid_dataset = DataLoader(devset, batch_size = 128)
test_dataset = DataLoader(testset, batch_size = 128)

PosBilstm = POSBiLstmModel(params).cuda()

<__main__.HParams object at 0x7f64dae74fd0>


set up a trainer

In [0]:
trainer = Trainer(
    model = PosBilstm,
    loss_function = nn.CrossEntropyLoss(ignore_index=label_vocabulary['<pad>']),
    optimizer = optim.Adam(PosBilstm.parameters()),
    label_vocab=label_vocabulary
)

In [22]:
trainer.train(train_dataset, valid_dataset, 10)

Training...
Epoch 001
	[E:  0] train loss = 0.2824
  [E:  0] valid loss = 0.1718
Epoch 002
	[E:  1] train loss = 0.1392
  [E:  1] valid loss = 0.1224
Epoch 003
	[E:  2] train loss = 0.1040
  [E:  2] valid loss = 0.1049
Epoch 004
	[E:  3] train loss = 0.0878
  [E:  3] valid loss = 0.0975
Epoch 005
	[E:  4] train loss = 0.0790
  [E:  4] valid loss = 0.0943
Epoch 006
	[E:  5] train loss = 0.0735
  [E:  5] valid loss = 0.0935
Epoch 007
	[E:  6] train loss = 0.0693
  [E:  6] valid loss = 0.0942
Epoch 008
	[E:  7] train loss = 0.0652
  [E:  7] valid loss = 0.0962
Epoch 009
	[E:  8] train loss = 0.0608
  [E:  8] valid loss = 0.0993
Epoch 010
	[E:  9] train loss = 0.0559
  [E:  9] valid loss = 0.1037
..... Done!


0.10171731371108603

In [23]:
test_set_loss = trainer.evaluate(test_dataset)
print("test set loss: {}".format(test_set_loss))

test set loss: 0.1020301320582382


In [24]:
def print_outputs(l_trainer, l_testset, num_outputs, l_vocabulary, l_label_vocabulary):

  for i in range(num_outputs):
    print("sentence {}".format(i))
    print()
    test_elem = l_testset[i]

    test_x, test_y = test_elem["inputs"], test_elem["outputs"]

    logits, predictions = l_trainer.predict(test_x.unsqueeze(0))

    decoded_labels = POSTaggingDataset.decode_output(logits, l_label_vocabulary)[0]
    test_y = test_y.tolist()
    print("token\t\tinput\t\tgold\t\tprediction")
    print("-"*100)
    for raw_elem, idx, label, predicted_label in zip(l_testset.get_raw_element(i), test_x.tolist(), test_y, decoded_labels):
      if idx == 0:
        break
      print("{}\t\t{}\t\t{}\t\t{}".format(raw_elem["form"], l_vocabulary.itos[idx], l_label_vocabulary.itos[label], predicted_label))
    print("="*30)

print_outputs(trainer, testset, 3, vocabulary, label_vocabulary)

sentence 0

token		input		gold		prediction
----------------------------------------------------------------------------------------------------
however		however		O		O
,		,		O		O
on		on		O		O
may		may		O		O
8th		8th		O		O
,		,		O		O
2010		2010		O		O
,		,		O		O
a		a		O		O
sighting		sighting		O		O
of		of		O		O
a		a		O		O
gray		gray		O		O
whale		whale		O		O
was		was		O		O
confirmed		confirmed		O		O
off		off		O		O
the		the		O		O
coast		coast		O		O
of		of		O		O
Israel		Israel		LOC		LOC
in		in		O		O
the		the		O		O
Mediterranean		Mediterranean		LOC		LOC
Sea		Sea		LOC		LOC
.		.		O		O
,		,		O		O
leading		leading		O		O
some		some		O		O
scientists		scientists		O		O
to		to		O		O
think		think		O		O
they		they		O		O
might		might		O		O
be		be		O		O
repopulating		<unk>		O		O
old		old		O		O
breeding		breeding		O		O
grounds		grounds		O		O
that		that		O		O
have		have		O		O
not		not		O		O
been		been		O		O
used		used		O		O
for		for		O		O
centuries		centuries		O		O
.		.		O		O
sentence 1

token		input		gold		

In [0]:
from sklearn.metrics import precision_score as sk_precision
from sklearn.metrics import recall_score, f1_score
def evaluation(model:nn.Module, l_dataset:DataLoader, l_label_vocab:Vocab):
  all_predictions = list()
  all_labels = list()
  for indexed_elem in l_dataset:
    indexed_in = indexed_elem["inputs"]
    indexed_labels = indexed_elem["outputs"]
    predictions = model(indexed_in)
    predictions = torch.argmax(predictions, -1).view(-1)
    labels = indexed_labels.view(-1)
    valid_indices = labels != 0

    valid_predictions = predictions[valid_indices]
    valid_labels = labels[valid_indices]

    all_predictions.extend(valid_predictions.tolist())
    all_labels.extend(valid_labels.tolist())
  precision = sk_precision(all_labels, all_predictions, average="macro")
  recall = recall_score(all_labels, all_predictions, average="macro")
  f1 = f1_score(all_labels, all_predictions, average="macro")
  return{"precision":precision,
         "recall":recall,
         "f1":f1
  }

In [27]:
evaluate = evaluation(PosBilstm, test_dataset, label_vocabulary)
print(evaluate)

{'precision': 0.8675922382923663, 'recall': 0.785131802067354, 'f1': 0.8224439846909528}
