In [17]:
# One-time environment setup: uncomment and run these on a fresh machine (e.g. Colab).
# %pip install allennlp==2.5.0
# %pip install allennlp-models==2.5.0
# !git clone https://github.com/mhagiwara/realworldnlp.git
# %cd realworldnlp

In [18]:
# NOTE(review): absolute, user-specific path; change it to the location of your
# own clone of the realworldnlp repository. None of the imports visible in this
# notebook come from that repository, so this cd may be unnecessary (TODO confirm).
%cd /home/admin_paulhykim_altostrat_com/realworldnlp

/home/admin_paulhykim_altostrat_com/realworldnlp


In [19]:
import csv
from itertools import chain
from typing import Dict, List, Tuple

import numpy as np
import torch
import torch.optim as optim
from allennlp.common.file_utils import cached_path
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import TextField, SequenceLabelField
from allennlp.data.instance import Instance
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers.token_class import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
from allennlp.training import GradientDescentTrainer
from overrides import overrides

In [20]:
# Width of each token embedding vector; also the input size of the LSTM.
EMBEDDING_SIZE: int = 128
# Hidden-state size of the LSTM (per direction, since the LSTM is bidirectional).
HIDDEN_SIZE: int = 128

In [21]:
class NERDatasetReader(DatasetReader):
    """Dataset reader for a CSV-formatted, one-token-per-row NER corpus.

    The whole file is parsed eagerly in ``__init__`` and the resulting
    instances are kept in ``self.instances``. ``_read`` then serves a
    deterministic split of them: every 10th sentence belongs to ``'dev'``,
    all others to ``'train'``.

    Every data row must have exactly four columns. A non-empty first column
    marks the first token of a new sentence, the second column holds the
    token, and the fourth holds its label. The third column is ignored.
    """

    def __init__(self, file_path: str, token_indexers: Dict[str, TokenIndexer]=None):
        """Resolve ``file_path`` with ``cached_path`` and parse it into instances.

        Args:
            file_path: Location of the CSV file, passed through ``cached_path``.
            token_indexers: Indexers for the token field. Defaults to a single
                ``SingleIdTokenIndexer`` registered under ``'tokens'``.
        """
        super().__init__()
        self.token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

        self.instances = []
        file_path = cached_path(file_path)
        sentence = []
        with open(file_path, mode='r', encoding='utf-8', errors='ignore') as csv_file:
            # Skip the header line; the default keeps an empty file from raising.
            next(csv_file, None)

            for row in csv.reader(csv_file):
                if not row:
                    # csv.reader yields [] for a blank line; indexing it would raise.
                    continue

                if row[0] and sentence:
                    # A non-empty first column starts a new sentence, so the
                    # rows collected so far form a complete one.
                    self._add_sentence(sentence)
                    sentence = [row]
                else:
                    sentence.append(row)

            # The final sentence has no following row to trigger the flush.
            if sentence:
                self._add_sentence(sentence)

    def _add_sentence(self, rows: List[Tuple[str]]) -> None:
        """Convert the rows of one sentence into an Instance and store it."""
        tokens, labels = self._convert_sentence(rows)
        self.instances.append(self.text_to_instance(tokens, labels))

    @overrides
    def text_to_instance(self, tokens: List[Token], labels: List[str]=None):
        """Build an Instance with a ``'tokens'`` field and, optionally, ``'labels'``.

        The ``'labels'`` field is only added when ``labels`` is a non-empty
        sequence.
        """
        fields = {}

        text_field = TextField(tokens, self.token_indexers)
        fields['tokens'] = text_field
        if labels:
            fields['labels'] = SequenceLabelField(labels, text_field)

        return Instance(fields)

    def _convert_sentence(self, rows: List[Tuple[str]]) -> Tuple[List[Token], List[str]]:
        """Given a list of rows, returns tokens and labels."""
        # Columns 1 and 3 (sentence marker and the unused third column) are dropped.
        _, tokens, _, labels = zip(*rows)
        tokens = [Token(t) for t in tokens]

        # NOTE: the original dataset seems to confuse gpe with geo, and the distinction
        # seems arbitrary. Here we replace both with 'gpe'
        labels = [label.replace('geo', 'gpe') for label in labels]
        return tokens, labels

    @overrides
    def _read(self, split: str):
        """Yield the instances of ``split``.

        ``'dev'`` receives every instance whose position is a multiple of 10,
        ``'train'`` receives the rest; any other value yields nothing.
        """
        for i, inst in enumerate(self.instances):
            if split == 'train' and i % 10 != 0:
                yield inst
            elif split == 'dev' and i % 10 == 0:
                yield inst

In [22]:
class LstmTagger(Model):
    """Sequence tagger: embed the tokens, encode them, project to label logits.

    Token accuracy and span-based F1 (over the ``'labels'`` namespace) are
    tracked whenever gold labels are supplied to ``forward``.
    """

    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 cuda_device=-1) -> None:
        """Assemble the tagger.

        Args:
            embedder: Turns the indexed text field into embeddings.
            encoder: Sequence-to-sequence encoder applied to the embeddings.
            vocab: Vocabulary; its ``'labels'`` namespace sizes the output layer.
            cuda_device: When greater than -1, the embedder, encoder and output
                layer are moved to this device.
        """
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder

        # One output unit per entry of the 'labels' namespace.
        num_labels = vocab.get_vocab_size('labels')
        self.hidden2labels = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                             out_features=num_labels)

        self.accuracy = CategoricalAccuracy()
        self.f1 = SpanBasedF1Measure(vocab, tag_namespace='labels')

        if cuda_device > -1:
            self.embedder = self.embedder.to(cuda_device)
            self.encoder = self.encoder.to(cuda_device)
            self.hidden2labels = self.hidden2labels.to(cuda_device)

    def forward(self,
                tokens: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute label logits and, when ``labels`` is given, the loss.

        Returns:
            A dict with ``'logits'``; ``'loss'`` is added only when ``labels``
            is not None, in which case both metrics are updated as well.
        """
        padding_mask = get_text_field_mask(tokens)
        encoded = self.encoder(self.embedder(tokens), padding_mask)
        tag_logits = self.hidden2labels(encoded)

        # Inference path: nothing to score against.
        if labels is None:
            return {'logits': tag_logits}

        self.accuracy(tag_logits, labels, padding_mask)
        self.f1(tag_logits, labels, padding_mask)
        loss = sequence_cross_entropy_with_logits(tag_logits, labels, padding_mask)
        return {'logits': tag_logits, 'loss': loss}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return accuracy plus overall span precision, recall and F1."""
        span_scores = self.f1.get_metric(reset)
        metrics = {'accuracy': self.accuracy.get_metric(reset)}
        metrics['prec'] = span_scores['precision-overall']
        metrics['rec'] = span_scores['recall-overall']
        metrics['f1'] = span_scores['f1-measure-overall']
        return metrics

In [23]:
# Location of the entity-annotated NER corpus; the reader parses it on construction.
NER_DATASET_URL = 'https://s3.amazonaws.com/realworldnlpbook/data/entity-annotated-corpus/ner_dataset.csv'
reader = NERDatasetReader(NER_DATASET_URL)

In [24]:
# A single bucket sampler (batches of 16, sorted on the "tokens" field) is shared
# by the train and dev loaders.
sampler = BucketBatchSampler(sorting_keys=["tokens"], batch_size=16)
train_data_loader = MultiProcessDataLoader(reader=reader,
                                           data_path='train',
                                           batch_sampler=sampler)
dev_data_loader = MultiProcessDataLoader(reader=reader,
                                         data_path='dev',
                                         batch_sampler=sampler)

loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

In [25]:
# The vocabulary is built from the train instances followed by the dev instances.
vocab = Vocabulary.from_instances(
    chain.from_iterable(loader.iter_instances()
                        for loader in (train_data_loader, dev_data_loader)))

building vocab: 0it [00:00, ?it/s]

In [26]:
# Attach the vocabulary to both loaders, train first, then dev.
for _loader in (train_data_loader, dev_data_loader):
    _loader.index_with(vocab)

In [27]:
def predict(tokens: List[str], model: LstmTagger) -> List[str]:
    """Tag a pre-tokenized sentence with ``model``.

    Args:
        tokens: The sentence as a list of token strings.
        model: Tagger whose vocabulary provides the ``'labels'`` namespace.

    Returns:
        One predicted label string per input token; an empty list when
        ``tokens`` is empty.
    """
    if not tokens:
        # Nothing to tag; avoid pushing a zero-length sequence through the model.
        return []

    token_indexers = {'tokens': SingleIdTokenIndexer()}
    text_field = TextField([Token(t) for t in tokens], token_indexers)
    inst = Instance({'tokens': text_field})
    logits = model.forward_on_instance(inst)['logits']
    # Assumes logits is (num_tokens, num_labels): pick the best label id per token.
    label_ids = np.argmax(logits, axis=1)
    return [model.vocab.get_token_from_index(label_id, 'labels')
            for label_id in label_ids]

In [28]:
# One trainable EMBEDDING_SIZE-dimensional vector per entry of the 'tokens' namespace.
token_embedding = Embedding(embedding_dim=EMBEDDING_SIZE,
                            num_embeddings=vocab.get_vocab_size('tokens'))
# The key must match the token indexer name used by the dataset reader.
word_embeddings = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedding})

In [29]:
# Bidirectional, batch-first LSTM wrapped so it can be used as a Seq2SeqEncoder.
lstm = PytorchSeq2SeqWrapper(
    module=torch.nn.LSTM(input_size=EMBEDDING_SIZE,
                         hidden_size=HIDDEN_SIZE,
                         batch_first=True,
                         bidirectional=True))

In [30]:
# Use the first GPU when CUDA is available; otherwise pass -1, which LstmTagger
# treats as "leave the sub-modules where they are" (CPU).
model = LstmTagger(word_embeddings, lstm, vocab,
                   cuda_device=0 if torch.cuda.is_available() else -1)

In [31]:
# Adam with its default hyper-parameters over every parameter of the model.
optimizer = optim.Adam(params=model.parameters())

In [32]:
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=20,
    # Train on the first GPU when CUDA is available; -1 selects the CPU.
    cuda_device=0 if torch.cuda.is_available() else -1)

# Returns (and, as the cell's last expression, displays) the final metrics dict.
trainer.train()

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

  0%|          | 0/2698 [00:00<?, ?it/s]

  0%|          | 0/300 [00:00<?, ?it/s]

{'best_epoch': 1,
 'peak_worker_0_memory_MB': 5912.12890625,
 'peak_gpu_0_memory_MB': 112.1611328125,
 'training_duration': '0:06:01.571743',
 'epoch': 11,
 'training_accuracy': 0.996590541879228,
 'training_prec': 0.9733036817395602,
 'training_rec': 0.9774773433389913,
 'training_f1': 0.9753860478015329,
 'training_loss': 0.009491482226074434,
 'training_worker_0_memory_MB': 5912.12890625,
 'training_gpu_0_memory_MB': 112.1611328125,
 'validation_accuracy': 0.9638880931920175,
 'validation_prec': 0.7816926036020283,
 'validation_rec': 0.802027269465375,
 'validation_f1': 0.7917293898874911,
 'validation_loss': 0.1795651229872601,
 'best_validation_accuracy': 0.9699322066265635,
 'best_validation_prec': 0.8283081327726581,
 'best_validation_rec': 0.8304628632938643,
 'best_validation_f1': 0.8293840985441829,
 'best_validation_loss': 0.09080706231839333}

In [33]:
# Example sentence, already split into tokens.
tokens = ['Apple', 'is', 'looking', 'to', 'buy', 'U.K.', 'startup', 'for', '$1', 'billion', '.']
labels = predict(tokens, model)
# Show the result as space-separated token/label pairs.
print(' '.join(f'{token}/{label}' for token, label in zip(tokens, labels)))

Apple/B-org is/O looking/O to/O buy/O U.K./O startup/O for/O $1/O billion/O ./O
