In [17]:
# One-time environment setup; uncomment and run these on a fresh machine.
# %pip install allennlp==2.5.0
# %pip install allennlp-models==2.5.0
# !git clone https://github.com/mhagiwara/realworldnlp.git
# %cd realworldnlp

In [18]:
# NOTE(review): machine-specific absolute path -- adjust it to wherever the
# realworldnlp repository was cloned. Presumably the working directory must be
# the repository root so that `realworldnlp.predictors` is importable; confirm.
%cd /home/admin_paulhykim_altostrat_com/realworldnlp

/home/admin_paulhykim_altostrat_com/realworldnlp


In [19]:
from itertools import chain
from typing import Dict

import numpy as np
import torch
import torch.optim as optim

from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.training import GradientDescentTrainer
from allennlp_models.structured_prediction.dataset_readers.universal_dependencies import UniversalDependenciesDatasetReader

from realworldnlp.predictors import UniversalPOSPredictor

In [20]:
# Model dimensions shared by the cells that build the embedder and the LSTM.
EMBEDDING_SIZE: int = 128  # token embedding width, also the LSTM input size
HIDDEN_SIZE: int = 128     # LSTM hidden-state width

In [21]:
class LstmTagger(Model):
    """Sequence tagger: embed tokens, encode them, project onto POS tags.

    The model embeds the input tokens, runs the embeddings through a
    sequence-to-sequence encoder and maps every encoder output to a score
    per label in the ``'pos'`` vocabulary namespace.

    Parameters
    ----------
    embedder : TextFieldEmbedder
        Turns the indexed ``words`` text field into embeddings.
    encoder : Seq2SeqEncoder
        Encodes the embedded sequence; its ``get_output_dim()`` sizes the
        input of the final linear layer.
    vocab : Vocabulary
        Vocabulary whose ``'pos'`` namespace size is the number of output
        classes.
    cuda_device : int, optional (default = -1)
        Device index to move the model to. Any value greater than -1 moves
        the whole module there; -1 leaves it where it was created.
    """

    def __init__(self,
                 embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary,
                 cuda_device=-1) -> None:
        super().__init__(vocab)
        self.embedder = embedder
        self.encoder = encoder
        self.linear = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                      out_features=vocab.get_vocab_size('pos'))
        self.accuracy = CategoricalAccuracy()

        if cuda_device > -1:
            # Move the module as a whole instead of listing submodules one by
            # one, so a layer added later cannot be left behind on the CPU.
            self.to(cuda_device)

    def forward(self,
                words: Dict[str, torch.Tensor],
                pos_tags: torch.Tensor = None,
                **args) -> Dict[str, torch.Tensor]:
        """Compute tag logits and, when gold tags are given, the loss.

        Parameters
        ----------
        words : Dict[str, torch.Tensor]
            Indexed text field; it is passed to the embedder and used to
            derive the padding mask.
        pos_tags : torch.Tensor, optional
            Gold tag indices. When provided, the accuracy metric is updated
            and a ``"loss"`` entry is added to the output.
        **args
            Additional keyword arguments are accepted and ignored
            (presumably the other fields produced by the dataset reader).

        Returns
        -------
        Dict[str, torch.Tensor]
            Always contains ``"tag_logits"``; contains ``"loss"`` only when
            ``pos_tags`` is not None.
        """
        mask = get_text_field_mask(words)

        embeddings = self.embedder(words)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.linear(encoder_out)

        output = {"tag_logits": tag_logits}
        if pos_tags is not None:
            # The same mask is passed to both the metric and the loss.
            self.accuracy(tag_logits, pos_tags, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, pos_tags, mask)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated accuracy, forwarding ``reset`` to the metric."""
        return {"accuracy": self.accuracy.get_metric(reset)}

In [22]:
# Dataset reader constructed with its default settings; it is reused below for
# both data loaders and for the predictor.
reader = UniversalDependenciesDatasetReader()

In [23]:
# Both splits live under the same UD English-EWT (v2.3) directory.
_UD_EWT_BASE = 'https://s3.amazonaws.com/realworldnlpbook/data/ud-treebanks-v2.3/UD_English-EWT'
train_path = _UD_EWT_BASE + '/en_ewt-ud-train.conllu'
dev_path = _UD_EWT_BASE + '/en_ewt-ud-dev.conllu'

In [24]:
# One bucket sampler (batches of 32, sorted on the "words" field) is shared by
# the training and validation loaders, which are built in that order.
sampler = BucketBatchSampler(sorting_keys=["words"], batch_size=32)
train_data_loader, dev_data_loader = (
    MultiProcessDataLoader(reader, data_path, batch_sampler=sampler)
    for data_path in (train_path, dev_path)
)

loading instances: 0it [00:00, ?it/s]

loading instances: 0it [00:00, ?it/s]

In [25]:
# Build the vocabulary from the instances of both splits, train first.
instance_streams = [train_data_loader.iter_instances(),
                    dev_data_loader.iter_instances()]
vocab = Vocabulary.from_instances(chain.from_iterable(instance_streams))

building vocab: 0it [00:00, ?it/s]

In [26]:
# Attach the vocabulary to each loader, training loader first.
for _loader in (train_data_loader, dev_data_loader):
    _loader.index_with(vocab)

In [27]:
# One embedding row per entry in the 'tokens' namespace, wrapped in a
# text-field embedder under the "tokens" key.
token_embedding = Embedding(
    embedding_dim=EMBEDDING_SIZE,
    num_embeddings=vocab.get_vocab_size('tokens'),
)
word_embeddings = BasicTextFieldEmbedder(token_embedders={"tokens": token_embedding})

In [28]:
# Batch-first LSTM, wrapped so it can be used as an AllenNLP Seq2SeqEncoder.
lstm = torch.nn.LSTM(input_size=EMBEDDING_SIZE,
                     hidden_size=HIDDEN_SIZE,
                     batch_first=True)
encoder = PytorchSeq2SeqWrapper(lstm)

In [29]:
# Use the first GPU when CUDA is available; otherwise pass -1, which makes
# LstmTagger skip the device move so the notebook still runs on CPU.
model = LstmTagger(word_embeddings, encoder, vocab,
                   cuda_device=0 if torch.cuda.is_available() else -1)

In [30]:
# Adam over all model parameters; no hyperparameters are passed, so the
# optimizer's defaults apply. Created after the model has been built.
optimizer = optim.Adam(model.parameters())

In [31]:
# Train on the training loader and validate on the dev loader after each epoch.
# The device falls back to CPU (-1) when CUDA is unavailable instead of
# hard-coding GPU 0.
# NOTE(review): patience equals num_epochs, so early stopping presumably never
# fires in a 10-epoch run -- confirm whether a smaller patience was intended.
trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    patience=10,
    num_epochs=10,
    cuda_device=0 if torch.cuda.is_available() else -1)
# Left as the last expression so the returned metrics dict is displayed.
trainer.train()

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

  0%|          | 0/392 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'best_epoch': 2,
 'peak_worker_0_memory_MB': 5352.85546875,
 'peak_gpu_0_memory_MB': 65.09912109375,
 'training_duration': '0:00:35.590955',
 'epoch': 9,
 'training_accuracy': 0.9784343915731848,
 'training_loss': 0.05888853891876203,
 'training_worker_0_memory_MB': 5352.85546875,
 'training_gpu_0_memory_MB': 65.09912109375,
 'validation_accuracy': 0.880427867027199,
 'validation_loss': 0.5728837497650631,
 'best_validation_accuracy': 0.8833306823604262,
 'best_validation_loss': 0.4643342192210848}

In [32]:
# Tag one example sentence and map the highest-scoring label index of every
# token back to its label in the 'pos' namespace.
predictor = UniversalPOSPredictor(model, reader)
tokens = ['The', 'dog', 'ate', 'the', 'apple', '.']
prediction = predictor.predict(tokens)
logits = prediction['tag_logits']
tag_ids = np.asarray(logits).argmax(axis=-1)

[vocab.get_token_from_index(label_id, 'pos') for label_id in tag_ids]

['DET', 'NOUN', 'NOUN', 'DET', 'NOUN', 'PUNCT']