#### Based on https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install allennlp==0.9.0

In [3]:
import csv
import logging
import numpy as np
import os
import pandas as pd
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from allennlp.common.checks import ConfigurationError
from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.iterators import BasicIterator, DataIterator, BucketIterator
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.nn import util as nn_util
from allennlp.nn.util import get_text_field_mask
from allennlp.training.trainer import Trainer
from functools import partial
from overrides import overrides
from pathlib import Path
from scipy.special import expit # the sigmoid function
from sklearn.metrics import classification_report
from tqdm import tqdm
from typing import Dict, List, Callable, Iterable
import random

In [4]:
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

class AgentBenchmarkDatasetReader(DatasetReader):
    """Read ``utterance;label`` lines into AllenNLP classification instances.

    Every non-empty line of the input file is split on ``;``. The first field
    is tokenized and stored as a ``TextField`` under ``"tokens"``; the second
    field is stored as a ``LabelField`` under ``"label"`` (namespace
    ``"labels"``).

    Args:
        config: Object exposing a boolean ``testing`` attribute. When it is
            true, reading stops once the 50,000th line of the file is reached.
        tokenizer: Callable mapping an utterance string to a list of token
            strings. Falls back to whitespace splitting when falsy.
        token_indexers: Indexers attached to each ``TextField``. Defaults to a
            single ``SingleIdTokenIndexer`` registered as ``"tokens"``.
    """

    def __init__(self,
                 config,
                 tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        # _read invokes the tokenizer as a plain function, so the fallback has
        # to be callable as well (same behaviour as the default argument).
        self._tokenizer = tokenizer or str.split
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._config = config

    @overrides
    def _read(self, file_path):
        """Yield one ``Instance`` per data line of ``file_path``."""
        logger.info("Reading instances from lines in file at: %s", file_path)
        with open(cached_path(file_path), mode="r", encoding='utf-8') as data_file:
            for line_number, line in enumerate(data_file, start=1):
                # Debug mode: only look at the first lines of the file.
                if self._config.testing and line_number == 50000:
                    break
                line = line.strip("\n")
                if not line:
                    continue
                line_data = line.split(";")
                # Skip the header row.
                if line_data[0] == "utterance":
                    continue
                if len(line_data) < 2:
                    # No ';' separator, hence no label: skip instead of
                    # aborting the whole read with an IndexError.
                    logger.warning("Skipping line %d of %s: no ';' separator",
                                   line_number, file_path)
                    continue
                utterance, label = line_data[0], line_data[1]

                # NOTE(review): an empty utterance ends the read entirely;
                # confirm this is an intended end-of-data marker and not a
                # row that should merely be skipped.
                if utterance == "":
                    break

                yield self.text_to_instance(
                        [Token(x) for x in self._tokenizer(utterance)], label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         tokens:List[Token],
                         label: str = None) -> Instance:
        """Build an ``Instance`` from tokens and an optional label.

        Args:
            tokens: Tokens of one utterance.
            label: Class label; when ``None`` the instance has no ``"label"``
                field.

        Returns:
            An ``Instance`` with a ``"tokens"`` field and, if a label was
            given, a ``"label"`` field.
        """
        # pylint: disable=arguments-differ
        fields = {"tokens": TextField(tokens, self._token_indexers)}

        if label is not None:
            fields['label'] = LabelField(label, "labels")

        return Instance(fields)

In [5]:
# File that receives the trained model's state dict (written with torch.save
# after training and read back in the reload cell).
ELMO_MODEL_PATH="../../models/virtual-operator/elmo-model.th"
# Location passed to Vocabulary.save_to_files / Vocabulary.from_files.
ELMO_VOCAB_PATH="../../models/virtual-operator/elmo-vocabulary"

In [6]:
# Directory holding the three dataset splits read by the dataset reader.
DATA_PATH = '../../data/virtual-operator'
# One file per split, each located directly under DATA_PATH.
TRAIN_DATASET, VAL_DATASET, TEST_DATASET = (
    os.path.join(DATA_PATH, split_file)
    for split_file in ('train.csv', 'val.csv', 'test.csv')
)

In [7]:
def seed_everything(seed=10):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and every CUDA device),
    exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not only the current one: the notebook trains on a
    # non-default device index.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, which defeats the
    # deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [8]:
seed_everything()

In [9]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Attribute access is routed to the underlying dict, so ``cfg.key`` and
    ``cfg["key"]`` always refer to the same value regardless of which form
    was used to write it. Keys that collide with ``dict`` method names
    (e.g. ``items``) remain reachable through item access only.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def __getattr__(self, name):
        # Only invoked when regular attribute lookup fails, so dict methods
        # keep working unchanged.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def set(self, key, val):
        """Store ``val`` under ``key`` (visible as both item and attribute)."""
        self[key] = val
        
# Hyperparameters and run settings shared by the cells below.
config = Config(
    lazy=False,  # not read by the visible cells (the reader hard-codes lazy=False)
    testing=False,  # when True the dataset reader stops at the 50,000th line of each file
    seed=1,  # passed to torch.manual_seed
    batch_size=512,  # batch size of the training BucketIterator
    lr=3e-2,  # learning rate of the Adam optimizer
    epochs=30,  # num_epochs given to the Trainer
    hidden_sz=300,  # hidden size of the (bidirectional) LSTM encoder
    max_seq_len=82, # necessary to limit memory usage; the tokenizer truncates utterances to this many tokens
    max_vocab_size=30000,  # not read by the visible cells
)

In [10]:
# Whether CUDA is usable; later cells pass -1 (CPU) to AllenNLP when it is not.
USE_GPU =torch.cuda.is_available()
# Index of the CUDA device used for the model and batches.
# NOTE(review): hard-coded; confirm the host actually exposes a device with this index.
gpu=2

In [11]:
 print(torch.rand(2,3).cuda())

tensor([[0.4581, 0.4829, 0.3125],
        [0.6150, 0.2139, 0.4118]], device='cuda:0')


In [12]:
print(torch.__version__)

1.7.1


In [13]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x7f15753b1eb8>

### Prepare token handlers

In [14]:
# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

# Built once: constructing the splitter inside tokenizer() repeated the
# construction for every utterance read from the datasets.
_pt_word_splitter = SpacyWordSplitter(language='pt_core_news_sm',
                                      pos_tags=False)


def tokenizer(x: str) -> List[str]:
    """Split ``x`` into word strings, keeping at most ``config.max_seq_len``.

    Args:
        x: Raw utterance text.

    Returns:
        The text of each token produced by the Portuguese spaCy splitter,
        truncated to the first ``config.max_seq_len`` tokens.
    """
    return [w.text for w in
            _pt_word_splitter.split_words(x)[:config.max_seq_len]]

In [15]:
reader = AgentBenchmarkDatasetReader(config, tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer})
train_dataset = reader.read(TRAIN_DATASET)
val_dataset = reader.read(VAL_DATASET)
test_dataset = reader.read(TEST_DATASET)

482348it [02:25, 3323.14it/s]
53595it [00:14, 3610.10it/s]
133986it [00:38, 3439.69it/s]


In [16]:
vars(train_dataset[0].fields["tokens"])

{'tokens': [para, visita, técnica],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x7f1076976438>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [17]:
vocab = Vocabulary.from_instances(train_dataset)

100%|██████████| 482348/482348 [00:02<00:00, 228711.93it/s]


In [18]:
vars(vocab)

{'_padding_token': '@@PADDING@@',
 '_oov_token': '@@UNKNOWN@@',
 '_non_padded_namespaces': {'*labels', '*tags'},
 '_token_to_index': _TokenToIndexDefaultDict(None,
                          {'labels': {'Sintomas.Genérico.Sem sinal': 0,
                            'Sintomas.Qualificado.Ausência de sinal': 1,
                            'Sintomas.Genérico.Problema com equipamento': 2,
                            'Sintomas.Genérico.Sky não funciona': 3,
                            'Sintomas.Genérico.Falar com atendente': 4,
                            'Sintomas.Genérico.Problema com imagem': 5,
                            'Sintomas.Genérico.Canal não pega': 6,
                            'Sintomas.Genérico.Troca de equipamento': 7,
                            'Sintomas.Genérico.Problema com canal': 8,
                            'Sintomas.Qualificado.Mudança de endereço': 9,
                            'Sintomas.Qualificado.Banda larga': 10,
                            'Sintomas.Genérico.

In [19]:
# Derive the number of classes from the vocabulary built on the training set
# rather than hard-coding it, so the output layer always matches the data.
num_labels = vocab.get_vocab_size('labels')

In [20]:
from allennlp.data.iterators import BucketIterator

In [21]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

In [22]:
iterator.index_with(vocab)

In [23]:
# Class names live in the 'labels' namespace (the one the dataset reader
# gives to its LabelField), not in 'tokens'.
label_dict = vocab.get_index_to_token_vocabulary('labels')

In [24]:
batch = next(iter(iterator(train_dataset)))

In [25]:
batch["tokens"]["tokens"]

tensor([[[259, 102, 114,  ..., 261, 261, 261]],

        [[259, 106, 110,  ..., 261, 261, 261]],

        [[259, 100,  98,  ..., 261, 261, 261]],

        ...,

        [[259, 103, 106,  ..., 261, 261, 261]],

        [[259, 101, 102,  ..., 261, 261, 261]],

        [[259, 102, 114,  ..., 261, 261, 261]]])

In [26]:
class BaselineModel(Model):
    """Text classifier: token embedder -> Seq2Vec encoder -> linear layer.

    Args:
        word_embeddings: Embedder applied to the ``tokens`` text field.
        encoder: Encoder reducing the embedded sequence to a single vector.
        out_sz: Number of output classes.
    """

    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int):
        # NOTE(review): relies on the notebook-level ``vocab`` variable
        # instead of a constructor argument.
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = torch.nn.CrossEntropyLoss()
        self.metrics = {
                "accuracy": CategoricalAccuracy()
                #,"accuracy3": CategoricalAccuracy(top_k=3)
        }

    def forward(self, tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Classify a batch of utterances.

        Args:
            tokens: Indexed text field for the batch.
            label: Optional gold class indices. When given, the loss is
                computed and the metrics are updated.

        Returns:
            Dict with ``"class_logits"`` and ``"class_probabilities"``, plus
            ``"loss"`` when ``label`` is provided.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        # Normalise over the class dimension explicitly; the implicit-dim
        # form of softmax is deprecated.
        class_probabilities = F.softmax(class_logits, dim=-1)

        output = {"class_logits": class_logits,
                  "class_probabilities": class_probabilities}

        if label is not None:
            # Flatten to one class index per example; this covers both a
            # trailing singleton dimension and a batch of size one.
            target = label.reshape(-1)
            output["loss"] = self.loss(class_logits, target)
            for metric in self.metrics.values():
                metric(class_logits, target)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the current value of every metric in ``self.metrics``.

        Args:
            reset: Forwarded to each metric's ``get_metric``.
        """
        return {name: metric.get_metric(reset)
                for name, metric in self.metrics.items()}

In [27]:
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5'

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

In [28]:
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz, 
                                                        bidirectional=True, batch_first=True))

In [29]:
model = BaselineModel(
    word_embeddings, 
    encoder, num_labels)

In [30]:
if USE_GPU: model.cuda(gpu)
else: model

In [31]:
batch = nn_util.move_to_device(batch, gpu if USE_GPU else -1)

In [32]:
# Pull the two fields produced by the dataset reader out of the batch dict.
tokens = batch["tokens"]
labels = batch["label"]

In [33]:
mask = get_text_field_mask(tokens)

In [34]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)

In [35]:
embeddings.size()

torch.Size([512, 1, 1024])

In [36]:
state

tensor([[ 0.0060,  0.0451, -0.1665,  ..., -0.0425,  0.2952, -0.1151],
        [-0.0052,  0.0657,  0.1333,  ..., -0.0764,  0.0946,  0.1465],
        [-0.0788,  0.0570, -0.0321,  ...,  0.0499,  0.3829, -0.1072],
        ...,
        [-0.1604,  0.1710, -0.2130,  ..., -0.0664,  0.0458, -0.0040],
        [ 0.1012,  0.0011, -0.1315,  ..., -0.0652,  0.2154,  0.1145],
        [-0.0641,  0.1374, -0.0768,  ..., -0.0208,  0.2029, -0.0149]],
       device='cuda:2', grad_fn=<ViewBackward>)

In [37]:
model(**batch)



{'class_logits': tensor([[-0.0897, -0.0763, -0.0892,  ..., -0.0072, -0.1741,  0.0967],
         [-0.1738, -0.3325, -0.0613,  ..., -0.0716, -0.0088,  0.0845],
         [-0.0331,  0.0664,  0.0095,  ...,  0.1420, -0.0432,  0.0594],
         ...,
         [-0.0523,  0.0139, -0.0854,  ..., -0.0604, -0.0689,  0.1154],
         [-0.0608,  0.0371, -0.0422,  ..., -0.0699, -0.0721,  0.0790],
         [-0.0623,  0.0454, -0.0813,  ..., -0.0589, -0.2096,  0.0404]],
        device='cuda:2', grad_fn=<AddmmBackward>),
 'class_probabilities': tensor([[0.0075, 0.0076, 0.0075,  ..., 0.0081, 0.0069, 0.0090],
         [0.0070, 0.0060, 0.0078,  ..., 0.0078, 0.0083, 0.0091],
         [0.0079, 0.0087, 0.0083,  ..., 0.0094, 0.0078, 0.0087],
         ...,
         [0.0079, 0.0084, 0.0076,  ..., 0.0078, 0.0077, 0.0093],
         [0.0077, 0.0085, 0.0079,  ..., 0.0077, 0.0077, 0.0089],
         [0.0078, 0.0087, 0.0076,  ..., 0.0078, 0.0067, 0.0086]],
        device='cuda:2', grad_fn=<SoftmaxBackward>),
 'loss': te

In [38]:
loss = model(**batch)["loss"]



In [39]:
loss

tensor(4.8112, device='cuda:2', grad_fn=<NllLossBackward>)

In [40]:
loss.backward()

In [41]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[ 7.3666e-05,  2.3069e-06, -1.0146e-04,  ...,  1.5169e-05,
          -1.4795e-04,  1.1037e-04],
         [ 2.0416e-04,  8.2249e-05,  2.6574e-04,  ...,  1.2385e-04,
          -1.1094e-04, -1.6953e-04],
         [ 1.2842e-04,  1.4631e-04,  3.8783e-04,  ...,  8.8625e-05,
          -1.6765e-04, -3.8572e-05],
         ...,
         [-7.3214e-05, -1.7743e-05,  4.3286e-05,  ..., -4.4244e-05,
           1.0873e-04, -2.3249e-05],
         [ 1.2628e-04,  4.2931e-05,  3.7484e-05,  ...,  1.3265e-04,
          -9.6574e-05, -1.9282e-04],
         [ 5.0728e-05, -7.2969e-05, -2.4717e-04,  ...,  8.9491e-05,
          -1.4963e-05,  6.7783e-05]], device='cuda:2'),
 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:2'),
 tensor([ 1.8792e-04, -2.6222e-04, -4.7218e-04,  ..., -1.2

## TRAIN

In [42]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [43]:
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    patience=5,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    cuda_device=gpu if USE_GPU else -1,
    num_epochs=config.epochs,
)

In [44]:
metrics = trainer.train()

loss: 1.0044 ||: 100%|██████████| 943/943 [11:20<00:00,  1.39it/s]
loss: 0.5356 ||: 100%|██████████| 105/105 [01:15<00:00,  1.38it/s]
loss: 0.5280 ||: 100%|██████████| 943/943 [10:56<00:00,  1.44it/s]
loss: 0.4674 ||: 100%|██████████| 105/105 [00:44<00:00,  2.36it/s]
loss: 0.4574 ||: 100%|██████████| 943/943 [11:48<00:00,  1.33it/s]
loss: 0.4030 ||: 100%|██████████| 105/105 [00:35<00:00,  2.99it/s]
loss: 0.4169 ||: 100%|██████████| 943/943 [11:15<00:00,  1.40it/s]
loss: 0.3834 ||: 100%|██████████| 105/105 [01:09<00:00,  1.52it/s]
loss: 0.3970 ||: 100%|██████████| 943/943 [10:25<00:00,  1.51it/s]
loss: 0.3660 ||: 100%|██████████| 105/105 [01:09<00:00,  1.51it/s]
loss: 0.3965 ||: 100%|██████████| 943/943 [10:42<00:00,  1.47it/s]
loss: 0.3614 ||: 100%|██████████| 105/105 [00:35<00:00,  2.92it/s]
loss: 0.3660 ||: 100%|██████████| 943/943 [11:53<00:00,  1.32it/s]
loss: 0.3494 ||: 100%|██████████| 105/105 [00:34<00:00,  3.02it/s]
loss: 0.3611 ||: 100%|██████████| 943/943 [11:33<00:00,  1.36i

loss: 0.3175 ||: 100%|██████████| 943/943 [10:36<00:00,  1.48it/s]
loss: 0.3293 ||: 100%|██████████| 105/105 [01:10<00:00,  1.48it/s]
loss: 0.3086 ||: 100%|██████████| 943/943 [10:57<00:00,  1.43it/s]
loss: 0.3222 ||: 100%|██████████| 105/105 [00:36<00:00,  2.87it/s]
loss: 0.3057 ||: 100%|██████████| 943/943 [12:13<00:00,  1.29it/s]
loss: 0.3278 ||: 100%|██████████| 105/105 [00:36<00:00,  2.86it/s]
loss: 0.2998 ||: 100%|██████████| 943/943 [12:02<00:00,  1.30it/s]
loss: 0.3259 ||: 100%|██████████| 105/105 [00:50<00:00,  2.09it/s]
loss: 0.3111 ||: 100%|██████████| 943/943 [11:33<00:00,  1.36it/s]
loss: 0.3228 ||: 100%|██████████| 105/105 [01:13<00:00,  1.43it/s]
loss: 0.2936 ||: 100%|██████████| 943/943 [11:17<00:00,  1.39it/s]
loss: 0.3405 ||: 100%|██████████| 105/105 [01:10<00:00,  1.48it/s]
loss: 0.2952 ||: 100%|██████████| 943/943 [10:56<00:00,  1.44it/s]
loss: 0.3417 ||: 100%|██████████| 105/105 [01:09<00:00,  1.51it/s]


In [45]:
# Here's how to save the model.
with open(ELMO_MODEL_PATH, 'wb+') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files(ELMO_VOCAB_PATH)

In [46]:
# And here's how to reload the model.
# NOTE(review): vocab2 is loaded but not used below -- BaselineModel reads the
# notebook-level ``vocab``. The embedder and encoder objects are also the
# already-trained in-memory instances, so this does not build a fresh model.
vocab2 = Vocabulary.from_files(ELMO_VOCAB_PATH)

model = BaselineModel(
    word_embeddings,
    encoder, num_labels)

with open(ELMO_MODEL_PATH, 'rb') as f:
    # Load the checkpoint onto the CPU so restoring does not depend on the
    # CUDA device the tensors were saved from; load_state_dict copies the
    # values into the model's parameters wherever they live.
    model.load_state_dict(torch.load(f, map_location="cpu"))

In [47]:
# Move to the configured GPU only when CUDA is available, consistent with the
# guarded placement used when the model was first created.
model.cuda(gpu) if USE_GPU else model

BaselineModel(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): ElmoTokenEmbedder(
      (_elmo): Elmo(
        (_elmo_lstm): _ElmoBiLm(
          (_token_embedder): _ElmoCharacterEncoder(
            (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
            (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
            (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
            (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
            (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
            (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
            (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
            (_highways): Highway(
              (_layers): ModuleList(
                (0): Linear(in_features=2048, out_features=4096, bias=True)
                (1): Linear(in_features=2048, out_features=4096, bias=True)
              )
            )
            (_pro

### Generating Predictions

In [48]:
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr):
    """Convert a tensor to a NumPy array, detached from autograd and on the CPU."""
    host_tensor = tsr.detach().cpu()
    return host_tensor.numpy()

class Predictor:
    """Run a model over a dataset and collect its class probabilities.

    Args:
        model: Model whose output dict contains ``"class_probabilities"``.
        iterator: Iterator used to batch the dataset.
        cuda_device: Device index the batches are moved to before inference.
    """

    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        """Return the model's class probabilities for one batch as NumPy."""
        return tonp(self.model(**batch)["class_probabilities"])

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        """Return the stacked class probabilities for every instance in ``ds``.

        Batches are produced in a single unshuffled pass and the model is
        switched to eval mode before inference.
        """
        batches = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        progress = tqdm(batches, total=self.iterator.get_num_batches(ds))
        with torch.no_grad():
            per_batch = [
                self._extract_data(
                    nn_util.move_to_device(raw_batch, self.cuda_device))
                for raw_batch in progress
            ]
        return np.concatenate(per_batch, axis=0)

In [49]:
from allennlp.data.iterators import BasicIterator
# iterate over the dataset without changing its order
seq_iterator = BasicIterator(batch_size=64)
seq_iterator.index_with(vocab)

In [50]:
predictor = Predictor(model, seq_iterator, cuda_device=gpu if USE_GPU else -1)
test_preds = predictor.predict(test_dataset)

100%|██████████| 2094/2094 [09:11<00:00,  3.79it/s]


In [51]:
Y_pred=test_preds.argmax(axis=1)

In [52]:
# Gold label ids of the test instances, in dataset order.
Y_test = [vars(instance.fields['label'])['_label_id']
          for instance in test_dataset]

In [53]:
label_dict = model.vocab.get_index_to_token_vocabulary('labels')

In [54]:
label_dict[26]

'Sintomas.Genérico.Equipamento quebrado G'

In [55]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.942     0.941     0.942     14552
           1      0.984     0.985     0.985     10158
           2      0.982     0.975     0.979      9645
           3      0.926     0.904     0.915      8357
           4      0.971     0.958     0.964      8262
           5      0.963     0.947     0.955      6605
           6      0.851     0.919     0.884      5967
           7      0.958     0.957     0.958      3737
           8      0.957     0.940     0.948      3547
           9      0.957     0.948     0.952      3343
          10      0.973     0.981     0.977      2590
          11      0.986     0.990     0.988      2501
          12      0.921     0.950     0.935      2318
          13      0.998     0.995     0.997      2193
          14      0.972     0.972     0.972      2106
          15      0.855     0.851     0.853      2049
          16      0.960     0.922     0.940      1975
          17      0.877    