#### Based on https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install allennlp==0.9.0

In [3]:
import csv
import logging
import numpy as np
import os
import pandas as pd
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from allennlp.common.checks import ConfigurationError
from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.iterators import BasicIterator, DataIterator, BucketIterator
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.nn import util as nn_util
from allennlp.nn.util import get_text_field_mask
from allennlp.training.trainer import Trainer
from functools import partial
from overrides import overrides
from pathlib import Path
from scipy.special import expit # the sigmoid function
from sklearn.metrics import classification_report
from tqdm import tqdm
from typing import Dict, List, Callable, Iterable
import random

In [26]:
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

class AgentBenchmarkDatasetReader(DatasetReader):
    """Dataset reader for comma-separated ``utterance,label`` text files.

    Every non-empty line that is not the ``utterance`` header row becomes one
    instance with a ``tokens`` text field and a ``label`` field.

    Parameters
    ----------
    config :
        Object exposing a boolean ``testing`` attribute; when it is true,
        reading stops at the 50000th line of the file.
    tokenizer :
        Callable turning an utterance into a list of token strings.  Passing
        ``None`` selects a whitespace tokenizer.
    token_indexers :
        Indexers attached to the text field; defaults to a single-id indexer
        under the ``"tokens"`` key.
    """

    def __init__(self,
                 config,
                 tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        if tokenizer is None:
            # ``_read`` invokes the tokenizer as a plain function, while
            # WordTokenizer exposes ``tokenize()``; adapt it so the fallback
            # really is usable.
            word_tokenizer = WordTokenizer(JustSpacesWordSplitter())

            def _split_on_spaces(text: str) -> List[str]:
                return [token.text for token in word_tokenizer.tokenize(text)]

            tokenizer = _split_on_spaces
        self._tokenizer = tokenizer
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._config = config

    @overrides
    def _read(self, file_path):
        """Yield one instance per usable data line of ``file_path``."""
        logger.info("Reading instances from lines in file at: %s", file_path)
        with open(cached_path(file_path), mode="r", encoding='utf-8') as data_file:
            for line_number, line in enumerate(data_file, start=1):
                # In testing mode only a prefix of the file is read; the
                # count includes header and blank lines.
                if self._config.testing and line_number == 50000:
                    break
                line = line.strip("\n")
                if not line:
                    continue
                # NOTE(review): a plain split(",") mis-parses quoted fields
                # that contain commas -- confirm the data never has them.
                line_data = line.split(",")
                if line_data[0] == "utterance":
                    # Header row.
                    continue
                if len(line_data) < 2:
                    # No label column: skip instead of raising IndexError.
                    logger.warning("Skipping line %d of %s: no label column",
                                   line_number, file_path)
                    continue
                utterance = line_data[0]
                label = line_data[1]

                if utterance == "":
                    # Skip the row rather than abandoning the rest of the
                    # file, which would silently truncate the dataset.
                    logger.warning("Skipping line %d of %s: empty utterance",
                                   line_number, file_path)
                    continue

                yield self.text_to_instance(
                        [Token(x) for x in self._tokenizer(utterance)], label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         tokens:List[Token],
                         label: str = None) -> Instance:
        """Build an instance from already-tokenized text.

        The ``label`` field is only added when ``label`` is not ``None``.
        """
        # pylint: disable=arguments-differ
        sentence_field = TextField(tokens, self._token_indexers)
        fields = { "tokens" : sentence_field}

        if label is not None:
            fields['label'] = LabelField(label, "labels")

        return Instance(fields)

In [70]:
# Locations used further down to save and reload the trained model weights
# (ELMO_MODEL_PATH) and the vocabulary files (ELMO_VOCAB_PATH).
ELMO_MODEL_PATH="../../models/mercado-livre-pt-only/elmo-model.th"
ELMO_VOCAB_PATH="../../models/mercado-livre-pt-only/elmo-vocabulary"

In [28]:
# Directory holding the dataset and the three CSV splits inside it; each split
# is later passed to the dataset reader.
DATA_PATH = '../../data/mercado-livre-pt-only'
TRAIN_DATASET = os.path.join(DATA_PATH, 'train.csv')
VAL_DATASET = os.path.join(DATA_PATH, 'val.csv')
TEST_DATASET  = os.path.join(DATA_PATH, 'test.csv')

In [29]:
def seed_everything(seed=10):
    """Seed Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int
        Value used for every generator (default 10).
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the hash seed of the running process is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

In [30]:
seed_everything()

In [31]:
class Config(dict):
    """Dictionary of settings whose entries are also available as attributes."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror every keyword as an instance attribute so that both
        # cfg["name"] and cfg.name work.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as a dict item and as an attribute."""
        self.update({key: val})
        setattr(self, key, val)


# Hyper-parameters and switches used throughout the notebook.
config = Config(
    lazy=False,
    testing=False,
    seed=1,
    batch_size=512,
    lr=3e-2,
    epochs=30,
    hidden_sz=300,
    max_seq_len=82,  # necessary to limit memory usage
    max_vocab_size=30000,
)

In [32]:
# True when at least one CUDA device can be used.
USE_GPU = torch.cuda.is_available()
# CUDA device index used for the model and the batches.  Device 2 is the
# preferred card; fall back to device 0 on machines that expose fewer than
# three GPUs, where index 2 does not exist.
gpu = 2 if torch.cuda.device_count() > 2 else 0

In [33]:
 # Sanity check: create a small random tensor on the GPU when one is available, otherwise on the CPU instead of raising.
 print(torch.rand(2, 3).to("cuda" if torch.cuda.is_available() else "cpu"))

tensor([[0.4581, 0.4829, 0.3125],
        [0.6150, 0.2139, 0.4118]], device='cuda:0')


In [34]:
print(torch.__version__)

1.7.1


In [35]:
# Re-seed torch's global RNG with config.seed (1).
# NOTE(review): seed_everything() was called earlier with its default seed of
# 10, so from here on torch uses a different seed than `random` and NumPy --
# confirm that this is intended.
torch.manual_seed(config.seed)

<torch._C.Generator at 0x7fee2b3d0eb8>

### Prepare token handlers

In [36]:
# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

# Build the spaCy-backed splitter once; creating it inside tokenizer() would
# repeat the construction for every utterance that is read.
_word_splitter = SpacyWordSplitter(language='pt_core_news_sm', pos_tags=False)

def tokenizer(x: str):
    """Split ``x`` into word strings, keeping at most ``config.max_seq_len`` of them."""
    return [w.text for w in _word_splitter.split_words(x)[:config.max_seq_len]]

In [37]:
# One reader instance is shared by all three splits.
reader = AgentBenchmarkDatasetReader(
    config,
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer},
)
# Load the splits in the order train, validation, test.
train_dataset, val_dataset, test_dataset = [
    reader.read(split_path)
    for split_path in (TRAIN_DATASET, VAL_DATASET, TEST_DATASET)
]

498780it [03:41, 2246.81it/s]
55420it [00:25, 2159.96it/s]
138550it [01:01, 2254.30it/s]


In [38]:
vars(train_dataset[0].fields["tokens"])

{'tokens': [cabo,
  fio,
  de,
  aco,
  pesca,
  flexivel,
  marine,
  sports,
  90lbs,
  -,
  10,
  m],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x7fe93128e080>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [39]:
vocab = Vocabulary.from_instances(train_dataset)

100%|██████████| 498780/498780 [00:02<00:00, 168831.44it/s]


In [40]:
vars(vocab)

{'_padding_token': '@@PADDING@@',
 '_oov_token': '@@UNKNOWN@@',
 '_non_padded_namespaces': {'*labels', '*tags'},
 '_token_to_index': _TokenToIndexDefaultDict(None,
                          {'labels': {'CAR_SEAT_COVERS': 0,
                            'AUTOMOTIVE_SHIFT_LEVER_KNOBS': 1,
                            'CAR_ANTENNAS': 2,
                            'FOOTBALL_SHIRTS': 3,
                            'SURVEILLANCE_CAMERAS': 4,
                            'VIDEO_GAMES': 5,
                            'WALLPAPERS': 6,
                            'WRISTWATCHES': 7,
                            'SUNGLASSES': 8,
                            'CARPETS': 9,
                            'HANDBAGS': 10,
                            'DOLLS': 11,
                            'BOOKS': 12,
                            'LIGHT_BULBS': 13,
                            'RAM_MEMORY_MODULES': 14,
                            'JACKETS_AND_COATS': 15,
                            'MOBILE_DEVICE_CHARGERS': 16

In [41]:
# Size of the classifier output layer.  Derived from the label vocabulary so
# it cannot drift from the data (previously the literal 1048).
num_labels = vocab.get_vocab_size('labels')

In [42]:
from allennlp.data.iterators import BucketIterator

In [43]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

In [44]:
iterator.index_with(vocab)

In [45]:
# Map label ids back to category names.  The reader creates its label fields
# in the 'labels' namespace, so that is the namespace to query here (the
# 'tokens' namespace does not hold the categories).
label_dict = vocab.get_index_to_token_vocabulary('labels')

In [46]:
batch = next(iter(iterator(train_dataset)))

In [47]:
batch["tokens"]["tokens"]

tensor([[[259, 108, 106,  ..., 261, 261, 261],
         [259,  51,  50,  ..., 261, 261, 261],
         [259, 116, 113,  ..., 261, 261, 261],
         ...,
         [259, 104, 102,  ..., 261, 261, 261],
         [259, 116,  98,  ..., 261, 261, 261],
         [259,  99, 106,  ..., 261, 261, 261]],

        [[259,  99,  98,  ..., 261, 261, 261],
         [259, 106, 111,  ..., 261, 261, 261],
         [259, 113, 105,  ..., 261, 261, 261],
         ...,
         [259,  52,  54,  ..., 261, 261, 261],
         [259, 113,  98,  ..., 261, 261, 261],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259, 117, 102,  ..., 261, 261, 261],
         [259, 100,  98,  ..., 261, 261, 261],
         [259, 100, 117,  ..., 261, 261, 261],
         ...,
         [259, 101, 102,  ..., 261, 261, 261],
         [259, 116, 112,  ..., 261, 261, 261],
         [259,  98, 110,  ..., 261, 261, 261]],

        ...,

        [[259, 113, 115,  ..., 261, 261, 261],
         [259, 111,  98,  ..., 261, 261, 261]

In [48]:
class BaselineModel(Model):
    """Text classifier: token embeddings -> Seq2Vec encoder -> linear projection.

    Parameters
    ----------
    word_embeddings :
        Embedder applied to the ``tokens`` text-field tensors.
    encoder :
        Encoder reducing the embedded sequence to one vector per example.
    out_sz :
        Number of output classes.
    vocabulary :
        Vocabulary handed to the AllenNLP ``Model`` base class.  When omitted,
        the notebook-level ``vocab`` is used so that existing three-argument
        calls keep working.
    """

    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int,
                 vocabulary: Vocabulary = None):
        super().__init__(vocabulary if vocabulary is not None else vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = torch.nn.CrossEntropyLoss()
        self.metrics = {
                "accuracy": CategoricalAccuracy()
                #,"accuracy3": CategoricalAccuracy(top_k=3)
        }

    def forward(self, tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute class logits and probabilities, plus the loss when labels are given.

        Returns a dict with ``class_logits`` and ``class_probabilities``; the
        ``loss`` entry is present only when ``label`` is not ``None``.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        # Name the class dimension explicitly; the implicit-dim form of
        # softmax is deprecated.
        class_probabilities = F.softmax(class_logits, dim=-1)

        output = {"class_logits": class_logits, "class_probabilities": class_probabilities}

        if label is not None:
            # Flatten to one class index per example.  This covers both a
            # (batch,) and a (batch, 1) label tensor, including batch size 1.
            target = label.reshape(-1)
            output["loss"] = self.loss(class_logits, target)
            for metric in self.metrics.values():
                metric(class_logits, target)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Report the metrics accumulated by ``forward``.

        Without this override the accuracy metric is updated on every batch
        but never surfaced; the recorded training log shows only the loss.
        """
        return {name: metric.get_metric(reset) for name, metric in self.metrics.items()}

In [49]:
# Options and weights of the Portuguese ELMo model, taken from the
# "contributed/pt" folder of the AllenNLP model bucket.
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5'

# The embedder is registered under "tokens", the same key the dataset reader
# uses for its token indexer.
elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

In [50]:
# Bidirectional LSTM over the embedded tokens, wrapped so that it returns a
# single vector per sequence.
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(
    nn.LSTM(
        input_size=word_embeddings.get_output_dim(),
        hidden_size=config.hidden_sz,
        bidirectional=True,
        batch_first=True,
    )
)

In [51]:
model = BaselineModel(
    word_embeddings, 
    encoder, num_labels)

In [52]:
# Move the model onto the selected GPU when CUDA is available; on CPU-only
# machines nothing needs to happen (the old `else: model` branch was a no-op).
if USE_GPU:
    model.cuda(gpu)

In [53]:
batch = nn_util.move_to_device(batch, gpu if USE_GPU else -1)

In [54]:
# Pull the two model inputs out of the batch dict.  `labels` previously
# referred to the whole batch rather than its "label" tensor.
tokens = batch["tokens"]
labels = batch["label"]

In [55]:
mask = get_text_field_mask(tokens)

In [56]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)

In [57]:
embeddings.size()

torch.Size([512, 10, 1024])

In [58]:
state

tensor([[ 0.2972, -0.0752,  0.0504,  ..., -0.2361,  0.0104, -0.1632],
        [ 0.0572,  0.0513, -0.0315,  ..., -0.2705,  0.0430,  0.3052],
        [-0.1446, -0.0170, -0.4060,  ..., -0.1722,  0.0446, -0.2055],
        ...,
        [-0.0524,  0.0836,  0.0408,  ..., -0.1700,  0.0982, -0.0396],
        [ 0.0246,  0.0489, -0.2029,  ..., -0.1581,  0.0872,  0.2152],
        [ 0.0175,  0.1782,  0.0237,  ...,  0.0011,  0.1411,  0.1077]],
       device='cuda:2', grad_fn=<ViewBackward>)

In [59]:
model(**batch)



{'class_logits': tensor([[-5.9148e-02, -7.0721e-02, -1.1920e-01,  ...,  4.7724e-02,
           8.3995e-02, -8.2435e-02],
         [-1.0090e-01,  5.3150e-02,  1.0017e-01,  ..., -4.9621e-02,
           1.6763e-01, -2.0815e-01],
         [-3.4649e-02,  2.2156e-02, -9.0239e-02,  ..., -2.0660e-04,
           1.0688e-02,  8.4100e-02],
         ...,
         [-1.7105e-01,  1.2732e-02, -1.0934e-05,  ..., -8.7087e-02,
          -9.6925e-02, -2.4802e-02],
         [-1.2206e-01,  8.8779e-02,  1.3225e-01,  ...,  2.9692e-03,
          -4.1607e-03, -7.0190e-02],
         [-5.3652e-02, -5.3688e-03,  1.5957e-03,  ...,  2.1658e-02,
          -5.5652e-02,  5.7587e-02]], device='cuda:2', grad_fn=<AddmmBackward>),
 'class_probabilities': tensor([[0.0009, 0.0009, 0.0008,  ..., 0.0010, 0.0010, 0.0009],
         [0.0009, 0.0010, 0.0011,  ..., 0.0009, 0.0011, 0.0008],
         [0.0009, 0.0010, 0.0009,  ..., 0.0009, 0.0010, 0.0010],
         ...,
         [0.0008, 0.0010, 0.0010,  ..., 0.0009, 0.0009, 0.0009],

In [60]:
loss = model(**batch)["loss"]



In [61]:
loss

tensor(6.9584, device='cuda:2', grad_fn=<NllLossBackward>)

In [62]:
loss.backward()

In [63]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[-6.0827e-05, -4.1661e-05,  3.9743e-05,  ..., -1.4868e-05,
           4.9867e-05,  2.2017e-05],
         [-1.2223e-04,  2.1713e-05,  1.8538e-05,  ..., -1.1245e-04,
           2.5176e-05,  2.6623e-05],
         [-2.1081e-05,  1.6477e-05, -5.2541e-05,  ..., -4.5968e-05,
           3.5557e-05, -2.8829e-06],
         ...,
         [-1.6109e-05,  4.5520e-05, -1.2063e-06,  ...,  7.5589e-05,
          -4.6699e-05, -3.2060e-05],
         [ 7.7857e-05,  8.7021e-05,  1.1329e-05,  ...,  4.2701e-05,
           8.5330e-06,  1.2885e-07],
         [-8.2162e-05, -1.2985e-05,  2.9444e-05,  ..., -9.2555e-05,
           2.9029e-05,  4.5379e-05]], device='cuda:2'),
 tensor([[-1.2705e-05,  2.3131e-06, -1.3766e-05,  ...,  1.3586e-06,
          -1.9021e-05, -6.3431e-06],
         [-1.4106e-05, -3.4165e-06,  1.7656e-05,  ...,  5.5108e-06,
           1.1438e-05,  1.2764e-05],
         [ 2.6794e-06, -6.0544e-07,  1.2590e-05,  ...,  1.3125e-05,
          -1.3010e-06,  1.4601e-05],
         ...,
        

## TRAIN

In [64]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [65]:
from allennlp.training.trainer import Trainer

# Trainer wiring: the model, optimizer and bucket iterator defined above, the
# train split for fitting and the validation split for evaluation.
trainer = Trainer(
    model=model,
    patience=5,  # early-stopping patience, in epochs -- see the AllenNLP Trainer docs
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    cuda_device=gpu if USE_GPU else -1,  # -1 is passed when CUDA is unavailable
    num_epochs=config.epochs,
)

In [66]:
metrics = trainer.train()

loss: 1.3008 ||: 100%|██████████| 975/975 [12:16<00:00,  1.32it/s]
loss: 0.5782 ||: 100%|██████████| 109/109 [00:59<00:00,  1.83it/s]
loss: 0.5884 ||: 100%|██████████| 975/975 [12:34<00:00,  1.29it/s]
loss: 0.4622 ||: 100%|██████████| 109/109 [00:39<00:00,  2.76it/s]
loss: 0.4820 ||: 100%|██████████| 975/975 [12:30<00:00,  1.30it/s]
loss: 0.4289 ||: 100%|██████████| 109/109 [01:00<00:00,  1.79it/s]
loss: 0.4289 ||: 100%|██████████| 975/975 [12:19<00:00,  1.32it/s]
loss: 0.4074 ||: 100%|██████████| 109/109 [01:21<00:00,  1.34it/s]
loss: 0.4010 ||: 100%|██████████| 975/975 [12:03<00:00,  1.35it/s]
loss: 0.4176 ||: 100%|██████████| 109/109 [01:19<00:00,  1.38it/s]
loss: 0.3719 ||: 100%|██████████| 975/975 [11:56<00:00,  1.36it/s]
loss: 0.4212 ||: 100%|██████████| 109/109 [01:17<00:00,  1.40it/s]
loss: 0.3551 ||: 100%|██████████| 975/975 [11:38<00:00,  1.40it/s]
loss: 0.4035 ||: 100%|██████████| 109/109 [01:16<00:00,  1.42it/s]
loss: 0.3508 ||: 100%|██████████| 975/975 [11:52<00:00,  1.37i

In [71]:
# Here's how to save the model.
# Create the checkpoint directory first so that a missing folder cannot make
# the save fail after training has already finished.
os.makedirs(os.path.dirname(ELMO_MODEL_PATH) or ".", exist_ok=True)
with open(ELMO_MODEL_PATH, 'wb+') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files(ELMO_VOCAB_PATH)

In [72]:
# And here's how to reload the model.
# NOTE(review): vocab2 is loaded but not used below -- the model is rebuilt
# with the in-memory vocabulary and with the existing word_embeddings /
# encoder objects, so this cell does not exercise a cold reload.
vocab2 = Vocabulary.from_files(ELMO_VOCAB_PATH)

model = BaselineModel(
    word_embeddings,
    encoder, num_labels)

with open(ELMO_MODEL_PATH, 'rb') as f:
    # map_location='cpu' lets the checkpoint load on a machine that lacks the
    # GPU it was saved from; load_state_dict copies the values into the
    # model's parameters on whatever device they live on.
    model.load_state_dict(torch.load(f, map_location='cpu'))

In [73]:
# Guard the transfer like the other device moves in this notebook so the cell
# also runs on CPU-only machines; the bare `model` keeps the cell's display.
if USE_GPU:
    model.cuda(gpu)
model

BaselineModel(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): ElmoTokenEmbedder(
      (_elmo): Elmo(
        (_elmo_lstm): _ElmoBiLm(
          (_token_embedder): _ElmoCharacterEncoder(
            (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
            (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
            (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
            (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
            (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
            (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
            (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
            (_highways): Highway(
              (_layers): ModuleList(
                (0): Linear(in_features=2048, out_features=4096, bias=True)
                (1): Linear(in_features=2048, out_features=4096, bias=True)
              )
            )
            (_pro

### Generating Predictions

In [74]:
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr):
    """Return ``tsr`` as a NumPy array, detached from autograd and moved to the CPU."""
    detached = tsr.detach()
    return detached.cpu().numpy()

class Predictor:
    """Run a model over a dataset batch by batch and collect its class probabilities."""

    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        """Return the ``class_probabilities`` output for one batch as a NumPy array."""
        return tonp(self.model(**batch)["class_probabilities"])

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        """Return the stacked class probabilities for every instance in ``ds``.

        The model is switched to evaluation mode and is left in that mode.
        """
        batches = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        progress = tqdm(batches, total=self.iterator.get_num_batches(ds))
        with torch.no_grad():
            preds = [
                self._extract_data(nn_util.move_to_device(batch, self.cuda_device))
                for batch in progress
            ]
        return np.concatenate(preds, axis=0)

In [75]:
from allennlp.data.iterators import BasicIterator
# iterate over the dataset without changing its order
seq_iterator = BasicIterator(batch_size=64)
seq_iterator.index_with(vocab)

In [76]:
predictor = Predictor(model, seq_iterator, cuda_device=gpu if USE_GPU else -1)
test_preds = predictor.predict(test_dataset)

100%|██████████| 2165/2165 [03:37<00:00,  9.94it/s]


In [77]:
Y_pred=test_preds.argmax(axis=1)

In [78]:
# Gold label ids for the test split.  Look each label string up in the
# vocabulary instead of reading the field's private `_label_id`, which is only
# filled in once an iterator has indexed the instance.
Y_test = [
    vocab.get_token_index(instance.fields['label'].label, 'labels')
    for instance in test_dataset
]

In [79]:
label_dict = model.vocab.get_index_to_token_vocabulary('labels')

In [80]:
label_dict[26]

'FISHING_REELS'

In [81]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred, digits=3))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0      0.991     0.997     0.994       942
           1      1.000     1.000     1.000       938
           2      0.999     0.993     0.996       934
           3      0.945     0.960     0.953       921
           4      0.901     0.910     0.905       909
           5      0.963     0.981     0.972       908
           6      0.971     0.903     0.936       885
           7      0.950     0.951     0.950       876
           8      0.955     0.975     0.965       875
           9      0.940     0.967     0.953       857
          10      0.944     0.935     0.940       850
          11      0.890     0.903     0.896       843
          12      0.816     0.765     0.790       834
          13      0.923     0.942     0.933       829
          14      0.969     0.988     0.978       822
          15      0.919     0.886     0.902       815
          16      0.926     0.970     0.948       804
          17      0.612    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
