#### Based on https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#!pip install allennlp==0.9.0

In [28]:
import csv
import logging
import numpy as np
import os
import pandas as pd
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from allennlp.common.checks import ConfigurationError
from allennlp.data import Instance
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.iterators import BasicIterator, DataIterator, BucketIterator
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token
from allennlp.data.tokenizers.word_splitter import JustSpacesWordSplitter
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.vocabulary import Vocabulary
from allennlp.training.metrics import CategoricalAccuracy
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.models import Model
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer
from allennlp.modules.token_embedders import ElmoTokenEmbedder
from allennlp.nn import util as nn_util
from allennlp.nn.util import get_text_field_mask
from allennlp.training.trainer import Trainer
from functools import partial
from overrides import overrides
from pathlib import Path
from scipy.special import expit # the sigmoid function
from sklearn.metrics import classification_report
from tqdm import tqdm
from typing import Dict, List, Callable, Iterable
import random

In [4]:
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

class AgentBenchmarkDatasetReader(DatasetReader):
    """Read ``utterance;label`` lines from a text file into AllenNLP instances.

    Every non-empty line is split on ``;``. The first field is the utterance
    text and the second field is its label. A line whose first field is the
    literal ``utterance`` is treated as a header row and skipped.

    Parameters
    ----------
    config
        Object exposing a boolean ``testing`` attribute. When it is true,
        reading stops as soon as line 50000 of the file is reached.
    tokenizer
        Callable mapping an utterance string to a list of token strings.
        Defaults to whitespace splitting.
    token_indexers
        Indexers attached to every produced ``TextField``. Defaults to one
        ``SingleIdTokenIndexer`` registered under the key ``"tokens"``.
    """

    # Line number at which reading stops when ``config.testing`` is set.
    _TESTING_LINE_LIMIT = 50000

    def __init__(self,
                 config,
                 tokenizer: Callable[[str], List[str]] = lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        # ``_read`` calls the tokenizer directly and wraps each returned string
        # in a ``Token``, so the fallback has to follow the same convention.
        # The previous fallback (a ``WordTokenizer`` instance) did not, which
        # made ``tokenizer=None`` unusable.
        self._tokenizer = tokenizer or str.split
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self._config = config

    @overrides
    def _read(self, file_path):
        """Yield one ``Instance`` per data line of ``file_path``.

        Blank lines and the header row are skipped. Reading stops at the first
        line whose utterance field is empty. A non-empty line that contains no
        ``;`` raises ``IndexError`` because it has no label field.
        """
        logger.info("Reading instances from lines in file at: %s", file_path)
        with open(cached_path(file_path), mode="r", encoding='utf-8') as data_file:
            for line_number, line in enumerate(data_file, start=1):
                # In testing mode only the lines before the limit are read.
                if self._config.testing and line_number == self._TESTING_LINE_LIMIT:
                    break
                line = line.strip("\n")
                if not line:
                    continue
                line_data = line.split(";")
                if line_data[0] == "utterance":
                    # Header row.
                    continue
                utterance = line_data[0]
                label = line_data[1]

                # NOTE(review): an empty utterance ends the whole read rather
                # than skipping just this line - confirm this is intended.
                if utterance == "":
                    break

                yield self.text_to_instance(
                        [Token(x) for x in self._tokenizer(utterance)], label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         tokens:List[Token],
                         label: str = None) -> Instance:
        """Build an ``Instance`` with a ``tokens`` field and, if given, a ``label`` field.

        The label is stored as a ``LabelField`` in the ``"labels"`` vocabulary
        namespace. When ``label`` is ``None`` the instance has no label field.
        """
        # pylint: disable=arguments-differ
        sentence_field = TextField(tokens, self._token_indexers)
        fields = { "tokens" : sentence_field}

        if label is not None:
            fields['label'] = LabelField(label, "labels")

        return Instance(fields)

In [5]:
ELMO_MODEL_PATH="../../models/agent-benchmark/elmo-model.th"
ELMO_VOCAB_PATH="../../models/agent-benchmark/elmo-vocabulary"

In [6]:
DATA_PATH = '../../data/agent-benchmark'
TRAIN_DATASET = os.path.join(DATA_PATH, 'train.csv')
VAL_DATASET = os.path.join(DATA_PATH, 'val.csv')
TEST_DATASET  = os.path.join(DATA_PATH, 'test.csv')

In [7]:
def seed_everything(seed=10):
    """Seed every random number generator the notebook relies on.

    The same ``seed`` is applied to Python's ``random`` module, NumPy and
    PyTorch (CPU and CUDA). It is also exported as ``PYTHONHASHSEED``, and
    cuDNN is switched to deterministic mode.
    """
    # Interpreter-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # Numerical libraries.
    np.random.seed(seed)
    for apply_seed in (torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True

In [8]:
seed_everything()

In [9]:
class Config(dict):
    """Dictionary whose entries can also be read as attributes.

    Every keyword passed to the constructor, and every pair stored through
    ``set``, is kept both as a dict item and as an instance attribute, so
    ``cfg["lr"]`` and ``cfg.lr`` return the same value.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Mirror each dict entry onto the instance as an attribute.
        for name in kwargs:
            setattr(self, name, kwargs[name])

    def set(self, key, val):
        """Store ``val`` under ``key`` as both a dict item and an attribute."""
        self.update({key: val})
        setattr(self, key, val)
        
# Hyper-parameters and run settings shared by the cells of this notebook.
config = Config(
    lazy=False,  # not read anywhere in the visible notebook
    testing=False,  # when True, the dataset reader stops at line 50000 of each file
    seed=1,  # passed to torch.manual_seed; seed_everything() is called with its own default
    batch_size=512,  # batch size of the training BucketIterator
    lr=3e-2,  # learning rate of the Adam optimizer
    epochs=30,  # num_epochs given to the Trainer
    hidden_sz=300,  # hidden size of the bidirectional LSTM encoder
    max_seq_len=82, # the tokenizer keeps at most this many tokens; necessary to limit memory usage
    max_vocab_size=30000,  # not read anywhere in the visible notebook
)

In [10]:
# Run on a GPU when one is available; otherwise everything stays on the CPU.
USE_GPU = torch.cuda.is_available()
# Index of the CUDA device to use. Device 1 stays the preferred choice, but
# fall back to device 0 on machines with a single GPU, where index 1 does not
# exist and model.cuda(1) would fail.
gpu = 1 if torch.cuda.device_count() > 1 else 0

In [11]:
 print(torch.rand(2,3).cuda())

tensor([[0.4581, 0.4829, 0.3125],
        [0.6150, 0.2139, 0.4118]], device='cuda:0')


In [12]:
print(torch.__version__)

1.7.1


In [13]:
torch.manual_seed(config.seed)

<torch._C.Generator at 0x7fbd7c98ceb8>

### Prepare token handlers

In [14]:
# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

# Build the spaCy-backed splitter once; it used to be re-created on every call.
_word_splitter = SpacyWordSplitter(language='en_core_web_sm', pos_tags=False)


def tokenizer(x: str) -> List[str]:
    """Split ``x`` into word strings, keeping at most ``config.max_seq_len`` tokens.

    The limit is read from ``config`` on every call, so later changes to
    ``config.max_seq_len`` take effect immediately.
    """
    words = _word_splitter.split_words(x)[:config.max_seq_len]
    return [w.text for w in words]

In [15]:
reader = AgentBenchmarkDatasetReader(config, tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer})
train_dataset = reader.read(TRAIN_DATASET)
val_dataset = reader.read(VAL_DATASET)
test_dataset = reader.read(TEST_DATASET)

18415it [00:06, 3055.40it/s]
2047it [00:00, 3281.18it/s]
5116it [00:01, 3188.80it/s]


In [16]:
vars(train_dataset[0].fields["tokens"])

{'tokens': [where, is, my, meeting, today, located],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x7fbd96a42e10>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [17]:
vocab = Vocabulary.from_instances(train_dataset)

100%|██████████| 18415/18415 [00:00<00:00, 290636.52it/s]


In [18]:
vars(vocab)

{'_padding_token': '@@PADDING@@',
 '_oov_token': '@@UNKNOWN@@',
 '_non_padded_namespaces': {'*labels', '*tags'},
 '_token_to_index': _TokenToIndexDefaultDict(None,
                          {'labels': {'music_play': 0,
                            'IOT_hue': 1,
                            'QA_factoid': 2,
                            'calendar_set_event': 3,
                            'email_query': 4,
                            'weather_request': 5,
                            'general_conversation': 6,
                            'news_query': 7,
                            'calendar_delete_event': 8,
                            'radio_play': 9,
                            'general_feedback': 10,
                            'datetime_query': 11,
                            'QA_definition': 12,
                            'calendar_query_event': 13,
                            'QA_open_query': 14,
                            'email_send_email': 15,
                            'social_

In [19]:
# Size the classifier output from the label vocabulary instead of hard-coding
# it, so it always matches the number of distinct labels in the "labels"
# namespace built from the training data.
num_labels = vocab.get_vocab_size("labels")

In [20]:
from allennlp.data.iterators import BucketIterator

In [21]:
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

In [22]:
iterator.index_with(vocab)

In [23]:
# Index -> label-name mapping. The reader stores labels in the "labels"
# namespace (LabelField(label, "labels")), not in "tokens".
label_dict = vocab.get_index_to_token_vocabulary('labels')

In [24]:
batch = next(iter(iterator(train_dataset)))

In [25]:
batch["tokens"]["tokens"]

tensor([[[259, 120, 105,  ..., 261, 261, 261],
         [259, 106, 116,  ..., 261, 261, 261],
         [259, 117, 105,  ..., 261, 261, 261],
         ...,
         [259, 107, 112,  ..., 261, 261, 261],
         [259, 116, 110,  ..., 261, 261, 261],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259, 116, 105,  ..., 261, 261, 261],
         [259, 110, 102,  ..., 261, 261, 261],
         [259, 120, 112,  ..., 261, 261, 261],
         ...,
         [259, 113, 116,  ..., 261, 261, 261],
         [259, 105, 112,  ..., 261, 261, 261],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259, 120, 105,  ..., 261, 261, 261],
         [259,  40, 116,  ..., 261, 261, 261],
         [259,  99, 102,  ..., 261, 261, 261],
         ...,
         [259, 117, 120,  ..., 261, 261, 261],
         [259, 110, 112,  ..., 261, 261, 261],
         [  0,   0,   0,  ...,   0,   0,   0]],

        ...,

        [[259, 120, 105,  ..., 261, 261, 261],
         [259,  40, 116,  ..., 261, 261, 261]

In [26]:
class BaselineModel(Model):
    """Sentence classifier: embed tokens, encode to one vector, project to classes.

    Parameters
    ----------
    word_embeddings
        Embedder applied to the ``tokens`` text-field tensors.
    encoder
        Seq2Vec encoder that turns the embedded sequence into one vector per
        example. Its ``get_output_dim()`` sizes the projection layer.
    out_sz
        Number of output classes.
    """

    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int):
        # NOTE(review): ``vocab`` is not a parameter; it is resolved from the
        # enclosing notebook scope when the model is constructed.
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = torch.nn.CrossEntropyLoss()
        self.metrics = {
                "accuracy": CategoricalAccuracy()
                #,"accuracy3": CategoricalAccuracy(top_k=3)
        }

    def forward(self, tokens: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on one batch.

        Parameters
        ----------
        tokens
            Text-field tensors for the batch.
        label
            Gold class indices. Optional: without it only predictions are
            returned, and neither the loss nor the metrics are touched.

        Returns
        -------
        Dict with ``class_logits`` and ``class_probabilities``, plus ``loss``
        when ``label`` is given.
        """
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        # Normalise over the class dimension explicitly; relying on the
        # implicit dimension is deprecated.
        class_probabilities = F.softmax(class_logits, dim=-1)

        output = {"class_logits": class_logits, "class_probabilities": class_probabilities}

        if label is not None:
            # Flatten to one class index per example. This covers (batch,) and
            # (batch, 1) labels, including a batch of a single example.
            targets = label.reshape(-1)
            output["loss"] = self.loss(class_logits, targets)
            for metric in self.metrics.values():
                metric(class_logits, targets)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accumulated value of every metric in ``self.metrics``.

        ``reset`` is forwarded to each metric's ``get_metric``.
        """
        return {name: metric.get_metric(reset) for name, metric in self.metrics.items()}

In [29]:
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

In [30]:
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz, 
                                                        bidirectional=True, batch_first=True))

In [31]:
model = BaselineModel(
    word_embeddings, 
    encoder, num_labels)

In [32]:
# Move the model to the selected CUDA device when a GPU is available;
# on a CPU-only machine it is left where it is.
if USE_GPU:
    model.cuda(gpu)

In [33]:
batch = nn_util.move_to_device(batch, gpu if USE_GPU else -1)

In [34]:
tokens = batch["tokens"]
# The gold labels live under the "label" key of the batch (the field name used
# by the dataset reader); previously this bound the whole batch dict.
labels = batch["label"]

In [35]:
mask = get_text_field_mask(tokens)

In [36]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)

In [37]:
embeddings.size()

torch.Size([512, 10, 1024])

In [38]:
state

tensor([[ 0.0715, -0.0903,  0.1213,  ..., -0.1529, -0.4159,  0.0479],
        [ 0.0992, -0.2256,  0.5123,  ...,  0.1568, -0.1132,  0.0020],
        [ 0.1715, -0.0616,  0.2655,  ...,  0.0372, -0.0193,  0.0616],
        ...,
        [ 0.0394, -0.0180,  0.3718,  ...,  0.2085,  0.0173,  0.0364],
        [-0.1303,  0.0124,  0.3682,  ..., -0.0596, -0.1920,  0.1393],
        [ 0.1596, -0.0093,  0.2025,  ..., -0.1315, -0.1240, -0.1617]],
       device='cuda:1', grad_fn=<ViewBackward>)

In [39]:
model(**batch)



{'class_logits': tensor([[ 0.0067, -0.1262, -0.0408,  ..., -0.1524,  0.2003, -0.1028],
         [-0.1304, -0.0588, -0.0402,  ..., -0.0188,  0.1288,  0.0141],
         [ 0.0522,  0.0221, -0.1178,  ..., -0.0933,  0.2221,  0.0395],
         ...,
         [ 0.1004,  0.1618, -0.0251,  ..., -0.0758,  0.3149, -0.1048],
         [-0.1265, -0.0896,  0.0097,  ..., -0.0556,  0.0498,  0.0794],
         [ 0.0222,  0.0245,  0.0781,  ...,  0.0726,  0.0300, -0.0694]],
        device='cuda:1', grad_fn=<AddmmBackward>),
 'class_probabilities': tensor([[0.0154, 0.0135, 0.0147,  ..., 0.0131, 0.0187, 0.0138],
         [0.0138, 0.0148, 0.0150,  ..., 0.0154, 0.0178, 0.0159],
         [0.0164, 0.0159, 0.0138,  ..., 0.0142, 0.0194, 0.0162],
         ...,
         [0.0172, 0.0183, 0.0152,  ..., 0.0144, 0.0213, 0.0140],
         [0.0139, 0.0144, 0.0159,  ..., 0.0149, 0.0165, 0.0170],
         [0.0157, 0.0157, 0.0166,  ..., 0.0165, 0.0158, 0.0143]],
        device='cuda:1', grad_fn=<SoftmaxBackward>),
 'loss': te

In [40]:
loss = model(**batch)["loss"]



In [41]:
loss

tensor(4.1651, device='cuda:1', grad_fn=<NllLossBackward>)

In [42]:
loss.backward()

In [43]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[-7.8140e-06,  2.9064e-05,  2.7688e-05,  ..., -1.7780e-05,
           6.6326e-05,  3.1826e-05],
         [-1.0603e-05,  2.6026e-05,  1.3244e-05,  ...,  7.4283e-06,
          -1.2003e-05, -2.1034e-05],
         [ 6.7452e-06,  9.7185e-06,  3.0878e-05,  ..., -2.3322e-05,
          -1.6964e-05, -2.6364e-05],
         ...,
         [-1.1221e-05, -3.6577e-05, -5.3905e-06,  ...,  9.7437e-06,
           1.5629e-05, -1.7851e-05],
         [ 5.1697e-05, -4.1432e-05,  3.5090e-05,  ...,  1.2223e-04,
          -5.5800e-05,  1.1447e-04],
         [ 5.3943e-05, -1.0062e-04, -5.4458e-05,  ..., -2.4138e-05,
           4.2068e-05,  3.3619e-05]], device='cuda:1'),
 tensor([[-4.9624e-06, -6.7475e-06,  1.0555e-05,  ...,  6.0959e-07,
          -1.1160e-05,  3.8650e-06],
         [-5.5328e-06,  3.0699e-06,  7.6127e-08,  ...,  8.8970e-06,
           1.5602e-05, -8.0203e-08],
         [ 7.8553e-06,  1.5743e-05, -6.6295e-06,  ..., -3.2441e-06,
          -2.4200e-05, -2.0758e-07],
         ...,
        

## TRAIN

In [44]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)

In [45]:
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    patience=5,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_dataset,
    validation_dataset=val_dataset,
    cuda_device=gpu if USE_GPU else -1,
    num_epochs=config.epochs,
)

In [46]:
metrics = trainer.train()

loss: 2.9354 ||: 100%|██████████| 36/36 [00:17<00:00,  2.06it/s]
loss: 1.4570 ||: 100%|██████████| 4/4 [00:01<00:00,  2.13it/s]
loss: 1.3101 ||: 100%|██████████| 36/36 [00:21<00:00,  1.70it/s]
loss: 1.1525 ||: 100%|██████████| 4/4 [00:01<00:00,  2.32it/s]
loss: 1.0423 ||: 100%|██████████| 36/36 [00:22<00:00,  1.58it/s]
loss: 1.0346 ||: 100%|██████████| 4/4 [00:02<00:00,  1.72it/s]
loss: 0.9033 ||: 100%|██████████| 36/36 [00:18<00:00,  1.90it/s]
loss: 0.9961 ||: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]
loss: 0.8208 ||: 100%|██████████| 36/36 [00:18<00:00,  1.98it/s]
loss: 0.9628 ||: 100%|██████████| 4/4 [00:01<00:00,  2.25it/s]
loss: 0.7407 ||: 100%|██████████| 36/36 [00:19<00:00,  1.87it/s]
loss: 0.9785 ||: 100%|██████████| 4/4 [00:01<00:00,  2.39it/s]
loss: 0.6964 ||: 100%|██████████| 36/36 [00:18<00:00,  1.96it/s]
loss: 0.9520 ||: 100%|██████████| 4/4 [00:01<00:00,  2.26it/s]
loss: 0.6443 ||: 100%|██████████| 36/36 [00:18<00:00,  1.98it/s]
loss: 0.9131 ||: 100%|██████████| 4/4 [

In [47]:
# Here's how to save the model.
with open(ELMO_MODEL_PATH, 'wb+') as f:
    torch.save(model.state_dict(), f)

vocab.save_to_files(ELMO_VOCAB_PATH)



In [48]:
# And here's how to reload the model.
# NOTE(review): vocab2 is loaded here but not used below - BaselineModel picks
# up the notebook-level `vocab` when it is constructed.
vocab2 = Vocabulary.from_files(ELMO_VOCAB_PATH)

model = BaselineModel(
    word_embeddings, 
    encoder, num_labels)

# Map the checkpoint to the CPU while loading so it can be restored on a
# machine that lacks the CUDA device it was saved from; load_state_dict then
# copies the values into the model's existing parameters.
with open(ELMO_MODEL_PATH, 'rb') as f:
    model.load_state_dict(torch.load(f, map_location="cpu"))

In [49]:
# Only move the reloaded model to the GPU when one is available, matching how
# the model was placed before training; an unconditional .cuda() call fails on
# CPU-only machines.
if USE_GPU:
    model.cuda(gpu)
model

BaselineModel(
  (word_embeddings): BasicTextFieldEmbedder(
    (token_embedder_tokens): ElmoTokenEmbedder(
      (_elmo): Elmo(
        (_elmo_lstm): _ElmoBiLm(
          (_token_embedder): _ElmoCharacterEncoder(
            (char_conv_0): Conv1d(16, 32, kernel_size=(1,), stride=(1,))
            (char_conv_1): Conv1d(16, 32, kernel_size=(2,), stride=(1,))
            (char_conv_2): Conv1d(16, 64, kernel_size=(3,), stride=(1,))
            (char_conv_3): Conv1d(16, 128, kernel_size=(4,), stride=(1,))
            (char_conv_4): Conv1d(16, 256, kernel_size=(5,), stride=(1,))
            (char_conv_5): Conv1d(16, 512, kernel_size=(6,), stride=(1,))
            (char_conv_6): Conv1d(16, 1024, kernel_size=(7,), stride=(1,))
            (_highways): Highway(
              (_layers): ModuleList(
                (0): Linear(in_features=2048, out_features=4096, bias=True)
                (1): Linear(in_features=2048, out_features=4096, bias=True)
              )
            )
            (_pro

### Generating Predictions

In [50]:
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr):
    """Return ``tsr`` as a NumPy array, detached from autograd and moved to the CPU."""
    detached = tsr.detach()
    on_host = detached.cpu()
    return on_host.numpy()

class Predictor:
    """Run a model over a dataset and collect its class probabilities.

    Parameters
    ----------
    model
        Model whose forward output contains ``class_probabilities``.
    iterator
        Iterator used to batch the dataset.
    cuda_device
        Device each batch is moved to before the forward pass (default ``-1``).
    """

    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model, self.iterator, self.cuda_device = model, iterator, cuda_device

    def _extract_data(self, batch) -> np.ndarray:
        """Run one batch through the model and return its probabilities as NumPy."""
        probabilities = self.model(**batch)["class_probabilities"]
        return tonp(probabilities)

    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        """Return the class probabilities for every instance in ``ds``.

        The dataset is traversed once, without shuffling, with the model in
        eval mode and gradients disabled. Per-batch results are concatenated
        along the first axis.
        """
        batches = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        progress = tqdm(batches, total=self.iterator.get_num_batches(ds))
        with torch.no_grad():
            preds = [
                self._extract_data(nn_util.move_to_device(batch, self.cuda_device))
                for batch in progress
            ]
        return np.concatenate(preds, axis=0)

In [51]:
from allennlp.data.iterators import BasicIterator
# iterate over the dataset without changing its order
seq_iterator = BasicIterator(batch_size=64)
seq_iterator.index_with(vocab)

In [52]:
predictor = Predictor(model, seq_iterator, cuda_device=gpu if USE_GPU else -1)
test_preds = predictor.predict(test_dataset)

100%|██████████| 80/80 [00:09<00:00,  8.59it/s]


In [53]:
Y_pred=test_preds.argmax(axis=1)

In [54]:
# Gold label ids in dataset order, read from each instance's label field.
# NOTE(review): relies on the private `_label_id` attribute - presumably only
# populated once the instances have been indexed; confirm.
Y_test = [instance.fields['label']._label_id for instance in test_dataset]

In [55]:
label_dict = model.vocab.get_index_to_token_vocabulary('labels')

In [56]:
label_dict[26]

'audiobook_play'

In [57]:
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred, digits=3))

              precision    recall  f1-score   support

           0      0.792     0.795     0.793       244
           1      0.937     0.972     0.954       214
           2      0.661     0.831     0.736       195
           3      0.812     0.786     0.799       192
           4      0.944     0.859     0.899       177
           5      0.733     0.786     0.759       168
           6      0.617     0.558     0.586       165
           7      0.741     0.705     0.723       146
           8      0.855     0.890     0.872       146
           9      0.826     0.784     0.804       139
          10      0.609     0.741     0.669       139
          11      0.765     0.844     0.803       135
          12      0.828     0.895     0.860       124
          13      0.612     0.582     0.597       122
          14      0.453     0.325     0.379       120
          15      0.865     0.776     0.818       116
          16      0.892     0.853     0.872       116
          17      0.718    