### Predict question type 

In [11]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import json
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper 
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper, Seq2VecEncoder
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import BooleanAccuracy, CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

from allennlp.common import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.models import Model
from allennlp.predictors import Predictor
from overrides import overrides

torch.manual_seed(1)

<torch._C.Generator at 0x117b51eb0>

In [2]:
class DropTypeDatasetReader(DatasetReader):
    """
    DatasetReader
    """
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        
    def text_to_instance(self, tokens: List[Token], tag: List[str] = None) -> Instance:
        question_field = TextField(tokens, self.token_indexers)
        fields = {"question": question_field}

        if tag:
            label_field = LabelField(tag)
            fields["label"] = label_field
        return Instance(fields)
    
    def _read(self, file_path: str) -> Iterator[Instance]:
        with open(file_path) as json_file:  
            data = json.load(json_file)
            for k in data:
                for elem in data[k]['qa_pairs']:
                    question = elem['question']
                    answer_type = "" 
                    for key in elem['answer']:
                        if (key == 'number') and (len(elem['answer'][key]) != 0):
                            answer_type = "number"
                            break

                        if (key == 'spans') and (len(elem['answer'][key]) != 0):
                            answer_type = "spans"
                            break

                        answer_type = 'date'
                    yield self.text_to_instance([Token(word) for word in question], answer_type)

In [3]:
reader = DropTypeDatasetReader()

In [4]:
data_train = reader.read('../data/drop_dataset/drop_dataset_train.json')
data_dev = reader.read('../data/drop_dataset/drop_dataset_dev.json')

77409it [00:14, 5301.04it/s]
9536it [00:02, 3754.08it/s]


In [5]:
vocab = Vocabulary.from_instances(data_train + data_dev)

100%|██████████| 86945/86945 [00:05<00:00, 16222.83it/s]


In [6]:
# Model in AllenNLP represents a model that is trained.
class LstmClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        # We need the embeddings to convert word IDs to their vector representations
        self.word_embeddings = word_embeddings

        # Seq2VecEncoder is a neural network abstraction that takes a sequence of something
        # (usually a sequence of embedded word vectors), processes it, and returns it as a single
        # vector. Oftentimes, this is an RNN-based architecture (e.g., LSTM or GRU), but
        # AllenNLP also supports CNNs and other simple architectures (for example,
        # just averaging over the input vectors).
        self.encoder = encoder

        # After converting a sequence of vectors to a single vector, we feed it into
        # a fully-connected linear layer to reduce the dimension to the total number of labels.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # We use the cross-entropy loss because this is a classification task.
        # Note that PyTorch's CrossEntropyLoss combines softmax and log likelihood loss,
        # which makes it unnecessary to add a separate softmax layer.
        self.loss_function = torch.nn.CrossEntropyLoss()

    # Instances are fed to forward after batching.
    # Fields are passed through arguments with the same name.
    def forward(self,
                question: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> torch.Tensor:
        # In deep NLP, when sequences of tensors in different lengths are batched together,
        # shorter sequences get padded with zeros to make them of equal length.
        # Masking is the process to ignore extra zeros added by padding
        mask = get_text_field_mask(question)

        # Forward pass
        embeddings = self.word_embeddings(question)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        # In AllenNLP, the output of forward() is a dictionary.
        # Your output dictionary must contain a "loss" key for your model to be trained.
        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [7]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [8]:
lstm = PytorchSeq2VecWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = LstmClassifier(word_embeddings, lstm, vocab)

In [21]:
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
iterator = BucketIterator(batch_size=64, sorting_keys=[("question", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=data_train,
                  validation_dataset=data_dev,
                  patience=10,
                  num_epochs=40,
                  cuda_device=-1)
trainer.train()

accuracy: 0.9634, loss: 0.1710 ||: 100%|██████████| 1210/1210 [00:38<00:00, 31.35it/s]
accuracy: 0.9682, loss: 0.1512 ||: 100%|██████████| 149/149 [00:01<00:00, 82.29it/s]
accuracy: 0.9641, loss: 0.1647 ||: 100%|██████████| 1210/1210 [00:39<00:00, 27.80it/s]
accuracy: 0.9688, loss: 0.1463 ||: 100%|██████████| 149/149 [00:01<00:00, 84.66it/s]
accuracy: 0.9622, loss: 0.1686 ||: 100%|██████████| 1210/1210 [00:38<00:00, 34.38it/s]
accuracy: 0.9659, loss: 0.1533 ||: 100%|██████████| 149/149 [00:01<00:00, 86.49it/s]
accuracy: 0.9636, loss: 0.1647 ||: 100%|██████████| 1210/1210 [00:40<00:00, 29.73it/s]
accuracy: 0.9661, loss: 0.1644 ||: 100%|██████████| 149/149 [00:01<00:00, 87.19it/s]
accuracy: 0.9653, loss: 0.1591 ||: 100%|██████████| 1210/1210 [00:38<00:00, 31.54it/s]
accuracy: 0.9700, loss: 0.1403 ||: 100%|██████████| 149/149 [00:01<00:00, 86.95it/s]
accuracy: 0.9651, loss: 0.1579 ||: 100%|██████████| 1210/1210 [00:43<00:00, 27.62it/s]
accuracy: 0.9689, loss: 0.1427 ||: 100%|██████████| 1

{'best_epoch': 37,
 'peak_cpu_memory_MB': 1307.824128,
 'training_duration': '00:27:24',
 'training_start_epoch': 0,
 'training_epochs': 39,
 'epoch': 39,
 'training_accuracy': 0.9703006110400599,
 'training_loss': 0.1322935859033884,
 'training_cpu_memory_MB': 1307.824128,
 'validation_accuracy': 0.9736786912751678,
 'validation_loss': 0.11910912504532194,
 'best_validation_accuracy': 0.9749370805369127,
 'best_validation_loss': 0.11396335590405753}

In [20]:
# You need to name your predictor and register so that `allennlp` command can recognize it
# Note that you need to use "@Predictor.register", not "@Model.register"!
@Predictor.register("sentence_classifier_predictor2")
class SentenceClassifierPredictor(Predictor):
    def __init__(self, model: Model, dataset_reader: DatasetReader) -> None:
        super().__init__(model, dataset_reader)
        self._tokenizer = SpacyWordSplitter(language='en_core_web_sm', pos_tags=True)

    def predict(self, sentence: str) -> JsonDict:
        return self.predict_json({"question" : question})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        sentence = json_dict["question"]
        tokens = self._tokenizer.split_words(question)
        return self._dataset_reader.text_to_instance([str(t) for t in tokens])

ConfigurationError: 'Cannot register sentence_classifier_predictor2 as Predictor; name already in use for SentenceClassifierPredictor'

In [17]:
tokens = ['This', 'is', 'the', 'best', 'movie', 'ever', '!']
predictor = SentenceClassifierPredictor(model, dataset_reader=reader)
logits = predictor.predict(tokens)['logits']
label_id = np.argmax(logits)

print(model.vocab.get_token_from_index(label_id, 'labels'))

TypeError: Argument 'string' has incorrect type (expected str, got list)