### Predict question type 

In [77]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import json
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import BooleanAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor
torch.manual_seed(1)

<torch._C.Generator at 0x11939b470>

In [45]:
class DropTypeDatasetReader(DatasetReader):
    """
    DatasetReader
    """
    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None) -> None:
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
    def text_to_instance(self, tokens: List[Token], tag: List[str] = None) -> Instance:
        question_field = TextField(tokens, self.token_indexers)
        fields = {"question": question_field}

        if tag:
            label_field = LabelField(tag)
            fields["label"] = label_field

        return Instance(fields)
    def _read(self, file_path: str) -> Iterator[Instance]:
        data_pair = []
        with open('../data/drop_dataset/drop_dataset_dummy.json') as json_file:  
            data = json.load(json_file)
            for k in data:
                for elem in data[k]['qa_pairs']:
                    question = elem['question']
                    answer_type = "" 
                    for key in elem['answer']:
                        if (key == 'number') and (len(elem['answer'][key]) != 0):
                            answer_type = "number"
                            break

                        if (key == 'spans') and (len(elem['answer'][key]) != 0):
                            answer_type = "spans"
                            break

                        answer_type = 'date'
                    yield self.text_to_instance([Token(word) for word in question], answer_type)

In [46]:
reader = DropTypeDatasetReader()

In [57]:
data_train = reader.read('../data/drop_dataset/drop_dataset_dev.json')
data_dev = reader.read('../data/drop_dataset/drop_dataset_dummy.json')

541it [00:00, 12660.94it/s]
541it [00:00, 4322.33it/s]


In [58]:
for i in data:
    print(i)
    break

Instance with fields:
 	 question: TextField of length 43 with text: 
 		[H, o, w,  , l, o, n, g,  , w, a, s,  , t, h, e,  , L, i, o, n, ', s,  , l, o, n, g, e, s, t,  , f,
		i, e, l, d,  , g, o, a, l, ?]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 label: LabelField with label: spans in namespace: 'labels'.' 



In [59]:
vocab = Vocabulary.from_instances(data_train + data_dev)

100%|██████████| 1082/1082 [00:00<00:00, 27478.49it/s]


In [78]:
class QuestionClassifier(Model):
    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = BooleanAccuracy()
    def forward(self,
                question: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        mask = get_text_field_mask(question)
        embeddings = self.word_embeddings(question)
        encoder_out = self.encoder(embeddings, mask)
        tag_logits = self.hidden2tag(encoder_out)
        output = {"tag_logits": tag_logits}
        if label is not None:
            self.accuracy(tag_logits, label, mask)
            output["loss"] = sequence_cross_entropy_with_logits(tag_logits, label, mask)

        return output
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {"accuracy": self.accuracy.get_metric(reset)}

In [79]:
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

In [84]:
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))
model = QuestionClassifier(word_embeddings, lstm, vocab)

In [83]:
optimizer = optim.SGD(model.parameters(), lr=0.1)
iterator = BucketIterator(batch_size=2, sorting_keys=[("question", "num_tokens")])
iterator.index_with(vocab)
trainer = Trainer(model=model,
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=data_train,
                  validation_dataset=data_dev,
                  patience=10,
                  num_epochs=1000,
                  cuda_device=-1)
trainer.train()






  0%|          | 0/271 [00:00<?, ?it/s][A[A[A[A[A

RuntimeError: The size of tensor a (3) must match the size of tensor b (46) at non-singleton dimension 2