In [1]:
from typing import Iterator, List, Dict
import torch
import torch.optim as optim
import json
import numpy as np
from allennlp.data import Instance
from allennlp.data.fields import TextField, SequenceLabelField, LabelField
from allennlp.data.dataset_readers import DatasetReader
from allennlp.common.file_utils import cached_path
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.dataset_readers.reading_comprehension.drop import DropReader
from allennlp.data.tokenizers import Token
from allennlp.data.vocabulary import Vocabulary
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper 
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper, Seq2VecEncoder
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import BooleanAccuracy, CategoricalAccuracy
from allennlp.data.iterators import BucketIterator
from allennlp.training.trainer import Trainer
from allennlp.predictors import SentenceTaggerPredictor

from allennlp.common import JsonDict
from allennlp.data import DatasetReader, Instance
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.models import Model
from allennlp.predictors import Predictor
from overrides import overrides

# Fix PyTorch's global random seed so that results are repeatable across runs.
torch.manual_seed(1)

<torch._C.Generator at 0x124a3c130>

In [2]:
# In AllenNLP, a Model subclass represents a network that is trained.
class LstmClassifier(Model):
    """Classify a question by encoding it to a single vector and projecting to labels.

    The forward pipeline is: word IDs -> embeddings -> ``Seq2VecEncoder`` ->
    fully-connected layer producing one logit per label.

    Parameters
    ----------
    word_embeddings : TextFieldEmbedder
        Converts word IDs to their vector representations.
    encoder : Seq2VecEncoder
        Takes the embedded sequence and returns a single vector. Its
        ``get_output_dim()`` sets the input width of the final linear layer.
    vocab : Vocabulary
        The size of its ``'labels'`` namespace sets the number of output classes.
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)
        self.word_embeddings = word_embeddings

        # Any Seq2VecEncoder works here: often an RNN (LSTM/GRU), but AllenNLP
        # also ships CNNs and simpler options such as averaging the inputs.
        self.encoder = encoder

        # Reduce the encoded vector to one score per label.
        self.hidden2tag = torch.nn.Linear(in_features=encoder.get_output_dim(),
                                          out_features=vocab.get_vocab_size('labels'))
        self.accuracy = CategoricalAccuracy()

        # CrossEntropyLoss combines softmax and log likelihood loss, so no
        # separate softmax layer is applied to the logits.
        self.loss_function = torch.nn.CrossEntropyLoss()

    def forward(self,
                question: Dict[str, torch.Tensor],
                label: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Run the classifier on a batch; fields arrive as same-named arguments.

        Parameters
        ----------
        question : Dict[str, torch.Tensor]
            The indexed text field for the question.
        label : torch.Tensor, optional
            Gold labels. When given, the accuracy metric is updated and a
            loss is computed.

        Returns
        -------
        Dict[str, torch.Tensor]
            Always contains ``"logits"``. Contains ``"loss"`` only when
            ``label`` is provided; AllenNLP needs that key to train the model.
        """
        # Batched sequences of different lengths are padded; the mask lets the
        # encoder ignore the padding positions.
        mask = get_text_field_mask(question)

        embeddings = self.word_embeddings(question)
        encoder_out = self.encoder(embeddings, mask)
        logits = self.hidden2tag(encoder_out)

        output = {"logits": logits}
        if label is not None:
            self.accuracy(logits, label)
            output["loss"] = self.loss_function(logits, label)

        return output

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the accuracy metric, passing ``reset`` through to the metric."""
        return {"accuracy": self.accuracy.get_metric(reset)}

In [3]:
# Build the DROP dataset reader using its default constructor arguments.
reader = DropReader()

In [6]:
# Read the dummy DROP dataset file. The path is relative, so it depends on the
# directory the notebook kernel was started from.
dummy = reader.read("../data/drop_dataset/drop_dataset_dummy.json")

In [23]:
# For each instance, show its answer annotations and count how many answer
# types (every key other than 'answer_texts') hold at least one entry.
for x in dummy:
    answer_info = x["metadata"]['answer_info']
    print(answer_info)
    cnt = sum(
        1
        for answer_type, candidates in answer_info.items()
        if answer_type != 'answer_texts' and candidates != []
    )
    print("number of possible answers: ", cnt)
    print()

{'answer_texts': ['3'], 'answer_passage_spans': [], 'answer_question_spans': [], 'signs_for_add_sub_expressions': [[0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0], [0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0]], 'counts': [3]}
number of possible answers:  2

{'answer_texts': ['2'], 'answer_passage_spans': [], 'answer_question_spans': [], 'signs_for_add_sub_expressions': [[0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]], 'counts': [2]}
number of possible answers:  2

{'answer_texts': ['28-yard'], 'answer_passage_spans': [(81, 83)], 'answer_question_spans': [], 'signs_for_add_sub_expressions': [], 'counts': []}
number of possible answers:  1

{'answer_texts': ['Mike Williams'], 'answer_passage_spans': [(176, 177)], 'answer_question_spans': [], 'signs_for_add_sub_expressions': [], 'counts': []}
number of possible answers:  1

{'answer_texts': ['Calvin Johnson'], 'answer_passage_spans': [(104, 105), (156, 157)], 'answer_question_spans': [], 'signs_for_add_sub_expressions': [], 'coun