## Parameters

In [2]:
# as is in https://github.com/allenai/allennlp/blob/master/training_config/naqanet.jsonnet

ROOT_DIR = "../"

# for dataloader
TEXT_LENGTH_LIMIT_PASSAGE = 400
TEXT_LENGTH_LIMIT_QUESTION = 50

# for vocab
MIN_COUNT = 2
EMBEDDING_DIM = 300

# for training
BATCH_SIZE = 8
LR = 5e-4
EPOCHS = 50
PATIENCE = 10

## Packages

In [3]:
import itertools
from overrides import overrides

import numpy as np
import pandas as pd

from typing import *

import torch
import torch.nn as nn
import torch.optim as optim

## DropReader

In [4]:
from allennlp.data.dataset_readers.reading_comprehension.drop import DropReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer


reader = DropReader(token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens = True), 
                                      "token_characters" : TokenCharactersIndexer(min_padding_length = 5)
                                     },
                    passage_length_limit = TEXT_LENGTH_LIMIT_PASSAGE, 
                    question_length_limit = TEXT_LENGTH_LIMIT_QUESTION
                   )


In [5]:
train_dataset = reader.read(ROOT_DIR + "data/drop_dataset/drop_dataset_train.json")
dev_dataset = reader.read(ROOT_DIR + "data/drop_dataset/drop_dataset_dev.json")

77409it [01:38, 787.04it/s] 
9536it [00:09, 976.61it/s] 


In [6]:
#vars(vars(train_dataset[0].fields["question"])['_token_indexers']['tokens'])

#tmp = next(iter(train_dataset))
#vars(vars(tmp)['fields']['passage'])

## Model

In [7]:
from allennlp.data.vocabulary import Vocabulary

#vocab = Vocabulary.from_instances(train_dataset, min_count={'tokens': MIN_COUNT})
vocab = Vocabulary.from_instances(train_dataset, 
                                  min_count={"token_characters": 200},
                                  pretrained_files = {"tokens": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.840B.300d.lower.converted.zip"},
                                  only_include_pretrained_words = True
                                 )
vocab

100%|██████████| 77409/77409 [01:09<00:00, 1115.24it/s]
1702926it [00:28, 60353.26it/s]


Vocabulary with namespaces:  tokens, Size: 37918 || token_characters, Size: 120 || Non Padded Namespaces: {'*tags', '*labels'}

In [8]:
from allennlp.modules.token_embedders import Embedding,TokenCharactersEncoder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.common.params import Params



#embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM)
#source_embedder = BasicTextFieldEmbedder({"tokens": embedding})

char_embedding_params = Params({
    'embedding': {"embedding_dim": 64},
    'encoder': {"type": "cnn",
                "embedding_dim": 64,
                "num_filters": 200,
                "ngram_filter_sizes": [5]
               }
})

word_embedding_params = Params({
    'pretrained_file': "https://allennlp.s3.amazonaws.com/datasets/glove/glove.840B.300d.lower.converted.zip",
    'embedding_dim': 300,
    'trainable': False
})

word_embedding = Embedding.from_params(vocab, word_embedding_params)
char_embedding = TokenCharactersEncoder.from_params(vocab, char_embedding_params)

embedder = BasicTextFieldEmbedder({"tokens": word_embedding, "token_characters": char_embedding})


1702926it [00:31, 54066.29it/s]


In [9]:
from allennlp.modules.seq2seq_encoders import QaNetEncoder

phrase_layer_encoder = QaNetEncoder(input_dim = 128,
                                    hidden_dim = 128,
                                    attention_projection_dim = 128,
                                    feedforward_hidden_dim = 128,
                                    num_blocks = 1,
                                    num_convs_per_block = 4,
                                    conv_kernel_size = 7,
                                    num_attention_heads = 8,
                                    dropout_prob = 0.1,
                                    layer_dropout_undecayed_prob = 0.1,
                                    attention_dropout_prob = 0
                                    )

modeling_layer_encoder = QaNetEncoder(input_dim = 128,
                                    hidden_dim = 128,
                                    attention_projection_dim = 128,
                                    feedforward_hidden_dim = 128,
                                    num_blocks = 6,
                                    num_convs_per_block = 2,
                                    conv_kernel_size = 5,
                                    num_attention_heads = 8,
                                    dropout_prob = 0.1,
                                    layer_dropout_undecayed_prob = 0.1,
                                    attention_dropout_prob = 0
                                    )

from allennlp.modules.matrix_attention.linear_matrix_attention import LinearMatrixAttention

matrix_attention_layer = LinearMatrixAttention(tensor_1_dim = 128,
                                               tensor_2_dim = 128,
                                               combination = "x,y,x*y"
                                              )

In [10]:
from allennlp.models.reading_comprehension.naqanet import NumericallyAugmentedQaNet

model = NumericallyAugmentedQaNet(vocab = vocab,
                                  text_field_embedder = embedder, 
                                  num_highway_layers = 2,
                                  phrase_layer = phrase_layer_encoder, 
                                  matrix_attention_layer = matrix_attention_layer,
                                  modeling_layer = modeling_layer_encoder
                                 )

In [11]:
from allennlp.data.iterators import BucketIterator
iterator = BucketIterator(batch_size=BATCH_SIZE, 
                          sorting_keys=[("passage","num_tokens"),
                                        ("question","num_tokens")],
                          max_instances_in_memory = 600)

iterator.index_with(vocab)

In [12]:
optimizer = optim.Adam(model.parameters(), lr=LR, betas = [0.8, 0.999], eps = 1e-7 )

In [13]:
if torch.cuda.is_available():
    print("cuda")
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    print("cpu")
    cuda_device = -1

cuda


## Training

In [14]:
from allennlp.training.trainer import Trainer

trainer = Trainer(model = model,
                  optimizer = optimizer,
                  iterator = iterator,
                  train_dataset = train_dataset,
                  validation_dataset = dev_dataset,
                  patience = PATIENCE,
                  validation_metric = "+f1",
                  num_epochs = EPOCHS,
                  cuda_device = cuda_device
                 )

In [15]:
trainer.train()

em: 0.1182, f1: 0.1485, loss: 1093317.9238 ||: 100%|██████████| 9677/9677 [1:28:00<00:00,  1.83it/s] 
em: 0.1758, f1: 0.2114, loss: 1986160.8119 ||: 100%|██████████| 1192/1192 [04:01<00:00,  4.95it/s]
em: 0.1669, f1: 0.2007, loss: 1093317.2938 ||: 100%|██████████| 9677/9677 [1:27:24<00:00,  1.85it/s]
em: 0.1912, f1: 0.2319, loss: 1986160.5231 ||: 100%|██████████| 1192/1192 [03:45<00:00,  5.29it/s]
em: 0.2090, f1: 0.2455, loss: 1093317.0304 ||: 100%|██████████| 9677/9677 [1:27:19<00:00,  1.85it/s] 
em: 0.2457, f1: 0.2797, loss: 1986160.4187 ||: 100%|██████████| 1192/1192 [03:45<00:00,  5.29it/s]
em: 0.2373, f1: 0.2745, loss: 1093316.8601 ||: 100%|██████████| 9677/9677 [1:27:06<00:00,  1.85it/s]
em: 0.2537, f1: 0.2864, loss: 1986160.2655 ||: 100%|██████████| 1192/1192 [03:45<00:00,  5.29it/s]
em: 0.2610, f1: 0.3009, loss: 1093316.7173 ||: 100%|██████████| 9677/9677 [1:26:59<00:00,  1.85it/s] 
em: 0.2711, f1: 0.3074, loss: 1986160.1613 ||: 100%|██████████| 1192/1192 [03:46<00:00,  5.26it/

{'best_epoch': 28,
 'peak_cpu_memory_MB': 0,
 'training_duration': '2 days, 9:01:32.943592',
 'training_start_epoch': 0,
 'training_epochs': 37,
 'epoch': 37,
 'training_em': 0.41761293906393315,
 'training_f1': 0.467515017633601,
 'training_loss': 1093315.7951999207,
 'training_cpu_memory_MB': 0.0,
 'validation_em': 0.4185192953020134,
 'validation_f1': 0.45564282718120835,
 'validation_loss': 1986159.748327336,
 'best_validation_em': 0.4286912751677852,
 'best_validation_f1': 0.4699538590604034,
 'best_validation_loss': 1986159.8392908622}

## Saving

In [16]:
with open(ROOT_DIR + "save/naqanet_model.th", 'wb') as f:
    torch.save(model.state_dict(), f)

In [17]:
vocab.save_to_files(ROOT_DIR + "save/naqanet_vocabulary")

## Results

In [19]:
vocab2 = Vocabulary.from_files(ROOT_DIR + "save/naqanet_vocabulary")

model2 = NumericallyAugmentedQaNet(vocab = vocab2,
                                  text_field_embedder = embedder, 
                                  num_highway_layers = 2,
                                  phrase_layer = phrase_layer_encoder, 
                                  matrix_attention_layer = matrix_attention_layer,
                                  modeling_layer = modeling_layer_encoder
                                 )

with open(ROOT_DIR + "save/naqanet_model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))

if cuda_device > -1:
    model2.cuda(cuda_device)

In [20]:
from allennlp.predictors.predictor import Predictor

predictor = Predictor(model2, reader)

In [21]:
for instance in itertools.islice(dev_dataset, 20):
    ans = predictor.predict_instance(instance)['answer']
    if 'value' in ans:
        print('Passage:', instance.fields['passage'].tokens)
        print('Question:', instance.fields['question'].tokens)
        print('GOLD:', instance.fields['metadata'].metadata['answer_texts'])
        print('PRED:', ans['value'])
        print("=" * 10)
        print()

Passage: [Hoping, to, rebound, from, their, loss, to, the, Patriots, ,, the, Raiders, stayed, at, home, for, a, Week, 16, duel, with, the, Houston, Texans, ., Oakland, would, get, the, early, lead, in, the, first, quarter, as, quarterback, JaMarcus, Russell, completed, a, 20, -, yard, touchdown, pass, to, rookie, wide, receiver, Chaz, Schilens, ., The, Texans, would, respond, with, fullback, Vonta, Leach, getting, a, 1, -, yard, touchdown, run, ,, yet, the, Raiders, would, answer, with, kicker, Sebastian, Janikowski, getting, a, 33, -, yard, and, a, 30, -, yard, field, goal, ., Houston, would, tie, the, game, in, the, second, quarter, with, kicker, Kris, Brown, getting, a, 53, -, yard, and, a, 24, -, yard, field, goal, ., Oakland, would, take, the, lead, in, the, third, quarter, with, wide, receiver, Johnnie, Lee, Higgins, catching, a, 29, -, yard, touchdown, pass, from, Russell, ,, followed, up, by, an, 80, -, yard, punt, return, for, a, touchdown, ., The, Texans, tried, to, rally, in

In [22]:
from allennlp.training.util import evaluate

metrics = evaluate(model2, dev_dataset, iterator, cuda_device, batch_weight_key="")

em: 0.42, f1: 0.46, loss: 1986159.87 ||: 100%|██████████| 1192/1192 [03:52<00:00,  5.12it/s]


In [23]:
print(metrics)

{'em': 0.41900376726663874, 'f1': 0.4572833821682715, 'loss': 1986159.865894376}
