In [None]:
# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 3.4MB/s eta 0:00:01
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 38.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 31.7MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=88bfa

In [2]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json https://rawcdn.githack.com/allenai/bi-att-flow/49004549e9a88b78c359b31481afa7792dbb3f4a/squad/evaluate-v1.1.py

--2020-01-15 15:14:06--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [application/json]
Saving to: ‘train-v1.1.json’


2020-01-15 15:14:07 (59.0 MB/s) - ‘train-v1.1.json’ saved [30288272/30288272]

--2020-01-15 15:14:07--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Reusing existing connection to rajpurkar.github.io:443.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘dev-v1.1.json’


2020-01-15 15:14:07 (171 MB/s) - ‘dev-v1.1.json’ saved [4854279/4854279]

--2020-01-15 15:14:07--  https://rawcdn.githack.com/allenai/bi-att-flow/49004549e9a88b78c359b31481afa7792dbb3f4a/squad/evaluate-v1.1.py
Resolving rawcdn.githack.com (rawc

In [1]:
class Args():
    model_type = 'bert'
    model_name_or_path = 'bert-base-cased'
    output_dir = 'output/'
    train_file = 'train-v1.1.json'
    predict_file = 'dev-v1.1.json'
    max_seq_length = 384
    doc_stride = 128
    max_query_length = 64
    evaluate_during_training = False
    per_gpu_train_batch_size = 12
    per_gpu_eval_batch_size = 8
    learning_rate = 3e-5
    gradient_accumulation_steps = 1
    weight_decay = 0.0
    adam_epsilon = 1e-8
    max_grad_norm = 1.0
    num_train_epochs = 2.0
    warmup_steps = 0
    n_best_size = 20
    max_answer_length = 30
    logging_steps = 50
    save_steps = 1000
    no_cuda = False
    seed = 42
args = Args()

In [2]:
import glob
import logging
import os
import random
import timeit

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
# from tqdm import tqdm, trange
from tqdm import trange
from tqdm import tqdm_notebook as tqdm

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    BertConfig,
    BertForQuestionAnswering,
    BertTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)

from transformers.data.metrics.squad_metrics import (
    compute_predictions_log_probs,
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor

from torch.utils.tensorboard import SummaryWriter

In [3]:
def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

def to_list(tensor):
    return tensor.detach().cpu().tolist()

In [4]:
# Setup CUDA, GPU & distributed training
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
args.device = device

In [5]:
 # Setup logging
logger = logging.getLogger(__name__)
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
)
logger.warning(
    "Device: %s, n_gpu: %s",
    device,
    args.n_gpu,
)

# Set seed
set_seed(args)



In [6]:
MODEL_CLASSES = {"bert": (BertConfig, BertForQuestionAnswering, BertTokenizer)}
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(
    args.model_name_or_path,
)
tokenizer = tokenizer_class.from_pretrained(
    args.model_name_or_path,
    do_lower_case=True,
)
#the nn.module BertForQuestionAnswering has a single untrained layer qa_output: Linear(hidden_size,2) on top of the trained BERT-base.
model = model_class.from_pretrained(
    args.model_name_or_path,
    config=config,
)

model.to(args.device)

logger.info("Training/evaluation parameters %s", args)

01/15/2020 19:10:18 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at C:\Users\lt\.cache\torch\transformers\b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.d7a3af18ce3a2ab7c0f48f04dc8daff45ed9a3ed333b9e9a79d012a0dedf87a6
01/15/2020 19:10:18 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "

In [8]:
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    # Load data features from cache or dataset file
    input_dir = "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        processor = SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(filename=args.predict_file)
        else:
            examples = processor.get_train_examples(filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
        )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset

In [9]:
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)

01/15/2020 19:11:05 - INFO - __main__ -   Loading features from cached file .\cached_train_bert-base-cased_384


In [None]:
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps,
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=False
    )
    # Added here for reproductibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)
        for step, batch in enumerate(epoch_iterator):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if  args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

    tb_writer.close()

    return global_step, tr_loss / global_step

In [None]:
# Training
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

01/14/2020 21:17:37 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f2b799ab518>
01/14/2020 21:17:37 - INFO - __main__ -   Loading features from cached file ./cached_train_bert-base-cased_384
01/14/2020 21:18:03 - INFO - __main__ -   ***** Running training *****
01/14/2020 21:18:03 - INFO - __main__ -     Num examples = 89632
01/14/2020 21:18:03 - INFO - __main__ -     Num Epochs = 2
01/14/2020 21:18:03 - INFO - __main__ -     Instantaneous batch size per GPU = 12
01/14/2020 21:18:03 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 12
01/14/2020 21:18:03 - INFO - __main__ -     Gradient Accumulation steps = 1
01/14/2020 21:18:03 - INFO - __main__ -     Total optimization steps = 14940


Epoch:   0%|          | 0/2 [00:00<?, ?it/s][A[A

HBox(children=(IntProgress(value=0, description='Iteration', max=7470, style=ProgressStyle(description_width='…

01/14/2020 21:26:38 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-1000/config.json
01/14/2020 21:26:39 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-1000/pytorch_model.bin
01/14/2020 21:26:39 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-1000
01/14/2020 21:26:43 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-1000
01/14/2020 21:35:20 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-2000/config.json
01/14/2020 21:35:21 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-2000/pytorch_model.bin
01/14/2020 21:35:21 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-2000
01/14/2020 21:35:24 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-2000
01/14/2020 21:44:00 - INFO - trans




HBox(children=(IntProgress(value=0, description='Iteration', max=7470, style=ProgressStyle(description_width='…

01/14/2020 22:27:27 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-8000/config.json
01/14/2020 22:27:28 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-8000/pytorch_model.bin
01/14/2020 22:27:28 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-8000
01/14/2020 22:27:32 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-8000
01/14/2020 22:36:08 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-9000/config.json
01/14/2020 22:36:09 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-9000/pytorch_model.bin
01/14/2020 22:36:09 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-9000
01/14/2020 22:36:12 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-9000
01/14/2020 22:44:49 - INFO - trans




In [None]:
# Save the trained model and the tokenizer

# Create output directory if needed
if not os.path.exists(args.output_dir):
    os.makedirs(args.output_dir)

logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# Take care of distributed/parallel training
model_to_save = model.module if hasattr(model, "module") else model
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)

# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

In [21]:
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)  # , force_download=True)
tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=True)
model.to(args.device)

01/15/2020 15:43:50 - INFO - transformers.configuration_utils -   loading configuration file output/config.json
01/15/2020 15:43:50 - INFO - transformers.configuration_utils -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 28996
}

01/15/2020 15:43:50 - INFO - transformers.modeling_utils -   loading weights file output/pytorch_model.bin
01/15/2020 15:44:05

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
def evaluate(args, model, tokenizer, prefix=""):
    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)

    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_results = []
    start_time = timeit.default_timer()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
            }

            example_indices = batch[3]

            outputs = model(**inputs)

        for i, example_index in enumerate(example_indices):
            eval_feature = features[example_index.item()]
            unique_id = int(eval_feature.unique_id)

            output = [to_list(output[i]) for output in outputs]

            start_logits, end_logits = output
            result = SquadResult(unique_id, start_logits, end_logits)

            all_results.append(result)

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))

    output_null_log_odds_file = None

    predictions = compute_predictions_logits(
        examples,
        features,
        all_results,
        args.n_best_size,
        args.max_answer_length,
        do_lower_case=False,
        output_prediction_file=output_prediction_file,
        output_nbest_file=output_nbest_file,
        output_null_log_odds_file=output_null_log_odds_file,
        verbose_logging=False,
        version_2_with_negative=False,
        null_score_diff_threshold=0.0,
    )

    # Compute the F1 and exact scores.
    results = squad_evaluate(examples, predictions)
    return results

In [None]:
# Evaluate
result = evaluate(args, model, tokenizer)
logger.info("Results: {}".format(result))

01/14/2020 23:57:28 - INFO - __main__ -   Loading features from cached file ./cached_dev_bert-base-cased_384
01/14/2020 23:57:31 - INFO - __main__ -   ***** Running evaluation  *****
01/14/2020 23:57:31 - INFO - __main__ -     Num examples = 10970
01/14/2020 23:57:31 - INFO - __main__ -     Batch size = 8


HBox(children=(IntProgress(value=0, description='Evaluating', max=1372, style=ProgressStyle(description_width=…

01/15/2020 00:00:14 - INFO - __main__ -     Evaluation done in total 162.810710 secs (0.014841 sec per example)
01/15/2020 00:00:14 - INFO - transformers.data.metrics.squad_metrics -   Writing predictions to: tmp/debug_squad/predictions_.json
01/15/2020 00:00:14 - INFO - transformers.data.metrics.squad_metrics -   Writing nbest to: tmp/debug_squad/nbest_predictions_.json





01/15/2020 00:00:55 - INFO - __main__ -   Results: {'exact': 80.26490066225166, 'f1': 87.99811557687403, 'total': 10570, 'HasAns_exact': 80.26490066225166, 'HasAns_f1': 87.99811557687403, 'HasAns_total': 10570, 'best_exact': 80.26490066225166, 'best_exact_thresh': 0.0, 'best_f1': 87.99811557687403, 'best_f1_thresh': 0.0}


In [None]:
#run model for single input pair of (question, document)
def predict(args,tokenizer,model,q,doc,device):
    indexed_tokens = tokenizer.encode(q,doc) #in the format (question...)[SEP](document...)[SEP]
    seg_idx = indexed_tokens.index(102)+1 #102 is index for [SEP]
    segment_ids = [0]*seg_idx+[1]*(len(indexed_tokens)-seg_idx) #0 for (question...), 1 for (document...)
    attention_mask = [1]*len(indexed_tokens) #mask out padding
    
    #padding
    indexed_tokens += [0]*(args.max_seq_length-len(indexed_tokens))
    segment_ids += [0]*(args.max_seq_length-len(segment_ids))
    attention_mask += [0]*(args.max_seq_length-len(attention_mask))
    
    # for debugging
    # ind2word = {v:k for k,v in tokenizer.vocab.items()}
    # [ind2word[ind] for ind in indexed_tokens]

    #convert to pytorch tensor
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segment_tensor = torch.tensor([segment_ids]).to(device)
    attention_tensor = torch.tensor([attention_mask]).to(device)

    # Predict the start and end positions logits
    with torch.no_grad():
        start_logits, end_logits = model(tokens_tensor, token_type_ids=segment_tensor, attention_mask=attention_tensor)

    # get the highest prediction
    answer = tokenizer.decode(indexed_tokens[torch.argmax(start_logits):torch.argmax(end_logits)+1])
    return answer

In [24]:
doc = "Jim Henson was a puppeteer"
q = "Who was Jim Henson?"
ans = predict(args,tokenizer,model,q,doc,device)
print(ans)

a puppeteer


In [25]:
doc = "Once upon a time there was an old mother pig who had three little pigs and not enough food to feed them. So when they were old enough, she sent them out into the world to seek their fortunes. The first little pig was very lazy. He didn't want to work at all and he built his house out of straw. The second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks. Then, they sang and danced and played together the rest of the day. The third little pig worked hard all day and built his house with bricks. It was a sturdy house complete with a fine fireplace and chimney. It looked like it could withstand the strongest winds. The next day, a wolf happened to pass by the lane where the three little pigs lived; and he saw the straw house, and he smelled the pig inside. He thought the pig would make a mighty fine meal and his mouth began to water."
q = 'What did the pigs do for the rest of the day?'
ans = predict(args,tokenizer,model,q,doc,device)
print(ans)

sang and danced and played together
