In [None]:
# https://github.com/huggingface/transformers/blob/master/examples/run_squad.py

In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.8MB/s eta 0:00:01
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/74/f4/2d5214cbf13d06e7cb2c20d84115ca25b53ea76fa1f0ade0e3c9749de214/sentencepiece-0.1.85-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 9.1MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/a6/b4/7a41d630547a4afd58143597d5a49e07bfd4c42914d8335b2a5657efc14b/sacremoses-0.0.38.tar.gz (860kB)
[K     |████████████████████████████████| 870kB 18.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.38-cp36-none-any.whl size=884629 sha256=5813bc

In [2]:
!wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

--2020-01-20 21:35:50--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.111.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 30288272 (29M) [application/json]
Saving to: ‘train-v1.1.json’


2020-01-20 21:35:50 (137 MB/s) - ‘train-v1.1.json’ saved [30288272/30288272]

--2020-01-20 21:35:50--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
Reusing existing connection to rajpurkar.github.io:443.
HTTP request sent, awaiting response... 200 OK
Length: 4854279 (4.6M) [application/json]
Saving to: ‘dev-v1.1.json’


2020-01-20 21:35:50 (259 MB/s) - ‘dev-v1.1.json’ saved [4854279/4854279]

FINISHED --2020-01-20 21:35:50--
Total wall clock time: 0.4s
Downloaded: 2 files, 34M in 0.2s (147 MB/s)


In [None]:
import os
import random
import timeit

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tqdm_notebook

from transformers import (
    AdamW,
    BertConfig,
    BertForQuestionAnswering,
    BertTokenizer,
    get_linear_schedule_with_warmup,
    squad_convert_examples_to_features,
)

from transformers.data.metrics.squad_metrics import (
    compute_predictions_logits,
    squad_evaluate,
)
from transformers.data.processors.squad import SquadResult, SquadV1Processor

In [None]:
# Set seed
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
# Setup CUDA, GPU & distributed training
device = torch.device("cuda")

In [8]:
config = BertConfig.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased',do_lower_case=True,)
#the nn.module BertForQuestionAnswering has a single untrained layer qa_output: Linear(hidden_size,2) on top of the trained BERT-base.
model = BertForQuestionAnswering.from_pretrained('bert-base-cased',config=config,)

model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
max_seq_length=384
def load_and_cache_examples(tokenizer, is_training=True):
    # Load data features from cache or dataset file
    cached_features_file = "cached_{}".format("train" if is_training else "dev")

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        print("Loading features from cached file ", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        print("Creating features from dataset file")
        
        if is_training:
            examples = SquadV1Processor().get_train_examples('')
        else:
            examples = SquadV1Processor().get_dev_examples('')

        features, dataset = squad_convert_examples_to_features(
            examples,tokenizer,max_seq_length,
            doc_stride=128,
            max_query_length=64,
            is_training=is_training,
            return_dataset="pt")

        print("Saving features into cached file", cached_features_file)
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    return dataset, examples, features

In [14]:
train_dataset = load_and_cache_examples(tokenizer, is_training=True)[0]

Loading features from cached file %s cached_train


In [None]:
# Training
""" Train the model """
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=12)
epochs = 2
t_total = len(train_dataloader) * epochs

# Prepare optimizer and schedule (linear warmup and decay)
optimizer = AdamW(model.parameters(), lr=3e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=t_total)

# Train!
print("***** Running training *****")
print("  Num examples = ", len(train_dataset))
print("  Total optimization steps = ", t_total)

global_step = 1
tr_loss = 0.0
model.zero_grad()

for epoch in range(epochs):
    print('Epoch:{}'.format(epoch+1))
    epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration", disable=False)
    for step, batch in enumerate(epoch_iterator):

        model.train()
        batch = tuple(t.to(device) for t in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "start_positions": batch[3],
            "end_positions": batch[4],
        }

        outputs = model(**inputs)
        # model outputs are always tuple in transformers (see doc)
        loss = outputs[0]

        loss.backward()

        tr_loss += loss.item()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        global_step += 1
        
        # Log metrics
        if global_step % 50 == 0:
            # Only evaluate when single GPU otherwise metrics may not average well
            print('Global step = {}, logging_loss = {}'.format(global_step,tr_loss))

print(" global_step = %s, average loss = %s", global_step, tr_loss / global_step)

01/14/2020 21:17:37 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f2b799ab518>
01/14/2020 21:17:37 - INFO - __main__ -   Loading features from cached file ./cached_train_bert-base-cased_384
01/14/2020 21:18:03 - INFO - __main__ -   ***** Running training *****
01/14/2020 21:18:03 - INFO - __main__ -     Num examples = 89632
01/14/2020 21:18:03 - INFO - __main__ -     Num Epochs = 2
01/14/2020 21:18:03 - INFO - __main__ -     Instantaneous batch size per GPU = 12
01/14/2020 21:18:03 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 12
01/14/2020 21:18:03 - INFO - __main__ -     Gradient Accumulation steps = 1
01/14/2020 21:18:03 - INFO - __main__ -     Total optimization steps = 14940


Epoch:   0%|          | 0/2 [00:00<?, ?it/s][A[A

HBox(children=(IntProgress(value=0, description='Iteration', max=7470, style=ProgressStyle(description_width='…

01/14/2020 21:26:38 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-1000/config.json
01/14/2020 21:26:39 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-1000/pytorch_model.bin
01/14/2020 21:26:39 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-1000
01/14/2020 21:26:43 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-1000
01/14/2020 21:35:20 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-2000/config.json
01/14/2020 21:35:21 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-2000/pytorch_model.bin
01/14/2020 21:35:21 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-2000
01/14/2020 21:35:24 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-2000
01/14/2020 21:44:00 - INFO - trans




HBox(children=(IntProgress(value=0, description='Iteration', max=7470, style=ProgressStyle(description_width='…

01/14/2020 22:27:27 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-8000/config.json
01/14/2020 22:27:28 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-8000/pytorch_model.bin
01/14/2020 22:27:28 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-8000
01/14/2020 22:27:32 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-8000
01/14/2020 22:36:08 - INFO - transformers.configuration_utils -   Configuration saved in tmp/debug_squad/checkpoint-9000/config.json
01/14/2020 22:36:09 - INFO - transformers.modeling_utils -   Model weights saved in tmp/debug_squad/checkpoint-9000/pytorch_model.bin
01/14/2020 22:36:09 - INFO - __main__ -   Saving model checkpoint to tmp/debug_squad/checkpoint-9000
01/14/2020 22:36:12 - INFO - __main__ -   Saving optimizer and scheduler states to tmp/debug_squad/checkpoint-9000
01/14/2020 22:44:49 - INFO - trans




In [18]:
# Save the trained model and the tokenizer
output_dir = 'output/'

# Create output directory if needed
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

print("Saving model checkpoint to %s", output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

Saving model checkpoint to %s output/


('output/vocab.txt',
 'output/special_tokens_map.json',
 'output/added_tokens.json')

In [19]:
# Load a trained model and vocabulary that you have fine-tuned
model = BertForQuestionAnswering.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=True)
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [22]:
# Evaluate
dataset, examples, features = load_and_cache_examples(tokenizer, is_training=False)
eval_sampler = SequentialSampler(dataset)
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=8)

# Eval!
print("***** Running evaluation *****")
print("  Num examples = ", len(dataset))

all_results = []
start_time = timeit.default_timer()

for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
    model.eval()
    batch = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
        }

        example_indices = batch[3]

        outputs = model(**inputs)

    for i, example_index in enumerate(example_indices):
        eval_feature = features[example_index.item()]
        unique_id = int(eval_feature.unique_id)
        output = [output[i].detach().cpu().tolist() for output in outputs]
        start_logits, end_logits = output
        result = SquadResult(unique_id, start_logits, end_logits)
        all_results.append(result)

evalTime = timeit.default_timer() - start_time
print("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))

predictions = compute_predictions_logits(
    examples,
    features,
    all_results,
    n_best_size = 20,
    max_answer_length = 30,
    do_lower_case=False,
    output_prediction_file="predictions.json",
    output_nbest_file="nbest_predictions.json",
    output_null_log_odds_file=None,
    verbose_logging=False,
    version_2_with_negative=False,
    null_score_diff_threshold=0.0,
)

# Compute the F1 and exact scores.
results = squad_evaluate(examples, predictions)

print("Results: {}".format(results))

Loading features from cached file %s cached_dev
***** Running evaluation *****
  Num examples =  10970


HBox(children=(IntProgress(value=0, description='Evaluating', max=1372, style=ProgressStyle(description_width=…

  Evaluation done in total %f secs (%f sec per example) 325.7774384559998 0.029697122922151303
Results: OrderedDict([('exact', 79.75402081362347), ('f1', 87.70043473208098), ('total', 10570), ('HasAns_exact', 79.75402081362347), ('HasAns_f1', 87.70043473208098), ('HasAns_total', 10570), ('best_exact', 79.75402081362347), ('best_exact_thresh', 0.0), ('best_f1', 87.70043473208098), ('best_f1_thresh', 0.0)])


In [None]:
def predict(q,doc):
    indexed_tokens = tokenizer.encode(q,doc)
    seg_idx = indexed_tokens.index(102)+1
    attention_mask = [1]*len(indexed_tokens)
    segment_ids = [0]*seg_idx+[1]*(len(indexed_tokens)-seg_idx)
    indexed_tokens += [0]*(max_seq_length-len(indexed_tokens))
    attention_mask += [0]*(max_seq_length-len(attention_mask))
    segment_ids += [0]*(max_seq_length-len(segment_ids))
    
    # for debugging
    # ind2word = {v:k for k,v in tokenizer.vocab.items()}
    # [ind2word[ind] for ind in indexed_tokens]

    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    segment_tensor = torch.tensor([segment_ids]).to(device)
    attention_tensor = torch.tensor([attention_mask]).to(device)

    # Predict the start and end positions logits
    with torch.no_grad():
        start_logits, end_logits = model(tokens_tensor, token_type_ids=segment_tensor, attention_mask=attention_tensor)

    # get the highest prediction
    answer = tokenizer.decode(indexed_tokens[torch.argmax(start_logits):torch.argmax(end_logits)+1])
    return answer

In [24]:
doc = "Jim Henson was a puppeteer"
q = "Who was Jim Henson?"
ans = predict(q,doc)
print(ans)

a puppeteer


In [25]:
doc = "Once upon a time there was an old mother pig who had three little pigs and not enough food to feed them. So when they were old enough, she sent them out into the world to seek their fortunes. The first little pig was very lazy. He didn't want to work at all and he built his house out of straw. The second little pig worked a little bit harder but he was somewhat lazy too and he built his house out of sticks. Then, they sang and danced and played together the rest of the day. The third little pig worked hard all day and built his house with bricks. It was a sturdy house complete with a fine fireplace and chimney. It looked like it could withstand the strongest winds. The next day, a wolf happened to pass by the lane where the three little pigs lived; and he saw the straw house, and he smelled the pig inside. He thought the pig would make a mighty fine meal and his mouth began to water."
q = 'What did the pigs do for the rest of the day?'
ans = predict(q,doc)
print(ans)

sang and danced and played together


In [None]:
test