# expanding the provided SQuAD training script

In [1]:
import datasets
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser, pipeline
from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy
import os
import json

NUM_PREPROCESSING_WORKERS = 2

In [2]:
MODEL='google/electra-small-discriminator'
TASK='qa'
DATASET='squad'
MAX_LENGTH=128
TRAIN_MAX_SAMPLES=None

### compare new Checklist-based dataset with Squad

(you can make a basic Checklist dataset using the make_checklist_dataset notebook)

In [3]:
dataset = datasets.Dataset.from_file('./new_dataset/dataset.arrow')
dataset = dataset.shuffle()
dataset = dataset.train_test_split(test_size=0.1)

In [4]:
dataset['train'][:30]

OrderedDict([('id',
              ['56e7a89837bdd419002c42e0',
               '3fb01864d8757d4f',
               '5c519404bb373f50',
               '573416fcd058e614000b6917',
               '56e0a85e7aa994140058e69f',
               '571aa18a4faf5e1900b8ab68',
               '571df47eb64a571400c71e24',
               '5727af18ff5b5019007d9284',
               '570e69cb0dc6ce1900205046',
               '5735e8d3012e2f140011a0d5',
               '5711252ab654c5140001fbc1',
               '56f88690a6d7ea1400e17723',
               '571a7fbe4faf5e1900b8a9eb',
               '5726f906f1498d1400e8f175',
               '57282fdf3acd2414000df694',
               '56df6a8d5ca0a614008f99d9',
               'x31389474e271a6a1',
               '572f8332a23a5019007fc6bb',
               '5728fb8a6aef05140015493f',
               '56bf725c3aeaaa14008c9647',
               '5723eae00dadf01500fa1fa1',
               '5727fbf84b864d1900164158',
               '5730146f947a6a140053d07e',
              

In [5]:
sqd = datasets.load_dataset('squad')
for col in ['id', 'title', 'context', 'question', 'answers']:
    print()
    print(col)
    print(sqd['train'][col][0:2])
    print(dataset['train'][col][0:2])    

Reusing dataset squad (/home/sambeck/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)



id
['5733be284776f41900661182', '5733be284776f4190066117f']
['56e7a89837bdd419002c42e0', '3fb01864d8757d4f']

title
['University_of_Notre_Dame', 'University_of_Notre_Dame']
['Daylight_saving_time', 'who_is_more_x']

context
['Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'Architecturally, the school has a Catholic character. Atop the Main Build

### Model

In [6]:
model_class = AutoModelForQuestionAnswering
# Initialize the model and tokenizer from the specified pretrained model/checkpoint
model = model_class.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['qa_outputs.bias', 'qa_outputs.

In [7]:
# model pipeline (includes tokenization)
mp = pipeline("question-answering", tokenizer=tokenizer, model=model, device=0)

In [8]:
pred = mp(question="Which color is the dog?", context="There is a black dog.", truncation=True, )
print(pred)

{'score': 0.030282270163297653, 'start': 0, 'end': 5, 'answer': 'There'}


### Set up dataset for training

In [9]:
train_dataset_featurized = None
eval_dataset_featurized = None
train_dataset = dataset['train']
eval_dataset = dataset['test']

In [10]:
prepare_train_dataset = lambda exs: prepare_train_dataset_qa(exs, tokenizer)
prepare_eval_dataset = lambda exs: prepare_validation_dataset_qa(exs, tokenizer)

if TRAIN_MAX_SAMPLES:
    train_dataset = train_dataset.select(range(TRAIN_MAX_SAMPLES))

print('featurize train...')
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names
)

print('featurize test...')
eval_dataset_featurized = eval_dataset.map(
    prepare_eval_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=eval_dataset.column_names
)


featurize train...
featurize test...


In [1]:
trainer_class = Trainer
eval_kwargs = {}
# If you want to use custom metrics, you should define your own "compute_metrics" function.
# For an example of a valid compute_metrics function, see compute_accuracy in helpers.py.
compute_metrics = None
# For QA, we need to use a tweaked version of the Trainer (defined in helpers.py)
# to enable the question-answering specific evaluation metrics
trainer_class = QuestionAnsweringTrainer
eval_kwargs['eval_examples'] = eval_dataset
metric = datasets.load_metric('squad')
compute_metrics = lambda eval_preds: metric.compute(
    predictions=eval_preds.predictions, references=eval_preds.label_ids)

NameError: name 'Trainer' is not defined

In [12]:
# This function wraps the compute_metrics function, storing the model's predictions
# so that they can be dumped along with the computed metrics
eval_predictions = None
def compute_metrics_and_store_predictions(eval_preds):
    global eval_predictions
    eval_predictions = eval_preds
    return compute_metrics(eval_preds)

### Train

In [13]:
# Initialize the Trainer object with the specified arguments and the model and dataset we loaded above
trainer = trainer_class(
    model=model,
    train_dataset=train_dataset_featurized,
    eval_dataset=eval_dataset_featurized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_and_store_predictions
)

In [14]:
trainer.train()


***** Running training *****
  Num examples = 96522
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 36198


Step,Training Loss
500,3.0626
1000,2.1191
1500,1.6927
2000,1.535
2500,1.434
3000,1.3597
3500,1.3091
4000,1.3504
4500,1.3183
5000,1.2858


Saving model checkpoint to tmp_trainer/checkpoint-500
Configuration saved in tmp_trainer/checkpoint-500/config.json
Model weights saved in tmp_trainer/checkpoint-500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-1000
Configuration saved in tmp_trainer/checkpoint-1000/config.json
Model weights saved in tmp_trainer/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-1000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-1500
Configuration saved in tmp_trainer/checkpoint-1500/config.json
Model weights saved in tmp_trainer/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-1500/tokenizer_config.json
Special tokens file saved in tmp_traine

tokenizer config file saved in tmp_trainer/checkpoint-12000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-12000/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-12500
Configuration saved in tmp_trainer/checkpoint-12500/config.json
Model weights saved in tmp_trainer/checkpoint-12500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-12500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-12500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-13000
Configuration saved in tmp_trainer/checkpoint-13000/config.json
Model weights saved in tmp_trainer/checkpoint-13000/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-13000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-13000/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-13500
Configuration saved in tmp_trainer/checkpoint-13500/config.json
Model we

Special tokens file saved in tmp_trainer/checkpoint-23500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-24000
Configuration saved in tmp_trainer/checkpoint-24000/config.json
Model weights saved in tmp_trainer/checkpoint-24000/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-24000/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-24000/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-24500
Configuration saved in tmp_trainer/checkpoint-24500/config.json
Model weights saved in tmp_trainer/checkpoint-24500/pytorch_model.bin
tokenizer config file saved in tmp_trainer/checkpoint-24500/tokenizer_config.json
Special tokens file saved in tmp_trainer/checkpoint-24500/special_tokens_map.json
Saving model checkpoint to tmp_trainer/checkpoint-25000
Configuration saved in tmp_trainer/checkpoint-25000/config.json
Model weights saved in tmp_trainer/checkpoint-25000/pytorch_model.bin
tokenizer config fil

RuntimeError: [enforce fail at inline_container.cc:274] . unexpected pos 74166976 vs 74166864

In [16]:
pred = mp(question="Which color is the dog?", context="There is a black dog.", truncation=True, )
print(pred)

{'score': 0.9823248386383057, 'start': 11, 'end': 16, 'answer': 'black'}


In [17]:
pred = mp(question='Who is the most awesome?', context='William is awesome, but John is more awesome', truncation=True, )
print(pred)

{'score': 0.995335042476654, 'start': 24, 'end': 28, 'answer': 'John'}


In [18]:
pred = mp(question="Which thing is hot?", context="There is a cold gopher, a polar bear, and a hot snake.", truncation=True, )
print(pred)

{'score': 0.8586492538452148, 'start': 48, 'end': 53, 'answer': 'snake'}


In [25]:
pred = mp(question="Which thing is least hot?", context="There is a cold gopher, a polar bear, and a hot snake.", truncation=True, )
print(pred)

{'score': 0.4307215213775635, 'start': 11, 'end': 22, 'answer': 'cold gopher'}


### Save

In [27]:
trainer.save_model('./trained_on_expanded_data_model/')

Saving model checkpoint to ./trained_on_expanded_data_model/
Configuration saved in ./trained_on_expanded_data_model/config.json
Model weights saved in ./trained_on_expanded_data_model/pytorch_model.bin
tokenizer config file saved in ./trained_on_expanded_data_model/tokenizer_config.json
Special tokens file saved in ./trained_on_expanded_data_model/special_tokens_map.json
