# Expanding the provided SQuAD training script

In [1]:
import json
import os

import datasets
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, \
    AutoModelForQuestionAnswering, Trainer, TrainingArguments, HfArgumentParser, pipeline

from helpers import prepare_dataset_nli, prepare_train_dataset_qa, \
    prepare_validation_dataset_qa, QuestionAnsweringTrainer, compute_accuracy

# Number of worker processes handed to `Dataset.map(num_proc=...)` when the
# train and validation splits are featurized later in the notebook.
NUM_PREPROCESSING_WORKERS = 2

In [2]:
# --- Run configuration: everything a reader might want to tune lives here ---

# Hugging Face checkpoint used for both the model and the tokenizer.
MODEL = 'google/electra-small-discriminator'

# NOTE(review): TASK and MAX_LENGTH are not referenced by any later cell visible
# in this notebook -- confirm whether they should be wired into the helpers.
TASK = 'qa'
DATASET = 'squad'
MAX_LENGTH = 128

# Cap on the number of training examples (handy for quick iterations);
# set to None to train on the full split.
TRAIN_MAX_SAMPLES = 1000

### Dataset

In [3]:
# Name of the split used for evaluation.
eval_split = 'validation'
# Load the dataset named by the DATASET config constant rather than a second
# hard-coded literal, so changing the config cell actually takes effect.
dataset = datasets.load_dataset(DATASET)

Reusing dataset squad (/home/sambeck/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


In [4]:
# Peek at the first training example to see the record layout
# (id / title / context / question / answers).
dataset['train'][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [5]:
# Check that `answer_start` is a character offset into `context`: slicing the
# context at that offset should begin with the annotated answer text.
# The offset is read from the example itself instead of being copied by hand.
first_example = dataset['train'][0]
first_example['context'][first_example['answers']['answer_start'][0]:]

'Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

### Model

In [6]:
# Question answering uses the span-prediction auto class.
model_class = AutoModelForQuestionAnswering

# Model weights and the (fast) tokenizer are both pulled from the same
# pretrained checkpoint named in the config cell.
model = model_class.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['qa_outputs.bias', 'qa_outputs.

In [7]:
# Model pipeline (includes tokenization).
# Use the first CUDA device when one is available and fall back to CPU
# (device=-1) otherwise, so the notebook also runs on machines without a GPU.
mp = pipeline(
    "question-answering",
    tokenizer=tokenizer,
    model=model,
    device=0 if torch.cuda.is_available() else -1,
)

In [8]:
# Sanity check before fine-tuning: ask a trivial question through the pipeline.
pred = mp(
    question="Which color is the dog?",
    context="There is a black dog.",
    truncation=True,
)
print(pred)

{'score': 0.031878624111413956, 'start': 20, 'end': 21, 'answer': '.'}


### Set up dataset for training

In [9]:
def prepare_train_dataset(exs):
    """Featurize a batch of training examples with the notebook's tokenizer."""
    return prepare_train_dataset_qa(exs, tokenizer)


def prepare_eval_dataset(exs):
    """Featurize a batch of validation examples with the notebook's tokenizer."""
    return prepare_validation_dataset_qa(exs, tokenizer)


# --- Training split ---
train_dataset = dataset['train']

if TRAIN_MAX_SAMPLES:
    # Clamp to the split size so an oversized cap does not ask `select`
    # for indices that are out of range.
    train_dataset = train_dataset.select(
        range(min(TRAIN_MAX_SAMPLES, len(train_dataset))))

# The raw columns are dropped so only the featurized columns remain.
train_dataset_featurized = train_dataset.map(
    prepare_train_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=train_dataset.column_names
)

# --- Evaluation split ---
# The un-featurized `eval_dataset` is kept around: it is later stored under
# `eval_kwargs['eval_examples']`.
eval_dataset = dataset[eval_split]
eval_dataset_featurized = eval_dataset.map(
    prepare_eval_dataset,
    batched=True,
    num_proc=NUM_PREPROCESSING_WORKERS,
    remove_columns=eval_dataset.column_names
)


Loading cached processed dataset at /home/sambeck/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-c34bc99166e32dad.arrow
Loading cached processed dataset at /home/sambeck/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-49b59acbfe130203.arrow


In [10]:
# For QA, we need to use a tweaked version of the Trainer (defined in helpers.py)
# to enable the question-answering specific evaluation metrics.
trainer_class = QuestionAnsweringTrainer

# Extra keyword arguments for evaluation: the raw (un-featurized) eval examples.
eval_kwargs = {'eval_examples': eval_dataset}

# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package -- confirm against the
# installed version before upgrading.
metric = datasets.load_metric('squad')


def compute_metrics(eval_preds):
    """Score predictions with the SQuAD metric.

    `eval_preds` must expose `.predictions` and `.label_ids`; they are passed
    to `metric.compute` as `predictions` and `references` respectively.
    Returns whatever `metric.compute` returns.
    """
    return metric.compute(
        predictions=eval_preds.predictions, references=eval_preds.label_ids)

In [11]:
# Holds the most recent predictions passed to the wrapper below
# (stays None until the wrapper is first called).
eval_predictions = None


def compute_metrics_and_store_predictions(eval_preds):
    """Stash `eval_preds` in the module-level `eval_predictions`, then score them.

    Keeping a reference to the predictions lets them be dumped alongside the
    computed metrics. The predictions are stored *before* scoring, so they stay
    available even if `compute_metrics` raises.

    Returns the result of `compute_metrics(eval_preds)` unchanged.
    """
    global eval_predictions
    eval_predictions = eval_preds
    scores = compute_metrics(eval_preds)
    return scores

### Train

In [12]:
# Build the trainer from the model, tokenizer and featurized datasets prepared
# above. No TrainingArguments are supplied, so the trainer's defaults apply.
trainer_init_kwargs = dict(
    model=model,
    train_dataset=train_dataset_featurized,
    eval_dataset=eval_dataset_featurized,
    tokenizer=tokenizer,
    # Wrapper that keeps a copy of the predictions besides computing metrics
    compute_metrics=compute_metrics_and_store_predictions,
)
trainer = trainer_class(**trainer_init_kwargs)

In [13]:
# Fine-tune the model. As the last expression in the cell, the returned
# TrainOutput (global step, training loss, runtime metrics) is displayed.
trainer.train()


***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=375, training_loss=3.32203125, metrics={'train_runtime': 109.7682, 'train_samples_per_second': 27.33, 'train_steps_per_second': 3.416, 'total_flos': 87652583424000.0, 'train_loss': 3.32203125, 'epoch': 3.0})

In [14]:
# Repeat the sanity-check question after fine-tuning. `mp` wraps the same
# `model` object that was handed to the trainer.
pred = mp(
    question="Which color is the dog?",
    context="There is a black dog.",
    truncation=True,
)
print(pred)

{'score': 0.3775675594806671, 'start': 11, 'end': 16, 'answer': 'black'}


### Save

In [15]:
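# Saving is disabled by default; uncomment the line below to persist the
# fine-tuned model via the trainer.
# trainer.save_model()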