### Load a trained model and evaluate it on the evaluation dataset

In [6]:
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import TrainingArguments
from transformers import Trainer
from transformers import pipeline

from tqdm.auto import tqdm

# to compute metrics in evaluation phase
import numpy as np
import collections

In [2]:
# The squad_it dataset (Italian), obtained by semi-automatic translation of SQuAD, is available on the HF Hub
raw_datasets = load_dataset("squad_it")

# Load the metrics (EM, F1) commonly used for SQuAD
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of the
# separate `evaluate` library — confirm against the installed version before upgrading.
metric = load_metric("squad")

Using custom data configuration default
Reusing dataset squad_it (/home/datascience/.cache/huggingface/datasets/squad_it/default/0.1.0/d442bdb4794b4bae227ab19105b76d706ed7cf2ac342e4c9da4a5c36bde19d71)


In [3]:
# Base checkpoint, pre-trained on Italian, that the fine-tuned model started from.
# Only the tokenizer is loaded from it in this notebook (the tokenizer is not trained).
BASE_MODEL_CHECKPOINT = "dbmdz/bert-base-italian-xxl-cased"

# Passed to the tokenizer as `max_length` (size of each question+context feature, in tokens)
MAX_LENGTH = 384
# Passed to the tokenizer as `stride` (used when a long context overflows into several features)
STRIDE = 128

In [4]:
# Tokenizer of the base checkpoint; used by preprocess_validation_examples and handed to the Trainer
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_CHECKPOINT)

In [7]:
#
# NOTE: this preprocessing code is duplicated here (presumably from the training notebook);
# keep the copies in sync
#
def preprocess_validation_examples(examples):
    """Tokenize a batch of question/context pairs into evaluation features.

    Questions are stripped of surrounding whitespace and encoded together with
    their contexts. Only the context (second sequence) may be truncated; the
    overflowing part is emitted as additional features, so one example can
    produce several features.

    Args:
        examples: Batch providing the "question", "context" and "id" columns.

    Returns:
        The tokenizer output, with:
        - "offset_mapping" entries set to None for every token that does not
          belong to the context (sequence id different from 1);
        - an extra "example_id" entry giving, for each feature, the id of the
          example it was produced from.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        truncation="only_second",
        max_length=MAX_LENGTH,
        stride=STRIDE,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )

    # Feature position -> index of the example (within this batch) it came from.
    feature_to_sample = encoded.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]

    # Blank out the offsets of non-context tokens so that post-processing can
    # recognise (and skip) answer spans that fall outside the context.
    all_offsets = encoded["offset_mapping"]
    for feature_pos, token_offsets in enumerate(all_offsets):
        seq_ids = encoded.sequence_ids(feature_pos)
        all_offsets[feature_pos] = [
            span if seq_id == 1 else None
            for span, seq_id in zip(token_offsets, seq_ids)
        ]

    encoded["example_id"] = [source_ids[sample_pos] for sample_pos in feature_to_sample]
    return encoded

#
# compute EM and F1 during the evaluation phase
#

def compute_metrics(
    start_logits, end_logits, features, examples, n_best=20, max_answer_length=30
):
    """Turn start/end logits into text answers and score them with `metric`.

    For each example, every feature derived from it is scanned: the `n_best`
    highest start logits and the `n_best` highest end logits of the feature are
    combined into candidate spans. The candidate with the highest summed logit
    (across all features of the example) becomes the prediction; an example
    with no valid candidate gets an empty-string prediction.

    Args:
        start_logits: Per-feature start scores, indexed by feature position.
        end_logits: Per-feature end scores, same layout as `start_logits`.
        features: Sequence of tokenized features; each item provides
            "example_id" and "offset_mapping" (None entries mark tokens that
            are not part of the context).
        examples: Raw examples providing "id", "context" and "answers".
            Iterated twice, so it must be re-iterable.
        n_best: Number of top start and top end positions considered per
            feature. Only affects the EM/F1 computation.
        max_answer_length: Maximum length, in tokens, of a candidate answer.
            Only affects the EM/F1 computation.

    Returns:
        The result of `metric.compute(...)` for the module-level `metric`
        (the caller in this notebook reads its "exact_match" and "f1" keys).

    Raises:
        ValueError: If `n_best` or `max_answer_length` is smaller than 1.
    """
    if n_best < 1:
        raise ValueError(f"n_best must be >= 1, got {n_best}")
    if max_answer_length < 1:
        raise ValueError(f"max_answer_length must be >= 1, got {max_answer_length}")

    # Group feature positions by the example they were generated from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example.
        # `.get` avoids inserting empty lists for examples without features.
        for feature_index in example_to_features.get(example_id, ()):
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Positions of the n_best highest logits, best first.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # Offsets are character positions in the context, so the
                    # answer text is sliced directly from it.
                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [15]:
#
# evaluate the model on the evaluation dataset
#
def evaluate_model(trainer, val_dataset, raw_val_dataset):
    """Run inference on the validation features and print EM and F1.

    Args:
        trainer: Object whose `predict(val_dataset)` returns a 3-item result,
            the first item being the (start_logits, end_logits) pair.
        val_dataset: Tokenized validation features; also passed to
            `compute_metrics` as its `features` argument.
        raw_val_dataset: Raw validation examples, passed to `compute_metrics`
            as its `examples` argument.

    Returns:
        None. The scores are only printed, rounded to two decimals.
    """
    # Inference step: only the logits are needed, the other two items are ignored.
    logits_pair, _unused_labels, _unused_stats = trainer.predict(val_dataset)
    start_scores, end_scores = logits_pair

    eval_metrics = compute_metrics(
        start_logits=start_scores,
        end_logits=end_scores,
        features=val_dataset,
        examples=raw_val_dataset,
    )

    # A leading empty line separates the report from the preceding output.
    report_lines = (
        "",
        "Prestazioni su Evaluation set:",
        f"EM: {round(eval_metrics['exact_match'], 2)}, F1-score: {round(eval_metrics['f1'], 2)}",
    )
    for line in report_lines:
        print(line)

### Preprocess validation (test) dataset

In [16]:
# Tokenize the raw "test" split in batches. All original columns are removed, so the
# resulting dataset only contains what preprocess_validation_examples returns.
validation_dataset = raw_datasets["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["test"].column_names,
)

# The two counts can differ: overflowing contexts are split into several features,
# so the preprocessed set may have more rows than the raw split.
print(f'# rec. in raw dataset {len(raw_datasets["test"])}, in final validation set: {len(validation_dataset)}')

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))


# rec. in raw dataset 7609, in final validation set: 7853


In [17]:
# Display the preprocessed dataset summary (feature names and number of rows)
validation_dataset

Dataset({
    features: ['attention_mask', 'example_id', 'input_ids', 'offset_mapping', 'token_type_ids'],
    num_rows: 7853
})

### Load the model and create a trainer

In [18]:
# Local directory containing the fine-tuned checkpoint (config and weights) to evaluate
MODEL_DIR = "best_model4"

# Load the question-answering model from that directory
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)

loading configuration file best_model4/config.json
Model config BertConfig {
  "_name_or_path": "best_model4",
  "architectures": [
    "BertForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.20.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 32102
}

loading weights file best_model4/pytorch_model.bin
All model checkpoint weights were used when initializing BertForQuestionAnswering.

All the weights of BertForQuestionAnswering were initialized from the model checkpoint at best_model4.
If your task is similar to the task the model of 

In [19]:
# NOTE(review): same value as MODEL_DIR, so the Trainer output directory is the directory
# the model was loaded from — confirm this is intended.
FINAL_MODEL_NAME = "best_model4"

# let's try with 2 or 3 epochs
# NOTE(review): this notebook only runs prediction, so the training-only settings below
# (epochs, learning rate, weight decay, save strategy) are presumably unused here — confirm.
EPOCHS = 2

args = TrainingArguments(
    # the name we give to the custom model
    FINAL_MODEL_NAME,
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=False,
    report_to="none",
    seed=1234
)

PyTorch: setting up devices


In [22]:
# We don't pass a train_dataset because we don't train, we only validate.
# We're using the Trainer as a simple way to do inference.
trainer = Trainer(
    model=model,
    args=args,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

Using amp half precision backend


In [23]:
# Predict on the preprocessed features and print EM / F1 computed against the raw "test" split
evaluate_model(trainer, validation_dataset, raw_datasets["test"])

The following columns in the test set don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping. If example_id, offset_mapping are not expected by `BertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 7853
  Batch size = 8


HBox(children=(FloatProgress(value=0.0, max=7609.0), HTML(value='')))



Prestazioni su Evaluation set:
EM: 63.37, F1-score: 75.29
