### load lib

In [1]:
from transformers import DefaultDataCollator, DataCollatorWithPadding
from transformers import create_optimizer
from transformers.keras_callbacks import PushToHubCallback
import tensorflow as tf
from transformers import TFAutoModelForQuestionAnswering
import numpy as np
import collections
from datasets import load_metric, load_dataset
from transformers import AutoTokenizer
from transformers import pipeline

### load dataset

In [2]:
raw_datasets = load_dataset("squad_v1_pt")


Using custom data configuration default
Reusing dataset squad_v1_pt (/home/studio-lab-user/.cache/huggingface/datasets/squad_v1_pt/default/1.1.0/65162e0fbe44f19a4d2ad9f5f507d2e965e74249fc3239dc78b4e3bd93bab7c4)


  0%|          | 0/2 [00:00<?, ?it/s]

### sep train test

In [3]:
train = raw_datasets['train'].select(range(5000))
validation = raw_datasets['validation'].select(range(1000))

### load tokenizer / Load model finetuning

In [4]:
#model_checkpoint = 'distilbert-base-uncased'
model_checkpoint = 'neuralmind/bert-base-portuguese-cased'

#model_checkpoint = "pierreguillou/bert-base-cased-squad-v1.1-portuguese"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

### prepare train 

In [5]:
max_length = 384
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
train_dataset = train.map(
    preprocess_training_examples,
    batched=True,
    remove_columns=train.column_names,
)
len(train), len(train_dataset)



  0%|          | 0/5 [00:00<?, ?ba/s]

(5000, 5252)

### prepare test

In [7]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [8]:
validation_dataset = validation.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation.column_names,
)
len(validation), len(validation_dataset)

  0%|          | 0/1 [00:00<?, ?ba/s]

(1000, 1025)

### data collator

In [9]:
#data collator
data_collator = DefaultDataCollator(return_tensors="tf")
#data_collator = DataCollatorWithPadding(return_tensors="tf")

### transform train e test for tensorflow

In [10]:
tf_train_dataset = train_dataset.to_tf_dataset(
    columns=[
        "input_ids",
        "start_positions",
        "end_positions",
        "attention_mask",
        "token_type_ids",
    ],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

In [11]:
tf_eval_dataset = validation_dataset.to_tf_dataset(
    columns=["input_ids", "attention_mask", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

### Create model

In [12]:
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

Some layers of TFBertForQuestionAnswering were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['qa_outputs']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Login for save model 

In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Train model

In [13]:
num_train_epochs = 3
num_train_steps = len(tf_train_dataset) * num_train_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
#callback = PushToHubCallback(output_dir="bert-squad", tokenizer=tokenizer)
model.compile(optimizer=optimizer)


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [14]:
model.fit(tf_train_dataset, validation_data=tf_eval_dataset  , epochs=num_train_epochs)#, callbacks=[callback])

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fd8ac055520>

In [15]:
#1000 / 100
#loss 2.7530 distilbert-base-uncased pt  
#loss 2.2220 neuralmind/bert-base-portuguese-cased
#loss 0.9221 pierreguillou/bert-base-cased-squad-v1.1-portuguese pt


#5000 / 1000
#loss 0.7623 pierreguillou/bert-base-cased-squad-v1.1-portuguese pt
##loss 2.1048 distilbert-base-uncased pt
#loss 1.2895 neuralmind/bert-base-portuguese-cased




### Metric

In [16]:
small_eval_set = validation.select(range(100))
trained_checkpoint = "distilbert-base-cased-distilled-squad"
#trained_checkpoint = 'pierreguillou/bert-base-cased-squad-v1.1-portuguese'

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
eval_set = small_eval_set.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=validation.column_names,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
eval_set_for_model = eval_set.remove_columns(["example_id", "offset_mapping"])
eval_set_for_model.set_format("numpy")

batch = {k: eval_set_for_model[k] for k in eval_set_for_model.column_names}
trained_model = TFAutoModelForQuestionAnswering.from_pretrained(trained_checkpoint)

outputs = trained_model(**batch)

Some layers from the model checkpoint at distilbert-base-cased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased-distilled-squad and are newly initialized: ['dropout_39']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
n_best = 20
max_answer_length = 30
metric = load_metric("squad")

In [19]:
from tqdm.auto import tqdm

def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [20]:
predictions = model.predict(tf_eval_dataset)
compute_metrics(
    predictions["start_logits"],
    predictions["end_logits"],
    validation_dataset,
    raw_datasets["validation"],
)



  0%|          | 0/10570 [00:00<?, ?it/s]

{'exact_match': 1.542100283822138, 'f1': 2.2966459313435545}

In [38]:
#1000/100
#f1 0.1683 neuralmind/bert-base-portuguese-cased
#f1 0.0856 distilbert-base-uncased
#f1 0.5701 pierreguillou/bert-base-cased-squad-v1.1-portuguese

#5000/1000
#f1 5.6934 pierreguillou/bert-base-cased-squad-v1.1-portuguese
#f1 2.2966 distilbert-base-uncased

'pierreguillou/bert-base-cased-squad-v1.1-portuguese'

### Previsão / Score

In [17]:
model_save = 'bert-squad'
question_answerer = pipeline("question-answering", model=model_save)

All model checkpoint layers were used when initializing TFBertForQuestionAnswering.

All the layers of TFBertForQuestionAnswering were initialized from the model checkpoint at bert-squad.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForQuestionAnswering for predictions without further training.


In [18]:
contexto = context = r"""
A pandemia de COVID-19, também conhecida como pandemia de coronavírus, é uma pandemia em curso de COVID-19, 
uma doença respiratória aguda causada pelo coronavírus da síndrome respiratória aguda grave 2 (SARS-CoV-2). 
A doença foi identificada pela primeira vez em Wuhan, na província de Hubei, República Popular da China, 
em 1 de dezembro de 2019, mas o primeiro caso foi reportado em 31 de dezembro do mesmo ano. 
Acredita-se que o vírus tenha uma origem zoonótica, porque os primeiros casos confirmados 
tinham principalmente ligações ao Mercado Atacadista de Frutos do Mar de Huanan, que também vendia animais vivos. 
Em 11 de março de 2020, a Organização Mundial da Saúde declarou o surto uma pandemia. Até 8 de fevereiro de 2021, 
pelo menos 105 743 102 casos da doença foram confirmados em pelo menos 191 países e territórios, 
com cerca de 2 308 943 mortes e 58 851 440 pessoas curadas.
"""

In [19]:
question = "Quando começou a pandemia de Covid-19 no mundo?"

In [62]:
result = question_answerer(question=question, context=contexto)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")


Answer: 'A pandemia', score: 0.031, start: 1, end: 11


In [None]:
#'score': 0.0014 distilbert-base-uncased

In [63]:
question = "Qual é a data de início da pandemia Covid-19 em todo o mundo?"

In [64]:
result = question_answerer(question=question, context=contexto)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'coronavírus', score: 0.0254, start: 59, end: 70


In [65]:
question = "O qué a pandemia de Covid-19?"

In [66]:
result = question_answerer(question=question, context=contexto)
print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")

Answer: 'coronavírus', score: 0.0205, start: 59, end: 70
