In [None]:
! pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Question answering

## Load SQuAD dataset

In [None]:
from datasets import load_dataset

# Load only the first 5,000 training examples to keep the run short.
# For the full dataset, call load_dataset("squad") without `split`; that
# returns every split at once, so the later splitting cell would not apply.
squad = load_dataset(path="squad", split="train[:5000]")

In [None]:
# Display the dataset summary (column names and number of rows).
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [None]:
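# Only relevant when the full dataset is loaded (it ships a 'validation' split
# rather than a 'test' split): rename it so later cells can use squad['test'].
# squad['test'] = squad.pop('validation')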

Split the dataset's `train` split into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [None]:
# Hold out 20% of the examples for evaluation. A fixed seed makes the split,
# and therefore the reported losses, reproducible across kernel restarts.
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training example to see the available fields.
squad["train"][0]

{'id': '56d3aa352ccc5a1400d82de5',
 'title': 'Frédéric_Chopin',
 'context': "Chopin's music remains very popular and is regularly performed, recorded and broadcast worldwide. The world's oldest monographic music competition, the International Chopin Piano Competition, founded in 1927, is held every five years in Warsaw. The Fryderyk Chopin Institute of Poland lists on its website over eighty societies world-wide devoted to the composer and his music. The Institute site also lists nearly 1,500 performances of Chopin works on YouTube as of January 2014.",
 'question': "How many known works of Chopin's music were on YouTube up to the beginning of 2014?",
 'answers': {'text': ['nearly 1,500'], 'answer_start': [408]}}

There are several important fields here:

- `answers`: the starting character position of the answer within the `context`, and the answer text.
- `context`: background information from which the model needs to extract the answer.
- `question`: the question a model should answer.

## Preprocess

In [None]:
"""
AutoTokenizer.from_pretrained('distilbert-base-uncased'): This line of code is using the from_pretrained()
method of the AutoTokenizer class to load a pre-trained tokenizer. The argument 'distilbert-base-uncased'
is the identifier of the pre-trained tokenizer that we want to load. In this case, it’s the tokenizer for
the ‘distilbert-base-uncased’ model.
"""

from transformers import AutoTokenizer

# Tokenizer belonging to the DistilBERT checkpoint that is fine-tuned later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

There are a few preprocessing steps particular to question answering tasks you should be aware of:

1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`.
2. Next, map the start and end positions of the answer to the original `context` by setting
   `return_offsets_mapping=True`.
3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [sequence_ids](https://huggingface.co/docs/tokenizers/main/en/api/encoding#tokenizers.Encoding.sequence_ids) method to
   find which part of the offset corresponds to the `question` and which corresponds to the `context`.

Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`:

In [None]:
"""
start_char = answer['answer_start'][0]: This line of code is finding the start character position of the answer in the context. The answer_start field in the answer dictionary contains the character position where the answer starts in the context. The [0] is accessing the first element of the list, which is the start character position.
end_char = start_char + len(answer['text'][0]): This line of code is finding the end character position of the answer in the context. It does this by adding the length of the answer text to the start character position. The text field in the answer dictionary contains the actual answer text. The len(answer['text'][0]) is calculating the length of the answer text, and adding this to start_char gives the end character position.
"""

def preprocess_function(examples):
    """Tokenize question/context pairs and label them with answer token positions.

    Each question (stripped of surrounding whitespace) is paired with its
    context, truncated on the context side only and padded to 384 tokens.
    The character span of the first listed answer is then converted into
    start/end token indices.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" columns. Each "answers" entry is a dict holding parallel
            "text" and "answer_start" lists.

    Returns:
        The tokenizer output with "offset_mapping" removed and the lists
        "start_positions" and "end_positions" added. An example is labelled
        (0, 0) when it has no answer, or when its answer is not fully
        contained in the (possibly truncated) context.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    # The offsets are only needed to compute the labels; they are not model inputs.
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        start_token, end_token = _find_answer_token_span(
            offsets, inputs.sequence_ids(i), answers[i]
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


def _find_answer_token_span(offsets, sequence_ids, answer):
    """Return the (start, end) token indices of the answer, or (0, 0) if unavailable."""
    # Examples without any answer cannot be located; label them (0, 0).
    if not answer["answer_start"] or not answer["text"]:
        return 0, 0

    start_char = answer["answer_start"][0]
    end_char = start_char + len(answer["text"][0])

    # Tokens whose sequence id is 1 belong to the context (the second sequence).
    context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]
    if not context_tokens:
        return 0, 0
    context_start, context_end = context_tokens[0], context_tokens[-1]

    # The answer must lie entirely inside the kept context. If truncation cut
    # off any part of it, the span cannot be labelled correctly, so use (0, 0).
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer's first character.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # First context token that ends at or after the answer's last character.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need:

In [None]:
# Tokenize both splits in batches. The raw text columns are dropped afterwards
# so only the fields returned by preprocess_function remain.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Now create a batch of examples using [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator). Unlike other data collators in 🤗 Transformers, the [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator) does not apply any additional preprocessing such as padding.

In [None]:
"""
DefaultDataCollator(return_tensors="pt"): This line of code is creating an instance of the DefaultDataCollator class. The DefaultDataCollator is a class that collates batches of data in a way that they can be directly fed into a model.
"""

from transformers import DefaultDataCollator

# Collate examples into batches of PyTorch tensors. No padding is applied here;
# preprocess_function already pads every example with padding="max_length".
data_collator = DefaultDataCollator(return_tensors="pt") 

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load DistilBERT with [AutoModelForQuestionAnswering](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForQuestionAnswering):

In [None]:
"""
AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased'): This line of code is using the from_pretrained() method of the AutoModelForQuestionAnswering class to load a pre-trained model. The argument 'distilbert-base-uncased' is the identifier of the pre-trained model that we want to load. In this case, it’s the ‘distilbert-base-uncased’ model.
"""

from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load DistilBERT with a question-answering head. The head weights
# (qa_outputs) are not part of the checkpoint and are newly initialized,
# which is what the warning printed by this cell reports.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
"""
Trainer(...): This line of code is creating an instance of the Trainer class. The Trainer class is a simple but feature-complete training and eval loop for PyTorch, optimized for Transformers.
Here are the arguments it’s passing to the Trainer:

model=model: This is the model that will be trained. In this case, it’s the ‘distilbert-base-uncased’ model that was loaded earlier.
args=training_args: These are the training arguments that control the training process. They were defined earlier in the TrainingArguments.
train_dataset=tokenized_squad["train"]: This is the training dataset. It’s a version of the SQuAD dataset that has been tokenized.
eval_dataset=tokenized_squad["test"]: This is the evaluation dataset. It’s also a version of the SQuAD dataset that has been tokenized.
tokenizer=tokenizer: This is the tokenizer that was used to tokenize the dataset. The Trainer saves it alongside the model checkpoints so the fine-tuned model can be reloaded together with its matching tokenizer.
data_collator=data_collator: This is the data collator that will be used to form batches of data.
trainer.train(): This line of code is starting the training process.
"""

# Hyperparameters for a short fine-tuning run; evaluation happens once per epoch.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="qa_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
)

# The Trainer ties together the model, both tokenized splits and the collator.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
)

# Run fine-tuning; as the cell's last expression, the TrainOutput is displayed.
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.758417
2,1.675900,1.625011
3,1.675900,1.642631


TrainOutput(global_step=750, training_loss=1.4554161376953125, metrics={'train_runtime': 519.4554, 'train_samples_per_second': 23.101, 'train_steps_per_second': 1.444, 'total_flos': 1175877900288000.0, 'train_loss': 1.4554161376953125, 'epoch': 3.0})

In [None]:
"""
model.save_pretrained("qa_model"): This line of code is used to save the trained model. The model contains the weights and biases that your machine learning model has learned during training. By saving the model, you can reuse it later without having to retrain it. This can save a lot of time, especially when training the model takes a long time.
tokenizer.save_pretrained("qa_model"): This line of code is used to save the tokenizer. The tokenizer is responsible for converting the input data into a format that the model can understand. It includes details like the vocabulary of your model, and the specific method used to convert words into numbers. By saving the tokenizer, you ensure that you can accurately preprocess any new input data in the same way as the training data.
"""

# Write the fine-tuned weights and the tokenizer files into the same directory
# so both can later be reloaded from it with from_pretrained.
save_dir = "qa_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('qa_model/tokenizer_config.json',
 'qa_model/special_tokens_map.json',
 'qa_model/vocab.txt',
 'qa_model/added_tokens.json',
 'qa_model/tokenizer.json')

## Evaluate

Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) still calculates the evaluation loss during training so you're not completely in the dark about your model's performance.

If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course!

## Inference

In [None]:
# Example input for inference: a question and the passage that contains its answer.
question = "What is the tallest mountain in the world?"
context = (
    "Mount Everest is Earth's highest mountain above sea level, located in the "
    "Mahalangur Himal sub-range of the Himalayas. The international border "
    "between China and Nepal runs across its summit point."
)


The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for question answering with your model, and pass your text to it:

In [None]:
"""
pipeline('question-answering', ...): This call initializes a pipeline for question answering. The pipeline function is a high-level, easy to use API for doing inference with a model. If no `model` argument is passed, the pipeline downloads a default question-answering checkpoint; pass model="qa_model" to load the model and tokenizer that were fine-tuned and saved earlier in this notebook.
question_answerer(question=question, context=context): This line of code uses the question answering pipeline to answer a question given a context. The question and context are strings that you would provide. The question is what you want to know, and the context is the document or text snippet in which the model looks for the answer.
"""

from transformers import pipeline

# Load the fine-tuned checkpoint saved in "qa_model" (model and tokenizer).
# Without `model=...` the pipeline falls back to a default Hub checkpoint, so
# the answer would not come from the model trained in this notebook.
question_answerer = pipeline("question-answering", model="qa_model")
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.9576919674873352, 'start': 0, 'end': 13, 'answer': 'Mount Everest'}

You can also manually replicate the results of the `pipeline` if you'd like:

Tokenize the text and return PyTorch tensors:

In [None]:
"""
AutoTokenizer.from_pretrained("qa_model"): This line of code loads a tokenizer that was previously saved using the save_pretrained method. The tokenizer is loaded from the directory specified, in this case “qa_model”. The tokenizer is responsible for converting the input data into a format that the model can understand. It includes details like the vocabulary of your model, and the specific method used to convert words into numbers.
tokenizer(question, context, return_tensors="pt"): This line of code uses the loaded tokenizer to prepare the inputs for the model. The question and context are strings that you would provide. The return_tensors="pt" argument specifies that the inputs should be returned as PyTorch tensors.
"""

from transformers import AutoTokenizer

# Reload the tokenizer saved next to the model, then encode the
# question/context pair as PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("qa_model")
inputs = tokenizer(text=question, text_pair=context, return_tensors="pt")

Pass your inputs to the model and return the `logits`:

In [None]:
"""
AutoModelForQuestionAnswering.from_pretrained("qa_model"): This line of code loads a model that was previously saved using the save_pretrained method. The model is loaded from the directory specified, in this case “qa_model”. The model contains the weights and biases that your machine learning model has learned during training.
with torch.no_grad(): This context manager disables gradient tracking for the computations inside it (their results have requires_grad=False). It reduces memory usage and speeds up computation, but you won’t be able to backpropagate through them (which you don’t need in an inference script).
outputs = model(**inputs): This line of code uses the loaded model to make predictions on the inputs. The **inputs syntax is used to pass the inputs dictionary to the model. The model returns its predictions, which are then stored in the outputs variable.
"""

import torch
from transformers import AutoModelForQuestionAnswering

# Reload the fine-tuned weights from the directory they were saved to.
model = AutoModelForQuestionAnswering.from_pretrained("qa_model")
# Inference only: no gradients are needed, so skip gradient tracking.
with torch.no_grad():
    # `inputs` is unpacked into keyword arguments for the model's forward pass.
    outputs = model(**inputs) 

Get the highest probability from the model output for the start and end positions:

In [None]:
# Pick the highest-scoring position for the answer start and for the answer end.
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

Decode the predicted tokens to get the answer:

In [None]:
# Slice out the predicted token ids (end index inclusive) and turn them back into text.
answer_span = slice(answer_start_index, answer_end_index + 1)
predict_answer_tokens = inputs.input_ids[0, answer_span]
tokenizer.decode(predict_answer_tokens)

'mount everest'