In [1]:
! pip install transformers datasets evaluate accelerate

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Question answering

## Load SQuAD dataset

In [13]:
from datasets import load_dataset

squad = load_dataset("squad", split="train[:5000]")
# squad = load_dataset("squad")

In [9]:
squad

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [10]:
# squad = load_dataset("squad")
# squad['test'] = squad.pop('validation')

In [11]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

Split the dataset's `train` split into a train and test set with the [train_test_split](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.train_test_split) method:

In [14]:
squad = squad.train_test_split(test_size=0.2)

In [15]:
squad["train"][0]

{'id': '56cda10262d2951400fa6794',
 'title': 'The_Legend_of_Zelda:_Twilight_Princess',
 'context': "The game's score was composed by Toru Minegishi and Asuka Ohta, with series regular Koji Kondo serving as the sound supervisor. Minegishi took charge of composition and sound design in Twilight Princess, providing all field and dungeon music under the supervision of Kondo. For the trailers, three pieces were written by different composers, two of which were created by Mahito Yokota and Kondo. Michiru Ōshima created orchestral arrangements for the three compositions, later to be performed by an ensemble conducted by Yasuzo Takemoto. Kondo's piece was later chosen as music for the E3 2005 trailer and for the demo movie after the game's title screen.",
 'question': 'Who worked adapting the score for performance by an orchestra?',
 'answers': {'text': ['Michiru Ōshima'], 'answer_start': [396]}}

There are several important fields here:

- `answers`: the starting location of the answer token and the answer text.
- `context`: background information from which the model needs to extract the answer.
- `question`: the question a model should answer.

## Preprocess

In [16]:
from transformers import AutoTokenizer

# Load the tokenizer for the DistilBERT model pre-trained on uncased English text
# This tokenizer will convert text to tokens that are compatible with the DistilBERT model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased") #TODO



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

There are a few preprocessing steps particular to question answering tasks you should be aware of:

1. Some examples in a dataset may have a very long `context` that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the `context` by setting `truncation="only_second"`.
2. Next, map the start and end positions of the answer to the original `context` by setting
   `return_offset_mapping=True`.
3. With the mapping in hand, now you can find the start and end tokens of the answer. Use the [sequence_ids](https://huggingface.co/docs/tokenizers/main/en/api/encoding#tokenizers.Encoding.sequence_ids) method to
   find which part of the offset corresponds to the `question` and which corresponds to the `context`.

Here is how you can create a function to truncate and map the start and end tokens of the `answer` to the `context`:

In [17]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        # Find start character position of the answer in the context
        start_char = answer["answer_start"][0]  # TODO: This retrieves the starting character index of the first answer
        # Find end character position by adding the length of the answer text to the start character position
        end_char = start_char + len(answer["text"][0])  # TODO: This calculates the ending character index of the first answer
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) function. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once. Remove any columns you don't need:

In [18]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Now create a batch of examples using [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator). Unlike other data collators in 🤗 Transformers, the [DefaultDataCollator](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator) does not apply any additional preprocessing such as padding.

In [19]:
from transformers import DefaultDataCollator

# Create an instance of DefaultDataCollator
# This collator handles batching of inputs and outputs to ensure that they are of uniform size.
# It applies necessary padding to the inputs, which is especially useful when dealing with variable-length sequences.
data_collator = DefaultDataCollator() # TODO: make an instance

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! Load DistilBERT with [AutoModelForQuestionAnswering](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForQuestionAnswering):

In [20]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the DistilBERT model pre-trained for the question answering task
# This specific line loads the 'distilbert-base-uncased' model, which is a smaller version of BERT.
# The model is pre-trained on a large corpus of text and fine-tuned for the task of question answering.
# The 'from_pretrained' method downloads the model weights and configuration.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased") # TODO: load distilbert-base-uncased model



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
training_args = TrainingArguments(
    output_dir="qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Create an instance of the Trainer class
# This Trainer class is responsible for orchestrating the training and evaluation of the model.
# It requires several arguments to be passed, including the model, training arguments, datasets, tokenizer, and data collator.
trainer = Trainer(
    model=model,  # The model to be trained (in this case, the DistilBERT model for question answering)
    args=training_args,  # The training arguments specifying the hyperparameters and configuration for training
    train_dataset=tokenized_squad["train"],  # The tokenized training dataset
    eval_dataset=tokenized_squad["test"],  # The tokenized evaluation dataset
    tokenizer=tokenizer,  # The tokenizer used to process the input text
    data_collator=data_collator,  # The data collator to ensure uniform batch sizes by padding sequences
) # TODO: pass the required arguments

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.210247
2,2.673700,1.653095
3,2.673700,1.60051


TrainOutput(global_step=750, training_loss=2.2326613362630208, metrics={'train_runtime': 451.8767, 'train_samples_per_second': 26.556, 'train_steps_per_second': 1.66, 'total_flos': 1175877900288000.0, 'train_loss': 2.2326613362630208, 'epoch': 3.0})

In [22]:
# TODO: save both model and tokenizer
# This ensures that the trained model can be easily loaded and used for inference or further training later.
trainer.save_model("qa_model")
# Similarly, the tokenizer is saved in the same directory, ensuring that the correct tokenizer is used with the model.
tokenizer.save_pretrained("qa_model")

('qa_model/tokenizer_config.json',
 'qa_model/special_tokens_map.json',
 'qa_model/vocab.txt',
 'qa_model/added_tokens.json',
 'qa_model/tokenizer.json')

## Evaluate

Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) still calculates the evaluation loss during training so you're not completely in the dark about your model's performance.

If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) chapter from the 🤗 Hugging Face Course!

## Inference

In [25]:
question = "Who was Mahsa Amini?"
context = "Mahsa Amini was a young Iranian woman who died in police custody in September 2022. Her death sparked widespread protests across Iran and drew international attention to the issues of women's rights and state violence in the country. Amini was arrested by the morality police for allegedly violating Iran's strict dress code for women. Her death became a symbol of the struggle for greater freedoms and rights for Iranian women, highlighting the oppressive measures enforced by the government."

The simplest way to try out your finetuned model for inference is to use it in a [pipeline()](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline). Instantiate a `pipeline` for question answering with your model, and pass your text to it:

In [27]:
from transformers import pipeline

# Create a question-answering pipeline using the trained model and tokenizer
# The pipeline simplifies the process of performing question-answering tasks by wrapping the model and tokenizer
# The "question-answering" pipeline takes a context and a question as input and returns the answer found in the context
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer) # TODO: call QA pipeline
question_answerer(question=question, context=context)

{'score': 0.23650994896888733,
 'start': 18,
 'end': 37,
 'answer': 'young Iranian woman'}

You can also manually replicate the results of the `pipeline` if you'd like:

Tokenize the text and return PyTorch tensors:

In [28]:
from transformers import AutoTokenizer

# Load the tokenizer from the saved directory "qa_model"
# The 'from_pretrained' method loads the tokenizer configuration and vocabulary from the specified directory.
tokenizer = AutoTokenizer.from_pretrained("qa_model")  # TODO: load your tokenizer
inputs = tokenizer(question, context, return_tensors="pt")

Pass your inputs to the model and return the `logits`:

In [29]:
import torch
from transformers import AutoModelForQuestionAnswering

# Load the trained question-answering model from the saved directory "qa_model"
# The 'from_pretrained' method loads the model's architecture and weights from the specified directory.
model = AutoModelForQuestionAnswering.from_pretrained("qa_model") # TODO: load your model
with torch.no_grad():
    # Pass the preprocessed inputs to the model to obtain outputs
    # 'inputs' should be a dictionary containing the tokenized input data such as input IDs, attention masks, etc.
    # The model returns outputs which typically include start and end logits for the question-answering task.
    outputs = model(**inputs) # TODO: pass your inputs to the model

Get the highest probability from the model output for the start and end positions:

In [30]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

Decode the predicted tokens to get the answer:

In [31]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'young iranian woman'