<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Transformers-Hub/blob/main/Question-Answering-Distilber-Model/QA_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers datasets evaluate

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset

In [None]:
# Seed PyTorch before the model is built: the base checkpoint ships without a
# question-answering head, so `qa_outputs` is freshly initialised and would
# otherwise start from different random weights on every run.
SEED = 42
torch.manual_seed(SEED)

# Load the SQuAD dataset
data = load_dataset("squad")

# Load the pretrained tokenizer and model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define tokenization function (and the helper that maps an answer onto tokens)
def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Locate the tokens that cover an answer inside one tokenized window.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for the window.
        sequence_ids: Per-token sequence id; context tokens are marked ``1``.
        start_char: Character index in the context where the answer begins.
        end_char: Character index in the context just past the answer.

    Returns:
        ``(start_token, end_token)``, both inclusive, or ``None`` when the
        answer is not fully contained in this window's context tokens.
    """
    # First and last token of the window that belong to the context.
    context_start = 0
    while sequence_ids[context_start] != 1:
        context_start += 1
    context_end = len(sequence_ids) - 1
    while sequence_ids[context_end] != 1:
        context_end -= 1

    # The answer was cut off by truncation / lives in another overflow window.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return None

    # Both scans stay inside the context. Trailing special and padding tokens
    # carry offset (0, 0), so an unbounded forward scan would run past an
    # answer that begins at the last context token and label padding instead.
    start_token = context_start
    while start_token <= context_end and offsets[start_token][0] <= start_char:
        start_token += 1

    end_token = context_end
    while end_token >= context_start and offsets[end_token][1] >= end_char:
        end_token -= 1

    return start_token - 1, end_token + 1


def preprocess_function(examples, max_length=384, stride=128, tok=None):
    """Tokenize a batch of question/context pairs and attach answer positions.

    Long contexts are split into overlapping windows, so one example may yield
    several features. For each feature the answer is labelled with the indices
    of its first and last token; features without an answer, or whose window
    does not fully contain it, are labelled with the index of the CLS token.

    Args:
        examples: Batch with ``"question"``, ``"context"`` and ``"answers"``
            columns, where each answer holds ``"text"`` and ``"answer_start"``
            lists (only the first entry is used).
        max_length: Length every feature is truncated/padded to.
        stride: Token overlap between consecutive windows of one context.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        The tokenizer output with ``"start_positions"`` and ``"end_positions"``
        added and the offset / overflow bookkeeping entries removed.
    """
    tok = tokenizer if tok is None else tok

    # Leading whitespace in a question only wastes the token budget; stripping
    # it does not affect the context's character offsets.
    questions = [question.lstrip() for question in examples["question"]]

    tokenized_examples = tok(
        questions,
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> originating example, and per-token character offsets.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tok.cls_token_id)
        answers = examples["answers"][sample_mapping[i]]

        span = None
        if len(answers["answer_start"]) > 0:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            span = _answer_token_span(
                offsets, tokenized_examples.sequence_ids(i), start_char, end_char
            )

        # No answer (or not inside this window): point both labels at CLS.
        if span is None:
            span = (cls_index, cls_index)

        start_positions.append(span[0])
        end_positions.append(span[1])

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# Run the tokenization routine over both splits in batches, dropping the raw
# text columns so that only what `preprocess_function` returns is kept.
train_dataset, validation_dataset = [
    data[split].map(
        preprocess_function,
        batched=True,
        remove_columns=data[split].column_names,
    )
    for split in ("train", "validation")
]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qa_model",
    # `evaluation_strategy` is the deprecated spelling; current transformers
    # releases expect `eval_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=500,
    push_to_hub=False,
    report_to="none",
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # `tokenizer=` is deprecated on Trainer (it raised a FutureWarning in the
    # recorded run); `processing_class=` is its replacement.
    processing_class=tokenizer,
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,1.1913,1.134409
2,0.9274,1.093885
3,0.7454,1.146137


TrainOutput(global_step=16599, training_loss=1.0749494749218937, metrics={'train_runtime': 9787.4547, 'train_samples_per_second': 27.134, 'train_steps_per_second': 1.696, 'total_flos': 2.602335381127373e+16, 'train_loss': 1.0749494749218937, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model into the same directory the Trainer was configured with.
trainer.save_model(output_dir="./qa_model")

# Score the model on the evaluation split and show the metrics it reports.
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

Evaluation Results: {'eval_loss': 1.146136999130249, 'eval_runtime': 123.3738, 'eval_samples_per_second': 87.409, 'eval_steps_per_second': 5.463, 'epoch': 3.0}


# Save Model

In [None]:
# Export the model and its tokenizer into one folder so it can be reloaded on its own.
export_dir = "qa-trained-model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('qa-trained-model/tokenizer_config.json',
 'qa-trained-model/special_tokens_map.json',
 'qa-trained-model/vocab.txt',
 'qa-trained-model/added_tokens.json',
 'qa-trained-model/tokenizer.json')

# Load & Test Model

In [None]:
# Reload the exported tokenizer and model from disk
saved_dir = "qa-trained-model"
tokenizer = AutoTokenizer.from_pretrained(saved_dir)
model = AutoModelForQuestionAnswering.from_pretrained(saved_dir)

In [None]:
context = "Hugging Face is a technology company based in New York and Paris."
question = "Where is Hugging Face based?"

# Tokenize the question/context pair.
# Calling the tokenizer directly replaces the deprecated `encode_plus`. The
# truncation mode and length mirror the training preprocessing, so only the
# context (never the question) can be cut.
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",
    max_length=384,
)


In [None]:
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Get the start and end token positions (the batch holds a single example,
# so row 0 is the only one).
start_idx = int(torch.argmax(start_logits[0]))
# Look for the end at or after the start so the span can never be inverted;
# when the overall best end already lies after the start this picks the same token.
end_idx = start_idx + int(torch.argmax(end_logits[0, start_idx:]))

In [None]:
# Turn the predicted token span (end inclusive) back into readable text
answer_ids = inputs['input_ids'][0][start_idx:end_idx + 1]
answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
answer = tokenizer.convert_tokens_to_string(answer_tokens)

print("Question:", question)
print("Answer:", answer)


Question: Where is Hugging Face based?
Answer: new york and paris
