This notebook contains an example that teaches how to use Hugging Face (HF) datasets. This is an important skill for training models. We follow suggestions from ChatGPT for training a BERT model on a small dataset, which requires fewer resources.

My query:
`I want to use a small dataset to pretrain BERT, to do Q & A using huggingface datasets. Can you give me an example that will run small amout of CPU?`

Our goal: fine-tune a basic BERT model for question answering.

This script uses only a small portion of the SQuAD training data (e.g. 100 examples) for demonstration purposes. In real-world use, you would use the full dataset (or another dataset) and adjust the training parameters accordingly.

In [1]:
%pip install -q transformers python-dotenv torch datasets transformers[torch]
#%pip install --upgrade jupyter ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from dotenv import load_dotenv

# Read variables from a local .env file (if any) into the process environment.
# The call is left as the last expression so its return value is shown as the cell output.
load_dotenv()

True

In [3]:

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    default_data_collator,
)

# 1. Load SQuAD and keep only a tiny slice of it
dataset = load_dataset("squad")
# For demonstration, keep just the first 100 rows of the train and validation splits
small_train_dataset, small_eval_dataset = (
    dataset[split_name].select(range(100)) for split_name in ("train", "validation")
)



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 2. Load the tokenizer and model
# Both are created from the same checkpoint name so the tokenizer's vocabulary matches the model.
# NOTE: the question-answering head (qa_outputs.weight / qa_outputs.bias) is not part of this
# checkpoint and is newly initialised, as the warning printed under this cell reports;
# the model therefore has to be fine-tuned before its predictions are meaningful.
model_checkpoint = "bert-base-uncased"  # BERT base, uncased: the checkpoint name passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 3. Preprocessing the data
# Both values are read as globals by prepare_train_features and forwarded to the tokenizer.
max_length = 384  # passed as the tokenizer's `max_length`: size of each question + context feature, in tokens
doc_stride = 128  # passed as the tokenizer's `stride` when a long context is split into several features

def prepare_train_features(examples):
    """Tokenize a batch of SQuAD-style examples and label each feature's answer span.

    Every (question, context) pair is encoded with the module-level ``tokenizer``
    using ``max_length`` and ``doc_stride``. Because overflowing tokens are kept,
    one example may produce several features.

    Args:
        examples: A batch (dict of lists) with the keys ``"question"``,
            ``"context"`` and ``"answers"``; each answers entry is a dict with
            ``"answer_start"`` (character offsets into the context) and ``"text"``.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` and
        ``"offset_mapping"`` removed and ``"start_positions"`` /
        ``"end_positions"`` added (one token index per feature). A feature whose
        window does not fully contain the first answer, or whose example has no
        answer, is labelled ``(0, 0)``.
    """
    # Tokenize with truncation and padding, but keep the overflows using a stride.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Feature index -> index of the example it was cut from.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # Per-token (start_char, end_char) offsets. Question tokens are offset relative
    # to the question string and context tokens relative to the context string.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answers = examples["answers"][sample_mapping[i]]
        # Default label: index 0 (where BERT-style tokenizers put [CLS]), meaning "no answer here".
        start_position, end_position = 0, 0

        if len(answers["answer_start"]) > 0:
            # Start/end character index of the first answer inside the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # sequence_ids marks context tokens with 1 (question tokens are 0, special
            # and padding tokens are None). Offsets alone cannot tell these apart,
            # because question, special and padding tokens may all have offset 0.
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_indices = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]

            # Only label the span when this window fully contains the answer.
            if (
                context_indices
                and offsets[context_indices[0]][0] <= start_char
                and offsets[context_indices[-1]][1] >= end_char
            ):
                context_start, context_end = context_indices[0], context_indices[-1]

                # Last context token that starts at or before the answer start.
                # The search is bounded by context_end so it cannot run into [SEP]/padding.
                token_start_index = context_start
                while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_position = token_start_index - 1

                # First context token (scanning backwards) that ends at or after the answer end.
                token_end_index = context_end
                while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_position = token_end_index + 1

        start_positions.append(start_position)
        end_positions.append(end_position)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

# Run the preprocessing over both subsets. The original columns are dropped because
# one example can expand into several features, so only the tokenizer outputs are kept.
train_dataset, eval_dataset = (
    subset.map(
        prepare_train_features,
        batched=True,
        remove_columns=subset.column_names,
    )
    for subset in (small_train_dataset, small_eval_dataset)
)



In [6]:
# 4. Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written under this directory
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    # It is used here because the Trainer below is given `processing_class=`,
    # which only newer transformers releases accept.
    eval_strategy="steps",
    eval_steps=50,
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # small batch size for CPU
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=10,
    save_steps=50,
    disable_tqdm=False,
)





In [7]:
%pip install evaluate

Note: you may need to restart the kernel to use updated packages.


In [8]:

# Import the metric loader from `evaluate` (aliased as load_metric) and define the metric
from evaluate import load as load_metric

# 5. Define a metric (using squad metric for example)
# NOTE(review): squad_metric is loaded here, but compute_metrics below returns an empty
# dict and never calls it; it only becomes useful once predictions are post-processed to text.
squad_metric = load_metric("squad")

def compute_metrics(p):
    """Placeholder metrics hook: accepts a (predictions, labels) pair and reports nothing.

    Args:
        p: A two-item iterable of predictions and labels.

    Returns:
        An empty dict. Real metrics would first require post-processing the
        predictions from token indices back to answer text, which this demo skips.
    """
    # The unpacking is kept so an input that is not a two-item pair still fails loudly.
    _predictions, _labels = p
    return dict()



In [None]:
# 6. Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Preprocessing already pads every feature to a fixed length (padding="max_length"),
    # so the default collator is used rather than one that pads per batch.
    data_collator=default_data_collator,
    # The tokenizer is handed to the Trainer via `processing_class`.
    processing_class=tokenizer,
    # compute_metrics currently returns an empty dict, so it adds no metrics of its own.
    compute_metrics=compute_metrics,
)



: 

In [None]:
# 7. Start training
# Fine-tunes `model` on train_dataset using the settings in training_args;
# the value returned by train() is shown as the cell output.
trainer.train()

In [None]:

# 8. Evaluate the model (if desired)
# evaluate() is called without arguments, so it presumably uses the eval_dataset given to the Trainer.
# The returned metrics are stored in eval_results and printed.
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)
