In [1]:
!pip install datasets transformers peft torch numpy

Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━

# **Importing Libraries**

In [2]:
import torch
from datasets import load_dataset
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, TrainingArguments, Trainer, DefaultDataCollator, pipeline
import numpy as np
import peft

# **Loading the model**

Why this model

1.   This model is specifically fine-tuned for question answering tasks
2.   BERT (Bidirectional Encoder Representations from Transformers) is a popular and well-performing model for various NLP tasks, including QA. "deepset/bert-base-cased-squad2" utilizes this architecture.
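3.   Fine-tuning on the SQuAD2 dataset (Stanford Question Answering Dataset 2.0) equips the model to handle varied question formats and to extract answer spans from a context passage. SQuAD 2.0 additionally contains unanswerable questions, so the model also learns to recognise when the context does not contain an answer.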
4.   "cased" indicates that the model considers the case of letters (uppercase vs lowercase) during processing, potentially better capturing the intended meaning of words.



In [3]:
# Load pre-trained model and tokenizer
# Both are loaded from the same checkpoint name so the tokenizer vocabulary matches the model.
model_name = "deepset/bert-base-cased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# AutoModelForQuestionAnswering resolves to BertForQuestionAnswering for this checkpoint
# (see the load message below); the unused pooler weights reported there are expected.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/152 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# **Loading the dataset**

In [4]:
# Load the dataset
# "squad" is downloaded from the Hugging Face Hub; the progress output below shows a
# train split of 87,599 examples and a validation split of 10,570 examples.
squad = load_dataset("squad")

def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer token labels.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with "question", "context" and "answers" columns.
            Each entry of "answers" is a dict with parallel "text" and
            "answer_start" lists; only the first answer is used.

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended with
        "start_positions" and "end_positions", one token index per example.
        Examples with no answer, or whose answer is not fully inside the
        (possibly truncated) context, are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",  # only ever truncate the context, never the question
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable examples (empty answer lists, as in SQuAD v2) get the (0, 0) label
        # instead of raising IndexError.
        if len(answer["answer_start"]) == 0 or len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        n_tokens = len(sequence_ids)

        # Find the first and last token of the context (sequence id 1).
        idx = 0
        while idx < n_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < n_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0).
        # The context must begin at or before the answer start AND end at or after the
        # answer end; otherwise truncation cut the answer and the end label would land
        # on the token after the context.
        if (
            context_end < context_start  # no context tokens at all
            or offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the function to our data
# batched=True hands preprocess_function a batch (column name -> list of values) per call.
# No columns are removed, so later cells can still read 'context', 'question' and 'answers'
# from tokenized_squad.
tokenized_squad = squad.map(preprocess_function, batched=True)

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

# **WHY PEFT ?**

Imagine you have a large, complex model to understand language. This model is like a **big box with lots of internal connections**.

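**PEFT (Parameter-Efficient Fine-Tuning) is a family of techniques that makes *fine-tuning* this model cheaper: the original weights are frozen and only a small number of extra parameters are trained. The model itself does not get smaller. The technique used here is LoRA (Low-Rank Adaptation).**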

LoRAConfig is like a set of instructions for applying LoRA. Here's what each setting does:

1.   r=8: This is the "rank" of the LoRA update. For each adapted weight matrix, LoRA trains two small matrices whose product has rank r; think of it as the width of a small "side box" attached next to the big box. A lower value (like 8) means fewer trainable parameters, which is faster and lighter but gives the adapter less capacity and might lose some accuracy.
2.   lora_alpha=16: This is a scaling factor for the LoRA update, which is multiplied by lora_alpha / r (here 16 / 8 = 2). A higher value makes the adapter's contribution stronger; it does not change the number of parameters or the memory usage.
3.   lora_dropout=0.1: This is a common technique used to prevent the model from overfitting. Dropout is applied to the input of the LoRA layers during training; it's like randomly turning off some connections, making the model more robust and generalizable.
4.   bias="none": This controls which "bias" terms (offsets that help the model make predictions) are trained. Setting it to "none" means no bias terms are trained by LoRA; only the LoRA matrices (plus the small question-answering output layer) are updated.


In summary, peft.LoraConfig describes how to fine-tune a large language model efficiently. The model does not shrink: its original weights are frozen, and LoRA adds small low-rank matrices that are the only parts trained (about 0.27% of all parameters in this notebook), with the configuration options above balancing efficiency and accuracy.

In [5]:
# Set up PEFT: configuration of the LoRA adapter.
peft_config = peft.LoraConfig(
    r=8,               # rank of the low-rank update matrices
    lora_alpha=16,     # the LoRA update is scaled by lora_alpha / r
    lora_dropout=0.1,  # dropout applied on the LoRA branch during training
    bias="none",       # do not train any bias terms
    task_type=peft.TaskType.QUESTION_ANS,
)

# `peft.prepare_model_for_kbit_training` is deliberately not called: it prepares models loaded
# in 8-bit/4-bit precision, and this checkpoint is loaded in full precision. Freezing of the
# base weights is already done by `get_peft_model`.

# Wrap the base model with LoRA adapters. LoRA keeps the original weights frozen and adds small
# trainable low-rank matrices next to the targeted layers; it does not decompose or shrink the
# original weights.
# The isinstance guard keeps the cell idempotent: re-running it must not wrap an already
# wrapped model a second time.
if not isinstance(model, peft.PeftModel):
    model = peft.get_peft_model(model, peft_config)

# Report how many parameters are trainable compared to the total parameter count.
model.print_trainable_parameters()

trainable params: 296,450 || all params: 108,017,668 || trainable%: 0.27444584343368716


In [6]:
# Batches are built from the already-tokenized features; preprocess_function pads every
# example with padding="max_length", so no padding is requested from the collator here.
data_collator = DefaultDataCollator()

In [7]:
# Hyperparameters for the Trainer run below.
training_args = TrainingArguments(
     output_dir="results2",  # directory passed to the Trainer for its outputs
     evaluation_strategy="epoch",  # evaluate once per epoch (the results table has one row per epoch)
     learning_rate=2e-5,
     per_device_train_batch_size=2,
     per_device_eval_batch_size=2,
     num_train_epochs=2,
     weight_decay=0.01,
)

In [8]:
# Train and evaluate on small subsets to keep the run short: the first 1000 training
# examples and the first 100 validation examples of the tokenized dataset.
trainer = Trainer(
     model=model,
     args=training_args,
     train_dataset=tokenized_squad["train"].select(range(1000)),
     eval_dataset=tokenized_squad["validation"].select(range(100)),
     tokenizer=tokenizer,
     data_collator=data_collator,
)

In [9]:
# Run fine-tuning. With 1000 examples, batch size 2 and 2 epochs the recorded run took
# 1000 steps (global_step in the TrainOutput below).
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.2531,1.253845
2,1.4671,1.216178


TrainOutput(global_step=1000, training_loss=1.8601077270507813, metrics={'train_runtime': 162.9897, 'train_samples_per_second': 12.271, 'train_steps_per_second': 6.135, 'total_flos': 524414902272000.0, 'train_loss': 1.8601077270507813, 'epoch': 2.0})

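**These results showcase the potential of using PEFT with a modest dataset. Despite training on only 1000 records, the model has demonstrated learning capabilities: over two epochs the training loss fell from 2.25 to 1.47 and the validation loss from 1.25 to 1.22. Note that the base checkpoint was already fine-tuned on SQuAD 2.0, so these losses mainly show that the LoRA adapter trains stably; they are not a measure of answer quality. This highlights the efficiency of PEFT in leveraging a small amount of training data to enhance fine-tuning outcomes. With further refinement and additional data, even greater performance improvements can be anticipated.**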

In [10]:
# To save the model
#trainer.save_model("bert-base-QUESTION_ANS/model")

# **Inferencing**

In [14]:
# Build a question-answering pipeline around the fine-tuned PEFT model.
# NOTE: transformers logs "The model 'PeftModelForQuestionAnswering' is not supported for
# question-answering" here because the PEFT wrapper class is not in its list of supported
# model classes; the pipeline calls in the cells below still return answers.
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

The model 'PeftModelForQuestionAnswering' is not supported for question-answering. Supported models are ['AlbertForQuestionAnswering', 'BartForQuestionAnswering', 'BertForQuestionAnswering', 'BigBirdForQuestionAnswering', 'BigBirdPegasusForQuestionAnswering', 'BloomForQuestionAnswering', 'CamembertForQuestionAnswering', 'CanineForQuestionAnswering', 'ConvBertForQuestionAnswering', 'Data2VecTextForQuestionAnswering', 'DebertaForQuestionAnswering', 'DebertaV2ForQuestionAnswering', 'DistilBertForQuestionAnswering', 'ElectraForQuestionAnswering', 'ErnieForQuestionAnswering', 'ErnieMForQuestionAnswering', 'FalconForQuestionAnswering', 'FlaubertForQuestionAnsweringSimple', 'FNetForQuestionAnswering', 'FunnelForQuestionAnswering', 'GPT2ForQuestionAnswering', 'GPTNeoForQuestionAnswering', 'GPTNeoXForQuestionAnswering', 'GPTJForQuestionAnswering', 'IBertForQuestionAnswering', 'LayoutLMv2ForQuestionAnswering', 'LayoutLMv3ForQuestionAnswering', 'LEDForQuestionAnswering', 'LiltForQuestionAnswering

**Inferencing with Data from a Dataset**

In [15]:
# Pick one training example and show the fields needed to judge the prediction.
inst = tokenized_squad['train'][2000]
for field in ("context", "question", "answers"):
    print(f"{field} :", inst[field])

context : With his health further deteriorating, Chopin desired to have a family member with him. In June 1849 his sister Ludwika came to Paris with her husband and daughter, and in September, supported by a loan from Jane Stirling, he took an apartment at Place Vendôme 12. After 15 October, when his condition took a marked turn for the worse, only a handful of his closest friends remained with him, although Viardot remarked sardonically that "all the grand Parisian ladies considered it de rigueur to faint in his room."
question : Who accompanied Chopin's sister to Paris?
answers : {'text': ['her husband and daughter'], 'answer_start': [139]}


In [16]:
# Compare the predicted span with the reference answer printed by the previous cell.
question_answerer(question=inst['question'], context=inst['context'])

{'score': 0.3496159315109253,
 'start': 139,
 'end': 163,
 'answer': 'her husband and daughter'}

In [17]:
# A second training example, displayed the same way as the first one.
inst2 = tokenized_squad['train'][2020]
for field in ("context", "question", "answers"):
    print(f"{field} :", inst2[field])

context : The funeral, held at the Church of the Madeleine in Paris, was delayed almost two weeks, until 30 October. Entrance was restricted to ticket holders as many people were expected to attend. Over 3,000 people arrived without invitations, from as far as London, Berlin and Vienna, and were excluded.
question : How many people arrived without an invitation?
answers : {'text': ['Over 3,000'], 'answer_start': [189]}


In [18]:
# Compare the predicted span with the reference answer printed by the previous cell.
question_answerer(question=inst2['question'], context=inst2['context'])

{'score': 0.8891415596008301, 'start': 189, 'end': 199, 'answer': 'Over 3,000'}

**Inferencing with User-Provided Data**

In [19]:
# Hand-written question/context pair that is not taken from the dataset.
question = "How many programming languages does BLOOM support?"
# NOTE(review): "46 languages natural languages" looks like a typo for "46 natural languages".
# The string is left unchanged because the recorded answer offsets (start 93, end 95) depend on it.
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [20]:
# Run the pipeline on the hand-written pair; 'start'/'end' in the result are character
# offsets into `context`.
question_answerer(question=question, context=context)

{'score': 0.6968227028846741, 'start': 93, 'end': 95, 'answer': '13'}