In [1]:
!pip install datasets trl evaluate
!pip install -U bitsandbytes
!pip install -U transformers

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting trl
  Downloading trl-0.12.0-py3-none-any.whl.metadata (10 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting transformers>=4.46.0 (from trl)
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting t

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
import json
from transformers import pipeline
import torch
from transformers import DistilBertForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
import os
import wandb
from trl import SFTTrainer
import evaluate
import numpy as np
from tqdm.auto import tqdm
import collections
from torch.utils.data import DataLoader

Loading and preprocessing the data

In [3]:
# Loading the dataset
# The MRQA dataset is included in huggingface's datasets library, so we just have to load it
# Loading dataset (smaller fraction than in the final becasue had to train on local GPU)
mrqa = load_dataset("mrqa", split="train[:5%]")
# Creating the train-test-validation split
mrqa = mrqa.train_test_split(test_size=0.2)
mrqa["train"] = mrqa["train"].train_test_split(test_size=0.2)
mrqa["val"] = mrqa["train"]["test"]
mrqa["train"] = mrqa["train"]["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

train-00000-of-00009.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00001-of-00009.parquet:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

train-00002-of-00009.parquet:   0%|          | 0.00/169M [00:00<?, ?B/s]

train-00003-of-00009.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

train-00004-of-00009.parquet:   0%|          | 0.00/313M [00:00<?, ?B/s]

train-00005-of-00009.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

train-00006-of-00009.parquet:   0%|          | 0.00/96.2M [00:00<?, ?B/s]

train-00007-of-00009.parquet:   0%|          | 0.00/80.2M [00:00<?, ?B/s]

train-00008-of-00009.parquet:   0%|          | 0.00/77.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/516819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/58221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9633 [00:00<?, ? examples/s]

In [4]:
# Loading the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-distilled-squad")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [5]:
max_length = 384
stride = 128

# Preprocessing function for training
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["detected_answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["char_spans"][0]["start"][0]
        end_char = answer["char_spans"][0]["end"][0]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Preprocessing function for the validation
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["qid"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [6]:
# Tokenizing datasets
#TODO: Exclude eval set from the preprocess for training examples
tokenized_mrqa = mrqa.map(preprocess_training_examples, batched=True, remove_columns=mrqa["train"].column_names)
tokenized_mrqa.set_format(type="torch")

# Tokenizing evaluation dataset
tokenized_eval = mrqa["test"].map(preprocess_validation_examples, batched=True, remove_columns=mrqa["test"].column_names)
tokenized_eval.set_format(type="torch")

Map:   0%|          | 0/16537 [00:00<?, ? examples/s]

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/4135 [00:00<?, ? examples/s]

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Loading the model and testing it

In [7]:
# Defining data collator
data_collator = DefaultDataCollator()

# Configuring parameters for the quantation
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, "float16"),
    bnb_4bit_use_double_quant=False,
)

# Configuring parameters of the low-rank adaptation
peft_config = LoraConfig(
    lora_alpha=6,
    lora_dropout=0.15,
    r=2,
    bias="none",
    task_type="QUESTION_ANS",
    target_modules=["q_lin", "k_lin", "v_lin", "ffn.lin1", "ffn.lin2", "attention.out_proj"])

In [8]:
# Loading baseline model: DistilBert finetuned on Squadn dataset
model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased-distilled-squad",
                                                      quantization_config=bnb_config,
                                                      device_map={"": 0})

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [9]:
# Testing the model
question, text = "Who was the last pharaoh of ancient Egypt?", "The last pharaoh of ancient Egypt was Cleopatra."

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'cleopatra'

In [10]:
# Defining some parameters for computing the evaluation metrics
metric = evaluate.load("squad")
n_best = 20
max_answer_length = 30

# Evaluation funnction with default squad metrics (exact match and f1 score)
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["qid"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [
    {"id": ex["qid"], "answers": [{"text": ans, "answer_start": 0} for ans in ex["answers"]]}
    for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)


# Function for doing the complete evaluation on the preprocessed evaluation set
def eval_function(tokenized_eval, batch_size=64):
    # Remove unneccesary columns from evaluation set and convert it to torch tensors
    eval_set_for_model = tokenized_eval.remove_columns(["example_id", "offset_mapping"])
    eval_set_for_model.set_format("torch")

    # Evaluate on GPU if available
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Create a DataLoader for the evaluation set
    eval_loader = DataLoader(eval_set_for_model, batch_size=batch_size)

    # Log start and end logits
    all_start_logits = []
    all_end_logits = []

    model.eval()  # Set model to evaluation mode

    with torch.no_grad():
        for batch in tqdm(eval_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            all_start_logits.append(outputs.start_logits.cpu().numpy())
            all_end_logits.append(outputs.end_logits.cpu().numpy())

    # Concatennate start and end logit labels
    start_logits = np.concatenate(all_start_logits, axis=0)
    end_logits = np.concatenate(all_end_logits, axis=0)

    return compute_metrics(start_logits, end_logits, tokenized_eval, mrqa["test"])

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

In [11]:
# Calculating evaluation metrics before further finetuning
pre_training_metrics = eval_function(tokenized_eval)
print(f"Exact match before finetuning: {pre_training_metrics['exact_match']}\nF1 score before finetuning: {pre_training_metrics['f1']}")

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/5169 [00:00<?, ?it/s]

Exact match before finetuning: 82.93673824724318
F1 score before finetuning: 91.99384711182272


Defining the training paramaters and training the model

In [12]:
# Defining training parameters
output_dir_name = "trial_run_local"

# Configuring parameters of the low-rank adaptation
peft_config = LoraConfig(
    lora_alpha=6,
    lora_dropout=0.15,
    r=2,
    bias="none",
    task_type="QUESTION_ANS",
    target_modules=["q_lin", "k_lin", "v_lin", "ffn.lin1", "ffn.lin2", "attention.out_proj"])

# Parameters will be later adjusted, this is only to ensure that training pipeline works as intended
training_args = TrainingArguments(
    output_dir=output_dir_name,
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='new_dir',
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_mrqa["train"],
    eval_dataset=tokenized_mrqa["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    peft_config=peft_config
)



In [13]:
# Training the model
wandb_project = input("Wandb project: ")
wandb_entity = input("Wandb entity: ")
wandb.init(project=wandb_project, entity=wandb_entity)
trainer.train()
wandb.finish()

Wandb project: nagyhazi_colab
Wandb entity: bme_deepl_learning


[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
500,0.8107,0.670872
1000,0.7242,0.632653
1500,0.7033,0.611914
2000,0.6909,0.599911
2500,0.6628,0.59503
3000,0.6787,0.592286


0,1
eval/loss,█▅▃▂▁▁
eval/runtime,▆▁▄█▆▃
eval/samples_per_second,▃█▅▁▃▆
eval/steps_per_second,▃█▅▁▃▆
train/epoch,▁▁▂▂▄▄▅▅▆▆███
train/global_step,▁▁▂▂▄▄▅▅▆▆███
train/grad_norm,▁▂▃▇█▃
train/learning_rate,█▇▅▄▂▁
train/loss,█▄▃▂▁▂

0,1
eval/loss,0.59229
eval/runtime,26.7046
eval/samples_per_second,156.078
eval/steps_per_second,9.774
total_flos,4914983697177600.0
train/epoch,3.0
train/global_step,3126.0
train/grad_norm,3.37443
train/learning_rate,0.0
train/loss,0.6787


In [14]:
# # Uncomment and run if training was stopped earlier
# wandb.finish()

In [15]:
# Calculating evaluation metrics after further finetuning
post_training_metrics = eval_function(tokenized_eval)
print(f"Exact match after finetuning: {post_training_metrics['exact_match']}\nF1 score after finetuning: {post_training_metrics['f1']}")

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/5169 [00:00<?, ?it/s]

Exact match after finetuning: 82.54981621203328
F1 score after finetuning: 91.697089044733
