In [5]:
!pip install datasets trl evaluate
!pip install -U bitsandbytes
!pip install -U transformers



In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer
import json
from transformers import pipeline
import torch
from transformers import DistilBertForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
import os
import wandb
from trl import SFTTrainer
import evaluate
import numpy as np

In [7]:
# Loading the dataset
# The MRQA dataset is included in huggingface's datasets library, so we just have to load it
mrqa = load_dataset("mrqa", split="train[:20%]")
# Creating the train-test-validation split
mrqa = mrqa.train_test_split(test_size=0.2)
mrqa["train"] = mrqa["train"].train_test_split(test_size=0.2)
mrqa["val"] = mrqa["train"]["test"]
mrqa["train"] = mrqa["train"]["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

train-00000-of-00009.parquet:   0%|          | 0.00/40.9M [00:00<?, ?B/s]

train-00001-of-00009.parquet:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

train-00002-of-00009.parquet:   0%|          | 0.00/169M [00:00<?, ?B/s]

train-00003-of-00009.parquet:   0%|          | 0.00/331M [00:00<?, ?B/s]

train-00004-of-00009.parquet:   0%|          | 0.00/313M [00:00<?, ?B/s]

train-00005-of-00009.parquet:   0%|          | 0.00/296M [00:00<?, ?B/s]

train-00006-of-00009.parquet:   0%|          | 0.00/96.2M [00:00<?, ?B/s]

train-00007-of-00009.parquet:   0%|          | 0.00/80.2M [00:00<?, ?B/s]

train-00008-of-00009.parquet:   0%|          | 0.00/77.0M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/178M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/516819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/58221 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/9633 [00:00<?, ? examples/s]

In [8]:
# Loading the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-distilled-squad")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [9]:
# Tokenizing the texts with distilbert's own pretrained tokenizer and mapping the answer start and end character indices onto the tokenized text
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    # Tokenizing inputs
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt"
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["detected_answers"]
    start_positions = []
    end_positions = []

    # Mapping the answer start and end characters to the tokenized text
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["char_spans"][0]["start"][0]
        end_char = answer["char_spans"][0]["end"][0]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
tokenized_mrqa = mrqa.map(preprocess_function, batched=True, remove_columns=mrqa["train"].column_names)
tokenized_mrqa.set_format(type="torch")

Map:   0%|          | 0/66152 [00:00<?, ? examples/s]

Map:   0%|          | 0/20673 [00:00<?, ? examples/s]

Map:   0%|          | 0/16539 [00:00<?, ? examples/s]

In [11]:
data_collator = DefaultDataCollator()

In [13]:
bnb_4bit_compute_dtype = "float16"
num_train_epochs = 3
fp16 = False
bf16 = False
per_device_train_batch_size = 4
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
optim = "paged_adamw_32bit"
lr_scheduler_type = "constant"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 25
logging_steps = 5
max_seq_length = None
packing = False


In [14]:
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),
    bnb_4bit_use_double_quant=False,
)

# Configuring parameters of the low-rank adaptation
peft_config = LoraConfig(
    lora_alpha=6,
    lora_dropout=0.15,
    r=2,
    bias="none",
    task_type="QUESTION_ANS",
    target_modules=["q_lin", "k_lin", "v_lin", "ffn.lin1", "ffn.lin2", "attention.out_proj"])

In [16]:
model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased-distilled-squad",
                                                      quantization_config=bnb_config,
                                                      device_map={"": 0})

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

In [17]:
# Testing the model
question, text = "Who was the last pharaoh of ancient Egypt?", "The last pharaoh of ancient Egypt was Cleopatra."

inputs = tokenizer(question, text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits)

predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'cleopatra'

In [26]:
# Defining evaluation metrics
exact_match_metric = evaluate.load("exact_match")
f1_metric = evaluate.load("f1")

def compute_metrics(pred):
    # Get raw predictions and convert them to text
    predictions = pred.predictions
    references = pred.label_ids

    # Convert logits to predictions by taking the argmax
    predicted_ids = np.argmax(predictions, axis=-1)

    # Decode predicted and reference IDs to text using the tokenizer
    decoded_preds = [tokenizer.decode(pred, skip_special_tokens=True) for pred in predicted_ids]
    decoded_refs = [tokenizer.decode(ref, skip_special_tokens=True) for ref in references]

    # Calculate exact match and F1 score
    em_score = exact_match_metric.compute(predictions=decoded_preds, references=decoded_refs)["exact_match"]
    f1_score = f1_metric.compute(predictions=decoded_preds, references=decoded_refs)["f1"]

    return {
        "exact_match": em_score,
        "f1": f1_score,
    }

In [27]:
wandb_project = input("Wandb project: ")
wandb_entity = input("Wandb entity: ")
output_dir_name = "trial_run_local"

training_args = TrainingArguments(
    output_dir=output_dir_name,
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='new_dir',
    push_to_hub=False,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_mrqa["train"],
    eval_dataset=tokenized_mrqa["val"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    peft_config=peft_config,
    compute_metrics=compute_metrics
)

wandb.init(project=wandb_project, entity=wandb_entity)
trainer.train()
wandb.finish()

Wandb project: nagyhazi_colab
Wandb entity: bme_deepl_learning




0,1
train/epoch,▁
train/global_step,▁
train/grad_norm,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,0.12092
train/global_step,500.0
train/grad_norm,3.48619
train/learning_rate,2e-05
train/loss,1.2021


Step,Training Loss,Validation Loss


ValueError: invalid literal for int() with base 10: '[unused117] [unused15] [unused58] [unused35] [unused363] [unused35] [unused9] [unused79] [unused152] [unused114] [unused27] [unused25] [unused62] [unused52] [unused75] [unused174] [unused115] [unused

In [None]:
#TODO: Correct the compute_metrics function

In [23]:
# Uncomment and run if training was stopped earlier
wandb.finish()