In [1]:
from training_utils import *
from datasets import load_dataset
from transformers import AutoTokenizer
import json
from transformers import pipeline
import torch
from transformers import DistilBertForQuestionAnswering, TrainingArguments, Trainer, DefaultDataCollator, BitsAndBytesConfig
from peft import LoraConfig, PeftModel
import os
import wandb
from trl import SFTTrainer
import evaluate
import numpy as np
from tqdm.auto import tqdm
import collections
from torch.utils.data import DataLoader




Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [2]:
# Loading the dataset
# The MRQA dataset is included in Hugging Face's datasets library, so we just have to load it.
# Only the first 5% of the train split is loaded (a smaller fraction than in the final run)
# because training had to be done on a local GPU.
SPLIT_SEED = 42  # fixed seed so the splits are identical after every kernel restart

mrqa_subset = load_dataset("mrqa", split="train[:5%]")

# Creating the train-test-validation split
# Step 1: hold out 20% of the loaded subset as the test split.
mrqa = mrqa_subset.train_test_split(test_size=0.2, seed=SPLIT_SEED)

# Step 2: hold out 20% of the remaining training data as the validation split.
# An intermediate variable is used so that `mrqa["train"]` never temporarily holds
# a whole DatasetDict.
train_val = mrqa["train"].train_test_split(test_size=0.2, seed=SPLIT_SEED)
mrqa["train"] = train_val["train"]
mrqa["val"] = train_val["test"]

In [3]:
# Tokenizer of the DistilBERT checkpoint that was distilled on SQuAD
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-distilled-squad")

# Run the training preprocessing over `mrqa` in batches; the original columns
# are dropped so that only the preprocessed features remain.
tokenized_mrqa = mrqa.map(
    preprocess_training_examples,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=mrqa["train"].column_names,
)
tokenized_mrqa.set_format(type="torch")

# Run the validation preprocessing over the test split, which is the data
# used for evaluation.
tokenized_eval = mrqa["test"].map(
    preprocess_validation_examples,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=mrqa["test"].column_names,
)
tokenized_eval.set_format(type="torch")


Map:   0%|          | 0/16537 [00:00<?, ? examples/s]

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

Map:   0%|          | 0/4135 [00:00<?, ? examples/s]

Map:   0%|          | 0/5169 [00:00<?, ? examples/s]

In [4]:
# Data collator (library default)
data_collator = DefaultDataCollator()

# Quantization settings: load weights in 4-bit NF4, compute in float16,
# and leave double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Configuring parameters of the low-rank adaptation (LoRA)
# `target_modules` must use the layer names of the model being adapted. In DistilBERT
# the attention output projection is called `out_lin`; the previous entry
# `attention.out_proj` does not exist in this architecture, so it matched nothing and
# that layer was silently left without an adapter.
peft_config = LoraConfig(
    r=2,
    lora_alpha=6,
    lora_dropout=0.15,
    bias="none",
    task_type="QUESTION_ANS",
    target_modules=["q_lin", "k_lin", "v_lin", "ffn.lin1", "ffn.lin2", "attention.out_lin"],
)

# Baseline model: DistilBERT fine-tuned on the SQuAD dataset, loaded with the
# quantization settings in `bnb_config` and mapped entirely onto device 0.
model = DistilBertForQuestionAnswering.from_pretrained(
    "distilbert/distilbert-base-uncased-distilled-squad",
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [5]:
# Baseline metrics on the test split, computed before any further fine-tuning
pre_training_metrics = eval_function(tokenized_eval, model, mrqa["test"])
print(
    f"Exact match before finetuning: {pre_training_metrics['exact_match']}\n"
    f"F1 score before finetuning: {pre_training_metrics['f1']}, "
    f"BLEU score before finetuning: {pre_training_metrics['bleu']}"
)

  0%|          | 0/82 [00:00<?, ?it/s]

  0%|          | 0/5169 [00:00<?, ?it/s]

Exact match before finetuning: 82.35635519442832
F1 score before finetuning: 91.48940098712508, BLEU score before finetuning: 0.7410732358191019
