# Fine-Tuning a BERT Model on SQuAD 2.0

In [1]:
!pip install transformers



In [2]:
# Download the SQuAD 2.0 training and dev splits into the working directory;
# -O fixes the local file names that the loading cell opens later.
!wget -O train-v2.0.json https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget -O dev-v2.0.json https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

--2024-10-17 05:15:44--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘train-v2.0.json’


2024-10-17 05:16:26 (1.02 MB/s) - ‘train-v2.0.json’ saved [42123633/42123633]

--2024-10-17 05:16:26--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.109.153, 185.199.110.153, 185.199.111.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.109.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘dev-v2.0.json’


2024-10-17 05:16:28 (2.23 MB/s) - ‘dev-v2.0.json’ saved [4370528/4370528]



In [1]:
import os
import pandas as pd 
import numpy as np
import json

from transformers import BertTokenizer
from transformers import BertForQuestionAnswering
from tokenizers import BertWordPieceTokenizer


# Local copies of the SQuAD 2.0 splits.
train_file = 'train-v2.0.json'
validation_file = 'dev-v2.0.json'

# The contexts contain non-ASCII text, so decode explicitly as UTF-8 instead
# of relying on the platform's default locale encoding.
with open(train_file, encoding='utf-8') as f:
    raw_train_data = json.load(f)
with open(validation_file, encoding='utf-8') as f:
    raw_val_data = json.load(f)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Train on a subset only: keep the first TRAIN_ARTICLE_LIMIT articles of the
# training split and write the shortened list back into the loaded dict.
TRAIN_ARTICLE_LIMIT = 200
sampled_train_data = raw_train_data['data'][:TRAIN_ARTICLE_LIMIT]
raw_train_data['data'] = sampled_train_data

Install transformers

Create the tokenizer

In [3]:
# Load the pre-trained BERT tokenizer and save its files locally so the
# vocabulary can be read back from disk by the fast tokenizer below.
slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
save_path = "bert_base_uncased/"
os.makedirs(save_path, exist_ok=True)  # safe to re-run: no error if it already exists
slow_tokenizer.save_pretrained(save_path)

# Build the fast WordPiece tokenizer from the vocabulary file just saved.
tokenizer = BertWordPieceTokenizer(os.path.join(save_path, "vocab.txt"), lowercase=True)
# Length every encoded example is padded to; SquadDataset.preprocess marks
# longer examples as invalid.
max_len = 384


Create a class to hold the data from SQuAD

In [39]:
class SquadDataset:
    """A single SQuAD question/context pair plus its model-ready encoding.

    Despite the name, one instance represents one example. Call
    :meth:`preprocess` after construction to fill in the encoded fields.

    Args:
        context: Paragraph text.
        question: Question text.
        basic_answer: Text of the first reference answer, or ``None`` when the
            question has no answer.
        more_answers: List of all reference answer texts, or ``None``.
        start_idx: Character offset of ``basic_answer`` within ``context``, or
            ``None`` when there is no answer.

    Attributes filled in by :meth:`preprocess`:
        end_idx: Character offset one past the last answer character.
        start_idx_token, end_idx_token: Indices of the first and last context
            token that overlap the answer span; both 0 when there is no answer.
        offsets: Per-token ``(start, end)`` character spans of the context.
        input_ids, attention_mask, token_type_ids: Lists of length ``max_len``.
        validExample: ``False`` when the example must be skipped.
    """

    def __init__(self, context, question, basic_answer, more_answers, start_idx):
        self.context = context
        self.question = question
        self.basic_answer = basic_answer
        self.more_answers = more_answers
        self.start_idx = start_idx
        self.end_idx = None
        # Holds the character offset until preprocess() replaces it with a token index.
        self.start_idx_token = start_idx
        self.end_idx_token = None
        self.offsets = None
        self.input_ids = None
        self.attention_mask = None
        self.token_type_ids = None
        self.validExample = True

    def preprocess(self):
        """Tokenise the example and compute padded inputs and answer token span.

        Uses the module-level ``tokenizer`` and ``max_len``. Sets
        ``validExample`` to ``False`` (and returns early) when the answer span
        runs past the context, when no context token overlaps the answer, or
        when context + question exceed ``max_len`` tokens.
        """
        # Collapse every run of whitespace to a single space.
        # NOTE(review): start_idx was measured against the un-normalised context;
        # if collapsing shortens the text before the answer, the span drifts.
        self.context = " ".join(str(self.context).split())
        self.question = " ".join(str(self.question).split())

        context_encoding = tokenizer.encode(self.context)

        if self.basic_answer is not None:
            self.basic_answer = " ".join(str(self.basic_answer).split())
            self.end_idx = self.start_idx + len(self.basic_answer)
            # end_idx is exclusive, so an answer that finishes on the very last
            # character has end_idx == len(context) and is still in range.
            if self.end_idx > len(self.context):
                self.validExample = False
                return

            # A token belongs to the answer when its character span overlaps
            # [start_idx, end_idx); zero-width spans never overlap.
            answer_token_ids = [
                idx
                for idx, (tok_start, tok_end) in enumerate(context_encoding.offsets)
                if max(tok_start, self.start_idx) < min(tok_end, self.end_idx)
            ]
            if not answer_token_ids:
                self.validExample = False
                return
            self.start_idx_token = answer_token_ids[0]
            self.end_idx_token = answer_token_ids[-1]
        else:
            # No answer: aim both targets at position 0 (the leading special
            # token, [CLS] for BERT) instead of leaving None, which cannot be
            # turned into an integer label.
            self.start_idx_token = 0
            self.end_idx_token = 0
        self.offsets = context_encoding.offsets

        question_encoding = tokenizer.encode(self.question)
        # Drop the question's first id so the leading special token is not repeated.
        question_ids = question_encoding.ids[1:]

        # Context comes first (segment 0), question second (segment 1).
        self.input_ids = context_encoding.ids + question_ids
        self.attention_mask = [1] * len(self.input_ids)
        self.token_type_ids = [0] * len(context_encoding.ids) + [1] * len(question_ids)

        # Right-pad to max_len; anything longer cannot be used.
        padding_length = max_len - len(self.input_ids)
        if padding_length > 0:
            self.input_ids = self.input_ids + ([0] * padding_length)
            self.attention_mask = self.attention_mask + ([0] * padding_length)
            self.token_type_ids = self.token_type_ids + ([0] * padding_length)
        elif padding_length < 0:
            self.validExample = False
            return

Function that builds preprocessed examples from the SQuAD JSON data

In [30]:
def create_squad_examples(raw_data):
    """Turn a SQuAD-format dict into a flat list of preprocessed examples.

    Walks ``raw_data["data"]`` -> ``"paragraphs"`` -> ``"qas"`` and builds one
    ``SquadDataset`` per question. When ``qa["answers"]`` is empty/falsy the
    example is created with no answer, answer list or start offset.
    ``preprocess()`` is called on every example before it is collected, and
    examples are returned whether or not preprocessing marked them valid.
    """
    examples = []
    for article in raw_data["data"]:
        for paragraph in article["paragraphs"]:
            passage = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                if not qa["answers"]:
                    example = SquadDataset(passage, question_text, None, None, None)
                else:
                    first_answer = qa["answers"][0]
                    example = SquadDataset(
                        passage,
                        question_text,
                        first_answer["text"],
                        [answer["text"] for answer in qa["answers"]],
                        first_answer["answer_start"],
                    )
                example.preprocess()
                examples.append(example)
    return examples

Function that creates two lists of arrays, one for the inputs and one for the targets

In [6]:
def create_inputs_targets(squad_examples):
    """Stack the valid examples into model input arrays and target arrays.

    Args:
        squad_examples: Iterable of preprocessed examples exposing
            ``validExample``, ``input_ids``, ``attention_mask``,
            ``token_type_ids``, ``start_idx_token`` and ``end_idx_token``.

    Returns:
        ``(x, y)`` where ``x = [input_ids, attention_mask, token_type_ids]``
        and ``y = [start_idx_token, end_idx_token]``; every entry is an
        ``int64`` NumPy array with one row per valid example. Examples whose
        ``validExample`` is not ``True`` are skipped.
    """
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")
    target_keys = ("start_idx_token", "end_idx_token")
    columns = {key: [] for key in feature_keys + target_keys}

    for item in squad_examples:
        if item.validExample is not True:
            continue
        for key in feature_keys:
            columns[key].append(getattr(item, key))
        for key in target_keys:
            value = getattr(item, key)
            # Examples without an answer carry None; use position 0 so the
            # label is a well-defined integer rather than NaN.
            columns[key].append(0 if value is None else value)

    # Integer dtype is required: float16 represents integers exactly only up
    # to 2048, so larger vocabulary ids would be silently rounded.
    arrays = {key: np.array(values, dtype=np.int64) for key, values in columns.items()}

    x = [arrays[key] for key in feature_keys]
    y = [arrays[key] for key in target_keys]
    return x, y

In [7]:
# Build preprocessed SquadDataset examples for both splits. The lists include
# examples that preprocessing flagged as invalid; filtering happens later in
# create_inputs_targets.
data = create_squad_examples(raw_train_data)  # training examples (from the sub-sampled articles)

val_data = create_squad_examples(raw_val_data)  # validation examples (full dev split)

Check what is saved

In [8]:
# Tabulate the attributes of every training example and show the text columns
# of the first few rows as a sanity check.
example_records = [vars(example) for example in data]
train_data = pd.DataFrame.from_records(example_records)
preview_columns = ["context", "question", "basic_answer"]
train_data[preview_columns].head()

Unnamed: 0,context,question,basic_answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,in the late 1990s
1,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,singing and dancing
2,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,2003
3,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"Houston, Texas"
4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,late 1990s


In [9]:
# Convert the example lists into stacked arrays:
# x_* = [input_ids, attention_mask, token_type_ids], y_* = [start token, end token].
x_train, y_train = create_inputs_targets(data)  # training inputs / targets

x_eval, y_eval = create_inputs_targets(val_data)  # validation inputs / targets

In [10]:
# Run configuration. batch_size is passed to the DataLoaders built below.
# NOTE(review): doc_stride, max_seq_length and max_query_length are not
# referenced anywhere else in this notebook; the encoded length is governed by
# max_len in the tokenizer cell.
doc_stride = 64
max_seq_length = 128
max_query_length = 32
batch_size = 16

### Create Datasets

In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Wrap the training arrays as tensors: ids and labels as int64, the attention
# mask as float.
train_input_ids = torch.tensor(x_train[0], dtype=torch.int64)
train_attention_mask = torch.tensor(x_train[1], dtype=torch.float)
train_token_type_ids = torch.tensor(x_train[2], dtype=torch.int64)
train_start_positions = torch.tensor(y_train[0], dtype=torch.int64)
train_end_positions = torch.tensor(y_train[1], dtype=torch.int64)

train_data = TensorDataset(
    train_input_ids,
    train_attention_mask,
    train_token_type_ids,
    train_start_positions,
    train_end_positions,
)

# Batches are drawn in random order during training.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

In [12]:
# A second randomly-sampled loader over the training tensors. The training
# loop iterates train_dataloader from the previous cell; train_data_loader is
# not referenced elsewhere in this notebook.
train_sampler = RandomSampler(train_data)
train_data_loader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Wrap the validation arrays as tensors with the same dtypes as the training set.
eval_input_ids = torch.tensor(x_eval[0], dtype=torch.int64)
eval_attention_mask = torch.tensor(x_eval[1], dtype=torch.float)
eval_token_type_ids = torch.tensor(x_eval[2], dtype=torch.int64)
eval_start_positions = torch.tensor(y_eval[0], dtype=torch.int64)
eval_end_positions = torch.tensor(y_eval[1], dtype=torch.int64)

eval_data = TensorDataset(
    eval_input_ids,
    eval_attention_mask,
    eval_token_type_ids,
    eval_start_positions,
    eval_end_positions,
)

# Sequential order keeps batch rows aligned with the order of the valid examples.
eval_sampler = SequentialSampler(eval_data)
validation_dataloader = DataLoader(eval_data, batch_size=batch_size, sampler=eval_sampler)

### Check for GPU availability

In [13]:
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Initialize model

In [14]:
# Load the pre-trained bert-base-uncased weights into a question-answering
# model and move it to the selected device. The load warning below reports
# that the qa_outputs layer is newly initialised, i.e. it is what gets trained.
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device=device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Adam over all model parameters, with learning rate, betas and eps set
# explicitly rather than left at the optimizer defaults.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)

### Training model

Function for text normalization

In [16]:
import string
import re

def normalize_text(text):
    """Canonicalise an answer string for exact-match comparison.

    Lower-cases the text, deletes every character in ``string.punctuation``,
    replaces the stand-alone words "a", "an" and "the" with a space, and
    finally collapses all whitespace runs to single spaces.
    """
    lowered = text.lower()
    without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))
    article_pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
    without_articles = article_pattern.sub(" ", without_punctuation)
    return " ".join(without_articles.split())

In [17]:
from tqdm import tqdm  # imported once, not on every epoch

epochs = 1

for epoch in range(1, epochs + 1):
    print("Training epoch ", str(epoch))

    # ------------------------------ training ------------------------------
    model.train()
    tr_loss = 0
    nb_tr_steps = 0

    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Training Steps"):
        batch = tuple(t.to(device) for t in batch)

        # Tensor order matches the TensorDataset construction.
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1],
                  'token_type_ids':  batch[2],
                  'start_positions': batch[3],
                  'end_positions':   batch[4]}

        optimizer.zero_grad()

        # With start/end positions supplied, the first output is the loss.
        outputs = model(**inputs)
        loss = outputs[0]

        loss.backward()
        optimizer.step()
        tr_loss += loss.item()
        nb_tr_steps += 1

    # max(..., 1) avoids a ZeroDivisionError if the loader yielded no batches.
    print(f"\nTraining loss={tr_loss / max(nb_tr_steps, 1):.4f}")


    # ============================================ VALIDATION ==========================================================
    model.eval()
    current_query = 0
    correct_ans = 0
    # Same filter as create_inputs_targets, so the i-th prediction row
    # corresponds to valid_examples[i] (the loader is sequential).
    valid_examples = [x for x in val_data if x.validExample is True]
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)

        input_ids, attention_mask, token_type_ids, start_positions, end_positions = batch

        with torch.no_grad():
            start_logits, end_logits = model(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             token_type_ids=token_type_ids, return_dict=False)

        pred_start, pred_end = start_logits.detach().cpu().numpy(), end_logits.detach().cpu().numpy()

        for start, end in zip(pred_start, pred_end):
            squad_eg = valid_examples[current_query]
            current_query += 1
            offsets = squad_eg.offsets
            start = np.argmax(start)
            end = np.argmax(end)
            # offsets only cover the context tokens; a start predicted in the
            # question/padding region cannot be mapped back to text.
            if start >= len(offsets):
                continue
            pred_char_start = offsets[start][0]
            if end < len(offsets):
                pred_char_end = offsets[end][1]
                pred_ans = squad_eg.context[pred_char_start:pred_char_end]
            else:
                pred_ans = squad_eg.context[pred_char_start:]
            normalized_pred_ans = normalize_text(pred_ans)
            # Unanswerable questions have more_answers = None; their reference
            # answer is the empty string, so an empty prediction counts as correct.
            gold_answers = squad_eg.more_answers or [""]
            normalized_true_ans = [normalize_text(x) for x in gold_answers]
            if normalized_pred_ans in normalized_true_ans:
                correct_ans += 1
    acc = correct_ans / max(len(y_eval[0]), 1)

    print(f"\nAccuracy score={acc:.2f}\n")

Training epoch  1


Training Steps:   0%|          | 4/3616 [04:04<61:18:32, 61.11s/it]


KeyboardInterrupt: 

### Testing model

In [41]:
# Hand-written sample in SQuAD layout used to try out the model. Every
# "answers" field is an empty string, which create_squad_examples treats as
# "no reference answer". Note: this rebinds `data`, which earlier held the
# list of training examples.
data = {
    "data": [
        {
            "title": "Universitas Gadjah Mada",
            "paragraphs": [
                {
                    "context": "Universitas Gadjah Mada (UGM) is a public research university located in "
                    "Sleman, Special Region of Yogyakarta, Indonesia. Officially founded on 19 December 1949, "
                    "Gadjah Mada University is one of the oldest and largest institutions of higher education "
                    "in the country, and has been credited as one of the best universities in Indonesia. In the "
                    "2024 QS World University Rankings, UGM is ranked 2nd in Indonesia and 263rd in the world. "
                    "During the period when native education was often restricted, the institution was the first "
                    "to open its medicine to native Indonesians when it was founded in the 1940s under Dutch rule. "
                    "Comprising 18 faculties and 27 research centers, UGM offers 68 undergraduate, 23 diplomas, "
                    "104 master's and specialist, 43 doctorates, and 4 clusters of post-doctoral study programs. "
                    "The university has enrolled approximately 55,000 students, 1,187 foreign students, and "
                    "has 2,500 faculty members. UGM maintains a campus of 360 acres (150 ha), with facilities "
                    "that include a stadium and a fitness center. The university is named after Gajah Mada, "
                    "the leader of the Majapahit Empire of Java in the 14th century, who is also considered "
                    "to be the nation's first unifier by some historians; The spelling of the university's "
                    "name still reflects the old Dutch-era spelling.",
                    "qas": [
                        {
                            "question": "When was UGM officially founded?",
                            "id": "Q1",
                            "answers": ""
                        },
                        {
                            "question": "What is the ranking of UGM in Indonesia according to the 2024 QS World University Rankings?",
                            "id": "Q2",
                            "answers": ""
                        },
                        {
                            "question": "How many faculties does UGM have?",
                            "id": "Q3",
                            "answers": ""
                        },
                        {
                            "question": "How many research centers are affiliated with UGM?",
                            "id": "Q4",
                            "answers": ""
                        },
                        {
                            "question": "Approximately how many students are enrolled at UGM?",
                            "id": "Q5",
                            "answers": ""
                        },
                        {
                            "question": "Who is UGM named after?",
                            "id": "Q6",
                            "answers": ""
                        },
                        {
                            "question": "What year did UGM open its medicine to native Indonesians?",
                            "id": "Q7",
                            "answers": ""
                        },
                        {
                            "question": "How large is UGM's campus?",
                            "id": "Q8",
                            "answers": ""
                        }
                    ]
                }
            ]
        }
    ]
}


model.eval()
test_samples = create_squad_examples(data)
x_test, _ = create_inputs_targets(test_samples)
# create_inputs_targets drops examples flagged invalid, so apply the same
# filter here; otherwise prediction row i could be paired with the wrong sample.
valid_test_samples = [sample for sample in test_samples if sample.validExample is True]

# Inference only: skip gradient tracking to save memory.
with torch.no_grad():
    pred_start, pred_end = model(torch.tensor(x_test[0], dtype=torch.int64, device=device),
                                 torch.tensor(x_test[1], dtype=torch.float, device=device),
                                 torch.tensor(x_test[2], dtype=torch.int64, device=device), return_dict=False)
pred_start, pred_end = pred_start.detach().cpu().numpy(), pred_end.detach().cpu().numpy()

for test_sample, start, end in zip(valid_test_samples, pred_start, pred_end):
    offsets = test_sample.offsets
    start = np.argmax(start)
    end = np.argmax(end)
    # offsets only cover the context tokens; a start outside them cannot be
    # mapped back to text, so that question is skipped.
    if start >= len(offsets):
        continue
    pred_char_start = offsets[start][0]
    if end < len(offsets):
        pred_ans = test_sample.context[pred_char_start:offsets[end][1]]
    else:
        pred_ans = test_sample.context[pred_char_start:]
    print("Q: " + test_sample.question)
    print("A: " + pred_ans)
    print("----------------------------------------\n")