# **Install Library**

In [None]:
!pip install -q transformers

# **Prepare Dataset**

**About Dataset:** Stanford Question Answering Dataset (SQuAD) 2.0  is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.  

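The original SQuAD files come as two sets (training and development). In them, each paragraph has two attributes: `context` (the paragraph or text from which the questions are asked) and `qas` (a list of questions and answers). The most important attributes of each `qas` entry are `question` and `answers` (a list of answer texts with their `answer_start` character offsets). The Hugging Face `squad_v2` dataset loaded below flattens this into one record per question with the fields `id`, `title`, `context`, `question` and `answers` (a dict holding the parallel lists `text` and `answer_start`).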


In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [None]:
from datasets import load_dataset

# Load SQuAD 2.0 ('squad_v2') through the Hugging Face `datasets` library
dataset = load_dataset('squad_v2')

# Keep handles to the two splits used in the rest of the notebook:
# 'train' for fine-tuning and 'validation' for evaluation
train_dataset = dataset['train']
dev_dataset = dataset['validation']

# Show the first record of each split to inspect the record layout
print("Train Dataset Sample:")
print(train_dataset[0])

print("\nValidation Dataset Sample:")
print(dev_dataset[0])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

Train Dataset Sample:
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".', 'question': 'When did Beyonce start becoming popular?', 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

Validation Dataset Sample:
{'id': '56ddde6b9a695914005b9628', 'title': 'Normans', 'cont

In [None]:
def extract_squad_data(dataset):
    """
    Flatten SQuAD-style records into three parallel lists.

    Each record must provide 'context' and 'question' plus an answer field
    holding the parallel lists 'text' and 'answer_start'. The answer field is
    'plausible_answers' when the record has that key, otherwise 'answers'.
    One output row is produced per (text, answer_start) pair, so a record
    with several answers is repeated and a record with none contributes
    nothing.

    Returns:
        (contexts, questions, answers) where answers[i] is a dict
        {'text': ..., 'answer_start': ...} belonging to contexts[i] and
        questions[i].
    """
    contexts, questions, answers = [], [], []

    for record in dataset:
        passage = record['context']
        query = record['question']
        field = 'plausible_answers' if 'plausible_answers' in record else 'answers'
        spans = record[field]

        for span_text, span_start in zip(spans['text'], spans['answer_start']):
            contexts.append(passage)
            questions.append(query)
            answers.append({'text': span_text, 'answer_start': span_start})

    return contexts, questions, answers

# Flatten both splits into parallel (context, question, answer) lists;
# index i of each list describes the same example
train_contexts, train_questions, train_answers = extract_squad_data(train_dataset)
valid_contexts, valid_questions, valid_answers = extract_squad_data(dev_dataset)

# Preview the first flattened training example
print("Train Dataset Sample:")
print("Context:", train_contexts[0])
print("Question:", train_questions[0])
print("Answer:", train_answers[0])

# Preview the first flattened validation example
print("\nValidation Dataset Sample:")
print("Context:", valid_contexts[0])
print("Question:", valid_questions[0])
print("Answer:", valid_answers[0])


Train Dataset Sample:
Context: Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question: When did Beyonce start becoming popular?
Answer: {'text': 'in the late 1990s', 'answer_start': 269}

Validation Dataset Sample:
Context: The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave thei

In [None]:
from pprint import pprint
import random

# Set the seed for reproducibility
random.seed(0)

# Function to print random samples from the dataset
def print_random_samples(contexts, questions, answers, num_samples=5):
    """
    Print randomly chosen (question, context, answer) triples.

    Args:
        contexts: list of context passages.
        questions: list of questions, parallel to `contexts`.
        answers: list of answers, parallel to `contexts`.
        num_samples: how many examples to print. Values larger than
            len(contexts) are clamped to len(contexts) instead of letting
            random.sample raise ValueError.

    Indices are drawn without replacement from the module-level `random`
    generator, so the selection depends on the current seed/state.
    """
    # random.sample requires k <= population size; clamp so small datasets work
    sample_count = min(num_samples, len(contexts))
    indices = random.sample(range(0, len(contexts)), sample_count)
    for index in indices:
        print(f'Q:  {questions[index]}\n')
        print("Context:\n")
        # pprint wraps the long passage over several lines for readability
        pprint(contexts[index])
        print(f"\nAnswer: [{answers[index]}]\n")
        print("-" * 100)

# Extract data from the train and validation sets.
# NOTE(review): this repeats the extract_squad_data calls already made in the
# earlier preview cell and rebinds the same six names to freshly built lists.
train_contexts, train_questions, train_answers = extract_squad_data(train_dataset)
valid_contexts, valid_questions, valid_answers = extract_squad_data(dev_dataset)

# Print some instances of the training set (5 by default; selection follows
# the `random` module state seeded above)
print("Random Samples from Training Set:")
print_random_samples(train_contexts, train_questions, train_answers)


Random Samples from Training Set:
Q:  What day marked the beginning of New Haven being overwhelmed by 12,000 individuals protesting the Black Panther trials? 

Context:

('In 1970, the New Haven Black Panther trials took place, the largest and '
 'longest trials in Connecticut history. Black Panther Party co-founder Bobby '
 'Seale and ten other Party members were tried for murdering an alleged '
 'informant. Beginning on May Day, the city became a center of protest for '
 '12,000 Panther supporters, college students, and New Left activists '
 '(including Jean Genet, Benjamin Spock, Abbie Hoffman, Jerry Rubin, and John '
 'Froines), who amassed on the New Haven Green, across the street from where '
 'the trials were being held. Violent confrontations between the demonstrators '
 'and the New Haven police occurred, and several bombs were set off in the '
 'area by radicals. The event became a rallying point for the New Left and '
 'critics of the Nixon Administration.')

Answer: [{'text

In [None]:
def apply_end_index(answers: list, contexts: list) -> list:
    '''
    Return new answer dicts extended with an exclusive `answer_end` offset.

    The dataset already provides each answer's character start offset
    (`answer_start`); the end offset is derived as
    `answer_start + len(text)`, i.e. the index one past the last answer
    character.

    Args:
        answers: list of dicts with the keys 'text' and 'answer_start'.
        contexts: unused by the computation; kept so existing callers that
            pass the parallel context list keep working.

    Returns:
        A new list of new dicts. The input list and its dicts are left
        untouched (the previous shallow `list.copy()` still mutated the
        caller's dicts in place).
    '''
    return [
        {**answer, 'answer_end': answer['answer_start'] + len(answer['text'])}
        for answer in answers
    ]

# Add the exclusive 'answer_end' character offset to every answer dict of both splits
train_answers = apply_end_index(train_answers, train_contexts)
valid_answers = apply_end_index(valid_answers, valid_contexts)

# **Encode the dataset**

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the 'distilbert-base-uncased' checkpoint. A fast tokenizer is
# requested because encode_data calls `char_to_token` on the encodings, which
# Hugging Face documents as available only for fast tokenizers.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def encode_data(contexts: list, questions: list, answers: list) -> dict:
    """
    Tokenize (context, question) pairs and attach answer token positions.

    Uses the module-level `tokenizer`. Contexts are passed as the first
    sequence, so `char_to_token(i, char)` (default sequence index 0) maps
    context character offsets to token indices.

    Args:
        contexts: list of context strings.
        questions: list of question strings, parallel to `contexts`.
        answers: list of dicts with 'answer_start' (inclusive character
            offset) and 'answer_end' (exclusive character offset).

    Returns:
        The tokenizer's encodings, extended with the plain Python lists
        'start_positions' and 'end_positions' (one token index per example).
        When an answer's start cannot be mapped to a token (treated as cut
        off by truncation), both positions are set to
        `tokenizer.model_max_length`.
    """
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, return_tensors="pt")

    # label used for answers that did not survive truncation
    truncated_label = tokenizer.model_max_length

    # add start and end positions to encodings
    start_positions, end_positions = list(), list()

    for index, answer in enumerate(answers):
        first_char = answer['answer_start']
        start_value = encodings.char_to_token(index, first_char)

        # if start position is None, the answer passage has been truncated;
        # label both ends consistently instead of pairing an out-of-range
        # start with an unrelated in-range end token
        if start_value is None:
            start_positions.append(truncated_label)
            end_positions.append(truncated_label)
            continue

        # 'answer_end' is exclusive, so the last answer character sits at
        # answer_end - 1. Starting there avoids labelling the token *after*
        # the answer (e.g. trailing punctuation) as the end token.
        # If that character maps to no token (whitespace or truncated tail),
        # walk left; the walk is bounded by the answer start, which is known
        # to map to a token, so the loop always terminates.
        end_value = None
        char_index = answer['answer_end'] - 1
        while end_value is None and char_index >= first_char:
            end_value = encodings.char_to_token(index, char_index)
            char_index -= 1

        # empty answer text: no character to map, fall back to the start token
        if end_value is None:
            end_value = start_value

        start_positions.append(start_value)
        end_positions.append(end_value)

    encodings.update({
        'start_positions': start_positions, 'end_positions': end_positions
    })

    return encodings



# Tokenize both splits and attach start/end token labels
train_encodings = encode_data(train_contexts, train_questions, train_answers)
valid_encodings = encode_data(valid_contexts, valid_questions, valid_answers)

# Show which fields the encodings now carry
train_encodings.keys()

dict_keys(['input_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [None]:
# Remove the raw text lists from the namespace now that they have been encoded
del train_contexts, train_questions, train_answers
del valid_contexts, valid_questions, valid_answers

In [None]:
import torch


class SquadDataset(torch.utils.data.Dataset):
    """
    Map-style dataset over a dict-like of per-example encodings.

    `encodings` maps field names to indexable containers (tensors or Python
    lists) whose first dimension is the example index; it must contain
    'input_ids', which defines the dataset length.
    """

    def __init__(self, encodings: dict) -> None:
        self.encodings = encodings

    def __getitem__(self, index: int) -> dict:
        """Return one example as a dict of independent tensors, one per field."""
        item = {}
        for key, val in self.encodings.items():
            value = val[index]
            if isinstance(value, torch.Tensor):
                # torch.tensor(tensor) emits a copy-construct UserWarning on
                # every access; clone().detach() is the documented equivalent
                item[key] = value.clone().detach()
            else:
                item[key] = torch.tensor(value)
        return item

    def __len__(self) -> int:
        return len(self.encodings['input_ids'])


# Wrap the encodings so a DataLoader can index them example by example
train_ds = SquadDataset(train_encodings)
valid_ds = SquadDataset(valid_encodings)

In [None]:
# Only unbinds the names: train_ds / valid_ds keep their own reference to the
# encodings (SquadDataset stores them as self.encodings), so the data stays alive
del train_encodings, valid_encodings

# **Fine-tune the QuestionAnswering Transformer Model**

In [None]:
from transformers import AutoModelForQuestionAnswering

# Load the pretrained DistilBERT encoder with a question-answering head on top
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-uncased')

# pick the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# activate training mode of model; as the last expression of the cell, the
# returned module is also what the notebook displays
model.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [None]:
from collections import defaultdict

import torch
from torch.optim.optimizer import Optimizer


class Lookahead(Optimizer):
    r"""PyTorch implementation of the lookahead wrapper.
    Lookahead Optimizer: https://arxiv.org/abs/1907.08610

    Wraps an existing ("inner") optimizer. Every call to `step` runs one inner
    step on the fast weights; every `la_steps` calls the parameters are
    interpolated with the cached slow weights and the cache is refreshed.

    Notes:
        * `Optimizer.__init__` is not called; `param_groups` is forwarded to
          the inner optimizer and `state` holds only the lookahead caches.
        * `state_dict` / `load_state_dict` forward to the inner optimizer
          only, so the cached slow weights are not part of a checkpoint.
        * The "pullback" and "reset" momentum modes read/write
          `optimizer.state[p]["momentum_buffer"]`, so they require an inner
          optimizer that keeps such a buffer.
    """

    def __init__(self, optimizer, la_steps=5, la_alpha=0.8, pullback_momentum="none"):
        """optimizer: inner optimizer
        la_steps (int): number of lookahead steps
        la_alpha (float): linear interpolation factor. 1.0 recovers the inner optimizer.
        pullback_momentum (str): change to inner optimizer momentum on interpolation update
            ("reset", "pullback" or "none", case-insensitive)
        """
        self.optimizer = optimizer
        self._la_step = 0  # counter for inner optimizer
        self.la_alpha = la_alpha
        self._total_la_steps = la_steps
        pullback_momentum = pullback_momentum.lower()
        assert pullback_momentum in ["reset", "pullback", "none"]
        self.pullback_momentum = pullback_momentum

        self.state = defaultdict(dict)

        # Cache the current optimizer parameters (the initial slow weights)
        for group in optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['cached_params'] = p.data.clone()
                if self.pullback_momentum == "pullback":
                    param_state['cached_mom'] = torch.zeros_like(p.data)

    def __getstate__(self):
        return {
            'state': self.state,
            'optimizer': self.optimizer,
            'la_alpha': self.la_alpha,
            '_la_step': self._la_step,
            '_total_la_steps': self._total_la_steps,
            'pullback_momentum': self.pullback_momentum
        }

    def zero_grad(self):
        """Clear the gradients through the inner optimizer."""
        self.optimizer.zero_grad()

    def get_la_step(self):
        """Return how many inner steps were taken since the last interpolation."""
        return self._la_step

    def state_dict(self):
        """Return the inner optimizer's state dict (lookahead caches are not included)."""
        return self.optimizer.state_dict()

    def load_state_dict(self, state_dict):
        """Load a state dict into the inner optimizer."""
        self.optimizer.load_state_dict(state_dict)

    def _backup_and_load_cache(self):
        """Useful for performing evaluation on the slow weights (which typically generalize better)

        Saves the current (fast) parameters as 'backup_params' and overwrites
        the parameters with the cached slow weights.
        """
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                param_state['backup_params'] = p.data.clone()
                p.data.copy_(param_state['cached_params'])

    def _clear_and_load_backup(self):
        """Restore the parameters saved by `_backup_and_load_cache` and drop the backup."""
        for group in self.optimizer.param_groups:
            for p in group['params']:
                param_state = self.state[p]
                p.data.copy_(param_state['backup_params'])
                del param_state['backup_params']

    @property
    def param_groups(self):
        return self.optimizer.param_groups

    def step(self, closure=None):
        """Performs a single Lookahead optimization step.
        Arguments:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        Returns:
            Whatever the inner optimizer's `step` returns.
        """
        loss = self.optimizer.step(closure)
        self._la_step += 1

        if self._la_step >= self._total_la_steps:
            self._la_step = 0
            # Lookahead and cache the current optimizer parameters
            for group in self.optimizer.param_groups:
                for p in group['params']:
                    param_state = self.state[p]
                    # p <- la_alpha * fast + (1 - la_alpha) * slow   (crucial line)
                    p.data.mul_(self.la_alpha).add_(param_state['cached_params'], alpha=1.0 - self.la_alpha)
                    param_state['cached_params'].copy_(p.data)
                    if self.pullback_momentum == "pullback":
                        internal_momentum = self.optimizer.state[p]["momentum_buffer"]
                        # keyword `alpha` replaces the deprecated add_(Number, Tensor)
                        # overload and matches the parameter update above
                        internal_momentum.mul_(self.la_alpha).add_(
                            param_state["cached_mom"], alpha=1.0 - self.la_alpha)
                        self.optimizer.state[p]["momentum_buffer"] = internal_momentum
                        # store a copy: caching the buffer object itself would alias the
                        # two, and the next pullback would blend the buffer with itself
                        param_state["cached_mom"] = internal_momentum.clone()
                    elif self.pullback_momentum == "reset":
                        self.optimizer.state[p]["momentum_buffer"] = torch.zeros_like(p.data)

        return loss

In [None]:
# `transformers.AdamW` is deprecated and no longer shipped by recent transformers
# releases (the install cell above is unpinned), so use PyTorch's implementation.
from torch.optim import AdamW

# Inner AdamW optimizer wrapped in Lookahead. eps and weight_decay are passed
# explicitly to keep the values the deprecated transformers class used by
# default (eps=1e-6, weight_decay=0.0), so the optimisation behaviour is unchanged.
# Note that weight_decay=0.0 means no weight decay is actually applied; raise it
# if regularisation is wanted.
base  = AdamW(model.parameters(), lr=1e-4, eps=1e-6, weight_decay=0.0)
optim = Lookahead(base)



In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

import warnings
warnings.simplefilter("ignore")


# initialize data loader for training data (batches of 16, reshuffled every epoch)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True)


# fine-tune for 3 passes over the training set
for epoch in range(3):
    # set model to train mode
    model.train()

    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        # reset the gradients accumulated in the previous step
        optim.zero_grad()

        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # train model on batch and return outputs (incl. loss)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                        start_positions=start_positions, end_positions=end_positions)
        # the loss is read from the first element of the outputs
        loss = outputs[0]
        # backpropagate: compute the gradient of the loss for every parameter that requires grad
        loss.backward()

        # update parameters: `optim` is the Lookahead wrapper, so this runs one inner
        # optimizer step and, every la_steps calls, interpolates with the slow weights
        optim.step()

        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 5427/5427 [1:08:11<00:00,  1.33it/s, loss=1.05]
Epoch 1: 100%|██████████| 5427/5427 [1:08:26<00:00,  1.32it/s, loss=0.907]
Epoch 2: 100%|██████████| 5427/5427 [1:08:24<00:00,  1.32it/s, loss=1.4]


In [None]:
from google.colab import drive
import os
import torch

# Mount Google Drive (Colab only)
drive.mount('/content/drive/')

# Define the folder path (hard-coded location inside the mounted Drive)
folder_path = '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project'

# Define the path to save the model
model_path = os.path.join(folder_path, 'model.model')

# Save the entire model object. torch.save pickles the whole module here, so the
# file can only be read back with torch.load in an environment where the model's
# classes are importable.
torch.save(model, model_path)

# Save the tokenizer separately (its files are written into folder_path)
tokenizer.save_pretrained(folder_path)



Mounted at /content/drive/


('/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project/tokenizer_config.json',
 '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project/special_tokens_map.json',
 '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project/vocab.txt',
 '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project/added_tokens.json',
 '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project/tokenizer.json')

# **Evaluate the results**

### Evaluation metrics
In this notebook, two groups of evaluation metrics are used to assess the performance of the question-answering model:

**SQuAD Metrics:**

- **Exact Match (EM)**: This metric calculates the percentage of predictions that match the ground truth answers exactly. An exact match means the predicted answer is identical to the true answer in terms of characters and words.
- **F1 Score**: This is a harmonic mean of precision and recall. It measures the overlap between the predicted and true answers. The F1 score takes into account the precision (the percentage of words in the predicted answer that are present in the true answer) and recall (the percentage of words in the true answer that are captured by the predicted answer).

**BERTScore:**

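- **Precision (P)**: Measures how well the tokens of the predicted answer are matched by tokens of the reference answer, where each predicted token is scored by its highest contextual-embedding similarity to any reference token (rather than by exact token presence).
- **Recall (R)**: Measures how well the tokens of the reference answer are matched by tokens of the predicted answer, using the same embedding-similarity matching in the opposite direction.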
- **F1 Score (F1)**: Similar to the SQuAD F1 score, the BERTScore F1 is the harmonic mean of precision and recall but uses contextual embeddings from BERT to compare the semantic similarity between the predicted and reference answers. This means it evaluates the quality of the predictions based on their contextual and semantic similarity rather than just exact token matches.

### Why SQuAD Metrics and BERTScore Are Appropriate for Evaluating Question Answering Models

**SQuAD Metrics:**

1. **Exact Match (EM)**:
   - **Relevance**: In many question-answering tasks, particularly those that require factual accuracy, having an exact match between the predicted answer and the ground truth is crucial. This metric ensures that the model provides answers that are completely accurate and align perfectly with the expected response.
   - **Precision**: This metric is highly stringent and ensures that the model does not produce any superfluous or incorrect information, which is essential for applications where precision is critical.

2. **F1 Score**:
   - **Overlap Measurement**: The F1 score is particularly useful for evaluating the overlap between the predicted and true answers. Since questions can often have multiple valid answers that share significant overlap, the F1 score provides a balanced measure of both precision and recall.
   - **Comprehensive Evaluation**: By considering both precision (the accuracy of the words in the predicted answer) and recall (the extent to which the true answer is covered), the F1 score offers a more nuanced assessment of the model's performance, beyond exact matches.

**BERTScore:**

1. **Contextual Similarity**:
   - **Semantic Understanding**: Unlike traditional metrics that rely solely on word overlap, BERTScore uses contextual embeddings to evaluate the semantic similarity between the predicted and true answers. This is especially important in natural language processing tasks, where the meaning of a sentence can be conveyed in various ways.
   - **Flexibility**: BERTScore can capture nuanced differences and similarities in meaning, making it suitable for evaluating answers that may be phrased differently but still correct.

2. **Precision, Recall, and F1 Score**:
   - **Precision**: Measures how many of the tokens in the predicted answer are relevant to the true answer. High precision indicates that the model is not producing irrelevant information.
   - **Recall**: Measures how many of the tokens in the true answer are captured by the predicted answer. High recall ensures that the model is comprehensive in providing the necessary information.
   - **F1 Score**: The harmonic mean of precision and recall, BERTScore F1 gives a balanced view of the model’s performance, ensuring that both completeness and relevance are considered.


In [None]:
from google.colab import drive
import os
import torch
from transformers import AutoTokenizer

# Mount Google Drive (Colab only)
drive.mount('/content/drive/')

# Define the folder path and model path
folder_path = '/content/drive/My Drive/Colab Notebooks/COMP8420/Major Project'
model_path = os.path.join(folder_path, 'model.model')

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(folder_path, use_fast=True)

# Load the entire model object.
# * weights_only=False is required because the file holds a fully pickled module,
#   not a state dict; newer PyTorch releases default to weights_only=True and
#   would refuse to load it.
# * map_location lets a checkpoint saved on a GPU runtime load on a CPU-only one.
# SECURITY: unpickling executes arbitrary code - only load files you created yourself.
model = torch.load(
    model_path,
    map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
    weights_only=False,
)

# Verify the loading
print("Tokenizer and model loaded successfully.")


Mounted at /content/drive/
Tokenizer and model loaded successfully.


In [None]:
!pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->bert-score)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3

In [None]:
valid_ds

<__main__.SquadDataset at 0x7d834c3cb9a0>

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_metric
from bert_score import score

# Ensure 'device' is defined
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Switch model out of training mode
model.eval()
model = model.to(device)

# Initialize validation set data loader
val_loader = DataLoader(valid_ds, batch_size=16)

# Load SQuAD metric
metric = load_metric("squad_v2", trust_remote_code=True)

# Initialize lists for BERTScore references and candidates
references = []
candidates = []

# Initialize unique ID counter (pairs each prediction with its reference)
unique_id = 0

# Loop through batches and collect predictions and references
for batch in val_loader:
    with torch.no_grad():
        # Pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # True token positions, used to rebuild the reference answer text
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)

        # Make predictions: most likely start and end token, chosen independently
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # Collect predictions and references
        for i in range(len(input_ids)):
            # Decode the token spans back to text (an end before the start yields '')
            pred_text = tokenizer.decode(input_ids[i][start_pred[i]:end_pred[i]+1], skip_special_tokens=True)
            true_text = tokenizer.decode(input_ids[i][start_true[i]:end_true[i]+1], skip_special_tokens=True)

            prediction = {
                'id': str(unique_id),  # Use unique ID as string
                'prediction_text': pred_text,
                # The metric expects a probability and scores a has-answer question
                # as 0 whenever this value exceeds its threshold, so a raw logit
                # must not be passed here. Every example in valid_ds has an answer
                # and the model always predicts a span, hence 0.0.
                'no_answer_probability': 0.0
            }
            reference = {
                'id': str(unique_id),  # Use unique ID as string
                'answers': {
                    'text': [true_text],
                    'answer_start': [start_true[i].item()]
                }
            }
            metric.add(prediction=prediction, reference=reference)

            # Append to BERTScore lists
            candidates.append(pred_text)
            references.append(true_text)

            # Increment unique ID counter
            unique_id += 1

# Compute the SQuAD metrics
squad_results = metric.compute()
print(f"SQuAD Metric Results: {squad_results}")
# The squad_v2 metric reports exact match under the key 'exact'
# ('exact_match' is the key used by the SQuAD v1 metric)
if 'exact' in squad_results:
    print(f"Exact Match (EM): {squad_results['exact']:.2f}")
if 'f1' in squad_results:
    print(f"F1 Score: {squad_results['f1']:.2f}")

# Compute BERTScore (precision, recall and F1 per candidate/reference pair)
P, R, F1 = score(candidates, references, lang="en", verbose=True)
bert_score_f1 = F1.mean().item()
print(f"BERTScore F1: {bert_score_f1:.4f}")

  return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}


SQuAD Metric Results: {'exact': 55.10787114569993, 'f1': 70.80233758887827, 'total': 20302, 'HasAns_exact': 55.10787114569993, 'HasAns_f1': 70.80233758887827, 'HasAns_total': 20302, 'best_exact': 55.10787114569993, 'best_exact_thresh': 0.03749268874526024, 'best_f1': 70.80233758887819, 'best_f1_thresh': 0.03749268874526024}
F1 Score: 70.80


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/166 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/318 [00:00<?, ?it/s]



done in 28.39 seconds, 715.09 sentences/sec
BERTScore F1: 0.9108


Warnings about Empty Candidate Sentences:

These warnings indicate that some of your predicted answers were empty. This could happen if the model fails to predict an answer span for some questions.

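To address these warnings, check for empty predictions and handle them explicitly; the next cell does this by replacing an empty prediction with the placeholder `[EMPTY]`. Handling them keeps BERTScore from being computed on empty candidate strings, which makes the reported scores easier to interpret.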

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import load_metric
from bert_score import score

# Evaluate the fine-tuned QA model on the validation set with two metrics:
#   * the SQuAD v2 metric (exact match / token-overlap F1), and
#   * BERTScore (embedding-based similarity between predicted and gold answers).
# Relies on `model`, `tokenizer` and `valid_ds` defined in earlier cells.

# Ensure 'device' is defined
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Switch model out of training mode and move it to the evaluation device
model.eval()
model = model.to(device)

# Initialize validation set data loader
val_loader = DataLoader(valid_ds, batch_size=16)

# Load SQuAD v2 metric
metric = load_metric("squad_v2", trust_remote_code=True)

# Placeholder used ONLY for BERTScore, which warns on empty candidate sentences.
# The SQuAD metric must receive the real (possibly empty) prediction, because an
# empty string is how "no answer" is expressed there.
EMPTY_PLACEHOLDER = "[EMPTY]"

# Initialize lists for BERTScore references and candidates
references = []
candidates = []

# Running counter used to give every example a unique string ID
unique_id = 0

# Loop through batches and collect predictions and references
for batch in val_loader:
    with torch.no_grad():
        # Pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        start_pred = torch.argmax(outputs.start_logits, dim=1)
        end_pred = torch.argmax(outputs.end_logits, dim=1)

        # The metric compares `no_answer_probability` with a threshold (1.0 by
        # default), so it must be a probability in [0, 1], not a raw logit.
        # Here: P(start == 0) * P(end == 0).
        # NOTE(review): assumes position 0 is the "no answer" position, as the
        # original code did by reading start_logits[i][0] -- confirm against the
        # preprocessing cell.
        no_answer_prob = (
            torch.softmax(outputs.start_logits, dim=1)[:, 0]
            * torch.softmax(outputs.end_logits, dim=1)[:, 0]
        )

    # Gold positions are only used as slice bounds, so they can stay on the CPU
    start_true = batch['start_positions']
    end_true = batch['end_positions']

    # Collect predictions and references
    for i in range(len(input_ids)):
        pred_start, pred_end = int(start_pred[i]), int(end_pred[i])
        gold_start, gold_end = int(start_true[i]), int(end_true[i])

        # An end before the start yields an empty slice, i.e. an empty answer
        pred_text = tokenizer.decode(input_ids[i][pred_start:pred_end + 1], skip_special_tokens=True)
        true_text = tokenizer.decode(input_ids[i][gold_start:gold_end + 1], skip_special_tokens=True)

        prediction = {
            'id': str(unique_id),  # Use unique ID as string
            'prediction_text': pred_text,
            'no_answer_probability': float(no_answer_prob[i]),
        }
        reference = {
            'id': str(unique_id),  # Use unique ID as string
            'answers': {
                'text': [true_text],
                # Token index rather than a character offset; kept as in the
                # original to satisfy the metric's input schema
                'answer_start': [gold_start]
            }
        }
        metric.add(prediction=prediction, reference=reference)

        # Append to BERTScore lists (empty predictions get the placeholder)
        candidates.append(pred_text if pred_text else EMPTY_PLACEHOLDER)
        references.append(true_text)

        # Increment unique ID counter
        unique_id += 1

# Compute the SQuAD metrics
squad_results = metric.compute()
print(f"SQuAD Metric Results: {squad_results}")
# squad_v2 reports exact match under 'exact'; 'exact_match' is the SQuAD v1 key
exact_match = squad_results.get('exact', squad_results.get('exact_match'))
if exact_match is not None:
    print(f"Exact Match (EM): {exact_match:.2f}")
if 'f1' in squad_results:
    print(f"F1 Score: {squad_results['f1']:.2f}")

# Compute BERTScore
P, R, F1 = score(candidates, references, lang="en", verbose=True)
bert_score_f1 = F1.mean().item()
print(f"BERTScore F1: {bert_score_f1:.4f}")


  return {key: torch.tensor(val[index]) for key, val in self.encodings.items()}


SQuAD Metric Results: {'exact': 55.07831740715201, 'f1': 70.77278385033036, 'total': 20302, 'HasAns_exact': 55.07831740715201, 'HasAns_f1': 70.77278385033036, 'HasAns_total': 20302, 'best_exact': 55.07831740715201, 'best_exact_thresh': 0.03749268874526024, 'best_f1': 70.77278385033027, 'best_f1_thresh': 0.03749268874526024}
F1 Score: 70.77


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/166 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/318 [00:00<?, ?it/s]



done in 27.83 seconds, 729.55 sentences/sec
BERTScore F1: 0.9366


### Interpretation of the Results

#### SQuAD Metrics:
- **Exact Match (EM)**: 55.08%
  - This means that 55.08% of the model's predictions are exactly the same as the ground truth answers. While over half of the predictions are correct word-for-word, there is still room for improvement in achieving higher accuracy.
  
- **F1 Score**: 70.77%
  - The F1 score is 70.77%. SQuAD F1 is the token-level overlap (the harmonic mean of precision and recall) between each predicted answer and its reference, averaged over all examples. This score suggests that the model captures a substantial portion of the correct answers, even if not all predictions are exact matches.

- **Total Examples**: 20,302
  - The evaluation was conducted on a substantial number of examples, indicating that the results are likely representative of the model's performance.

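- **Thresholds**: `best_exact_thresh` and `best_f1_thresh` report the no-answer probability threshold at which the exact match and F1 scores are highest; a prediction whose `no_answer_probability` exceeds the threshold is treated as "no answer". The threshold is therefore a hyperparameter worth tuning, but these values are only meaningful when `no_answer_probability` is a genuine probability in [0, 1].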

#### BERTScore:
- **F1 Score**: 0.9366 (93.66%)
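  - The BERTScore F1 is 93.66%, indicating a high level of semantic similarity between the predicted and reference answers. This suggests that the model is effective at selecting answer spans that are contextually and semantically similar to the ground truth. Note that BERTScore values computed without baseline rescaling tend to fall in a narrow, high range, so this figure is best used to compare models rather than read as an absolute accuracy.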

### Comments on the Model

1. **Strengths**:
   - The high BERTScore F1 (93.66%) demonstrates that the model has a strong understanding of the context and semantics of the questions and answers, even if not all predictions are exact matches.
   - The F1 score of 70.77% from the SQuAD metrics indicates good performance in capturing relevant parts of the answers, reflecting a balanced precision and recall.

2. **Areas for Improvement**:
   - The Exact Match score (55.08%) suggests that there is still a significant portion of answers that the model fails to predict word-for-word accurately. This could be due to the model's inability to handle specific nuances or variations in the phrasing of the answers.
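   - The warning messages about empty reference sentences indicate that there may be some data preprocessing issues (for instance, a gold answer span that decodes to an empty string), so the data pipeline is worth checking. The warning about uninitialized weights refers to the pooler of the `roberta-large` model that BERTScore loads for scoring, not to the question-answering model, so it does not by itself show that the model is under-trained; the Exact Match gap above is the stronger argument that the model may benefit from further training or fine-tuning on the specific task.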

Overall, the model shows promising results, particularly in understanding the context and semantics of the questions and answers, but there is room for improvement in achieving higher exact match accuracy.


# **Test the model on manual inputs**

In [None]:
def answer_to_questions(context: str, questions: list) -> list:
    '''
    Answer every question in `questions` from the same `context` and print the results.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        context: passage the answers are extracted from.
        questions: list of question strings, all asked against `context`.

    Returns:
        One answer string per question, in the same order. An answer is the
        decoded token span between the most likely start and end positions;
        it is empty when the predicted end precedes the predicted start or
        the span holds only special tokens.
    '''
    # Nothing to ask: return before an empty batch reaches the tokenizer/model
    if not questions:
        return []

    # encode the inputs (the context is repeated once per question)
    encodings = tokenizer([context]*len(questions), questions, truncation=True, padding=True, return_tensors="pt")
    encodings = encodings.to(device)

    # make predictions; inference only, so skip building the autograd graph
    with torch.no_grad():
        outputs = model(**encodings)

    # most likely start / end token position for each question
    start_pred = torch.argmax(outputs['start_logits'], dim=1).tolist()
    end_pred = torch.argmax(outputs['end_logits'], dim=1).tolist()

    # decode the predicted spans the same way the evaluation cell does, so that
    # special tokens never leak into the answer text
    answers = [
        tokenizer.decode(encodings['input_ids'][index][start_idx:end_idx + 1], skip_special_tokens=True)
        for index, (start_idx, end_idx) in enumerate(zip(start_pred, end_pred))
    ]

    # print the results
    print("Context:")
    pprint(context)
    print()
    for question, answer in zip(questions, answers):
        print(f"Q:  {question}")
        print(f"A:  {answer}")
        print("-"*60)

    return answers

In [None]:
# Manual check 1: two factual questions about a short passage on the Olympic Games.
# The passage is written as adjacent string literals, which Python joins into one string.
context = (
    "The modern Olympic Games or Olympics (French: Jeux olympiques)[1][2] are "
    "leading international sporting events featuring summer and winter sports "
    "competitions in which thousands of athletes from around the world "
    "participate in a variety of competitions. The Olympic Games are considered "
    "the world's foremost sports competition with more than 200 nations "
    "participating.[3] The Olympic Games are normally held every four years, "
    "alternating between the Summer and Winter Olympics every two years in the "
    "four-year period."
)
questions = ["How often do the Olympic games hold?", "How many nations do participate in each Olympic?"]

# The function prints the Q/A pairs itself; bind the returned list to `_`
# so the notebook does not echo it a second time.
_ = answer_to_questions(context, questions)

Context:
('The modern Olympic Games or Olympics (French: Jeux olympiques)[1][2] are '
 'leading international sporting events featuring summer and winter sports '
 'competitions in which thousands of athletes from around the world '
 'participate in a variety of competitions. The Olympic Games are considered '
 "the world's foremost sports competition with more than 200 nations "
 'participating.[3] The Olympic Games are normally held every four years, '
 'alternating between the Summer and Winter Olympics every two years in the '
 'four-year period.')

Q:  How often do the Olympic games hold?
A:  every four years,
------------------------------------------------------------
Q:  How many nations do participate in each Olympic?
A:  more than 200
------------------------------------------------------------


In [None]:
# Manual check 2: a single, loosely phrased question about a passage on the Vikings.
# The passage is written as adjacent string literals, which Python joins into one string.
context = (
    "Vikings is the modern name given to seafaring people primarily from "
    "Scandinavia (present-day Denmark, Norway and Sweden), who from the late 8th "
    "to the late 11th centuries raided, pirated, traded and settled throughout "
    "parts of Europe. They also voyaged as far as the Mediterranean, North "
    "Africa, the Middle East, and North America. In some of the countries they "
    "raided and settled in, this period is popularly known as the Viking Age, and "
    "the term \"Viking\" also commonly includes the inhabitants of the Scandinavian "
    "homelands as a collective whole. The Vikings had a profound impact on the "
    "Early medieval history of Scandinavia, the British Isles, France, Estonia, "
    "and Kievan Rus'."
)
questions = ["When vikings started raided?"]

# The function prints the Q/A pair itself; bind the returned list to `_`
# so the notebook does not echo it a second time.
_ = answer_to_questions(context, questions)

Context:
('Vikings is the modern name given to seafaring people primarily from '
 'Scandinavia (present-day Denmark, Norway and Sweden), who from the late 8th '
 'to the late 11th centuries raided, pirated, traded and settled throughout '
 'parts of Europe. They also voyaged as far as the Mediterranean, North '
 'Africa, the Middle East, and North America. In some of the countries they '
 'raided and settled in, this period is popularly known as the Viking Age, and '
 'the term "Viking" also commonly includes the inhabitants of the Scandinavian '
 'homelands as a collective whole. The Vikings had a profound impact on the '
 'Early medieval history of Scandinavia, the British Isles, France, Estonia, '
 "and Kievan Rus'.")

Q:  When vikings started raided?
A:  pirated,
------------------------------------------------------------
