In [None]:
# Download the Spoken-SQuAD train/test JSON files into ./squad.
# NOTE: the files are saved under "v2.0" names although the upstream files are
# v1.1; the dataset class reads these exact file names, so they are kept.
# `-p` makes the cell safe to re-run when the directory already exists.
!mkdir -p squad
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json  -O squad/train-v2.0.json
!wget https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json -O squad/test-v2.0.json

--2023-03-31 20:26:13--  https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_train-v1.1.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21805612 (21M) [text/plain]
Saving to: ‘squad/train-v2.0.json’


2023-03-31 20:26:14 (358 MB/s) - ‘squad/train-v2.0.json’ saved [21805612/21805612]

--2023-03-31 20:26:14--  https://raw.githubusercontent.com/chiahsuan156/Spoken-SQuAD/master/spoken_test-v1.1_WER54.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3370853 (3.2M) [text/plain]
Saving to: ‘squad/test-v2.0.

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
# The recorded run resolved transformers 4.27.4 (on Python 3.9).
%pip install -q transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m87.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.4


In [None]:
import torch
import json
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import random
import numpy as np

In [None]:
class SpokenSquadDataset(Dataset):
    """Spoken-SQuAD question-answering data, tokenized for extractive QA.

    The JSON file is read into four index-aligned lists (context, question,
    answers, id) stored in ``self.examples``; the tokenized features are
    stored in ``self.encodings``.

    Args:
        train: If true, read the training file and attach
            ``start_positions`` / ``end_positions`` labels to every feature.
            Otherwise read the test file and keep ``offset_mapping`` (masked
            outside the context) and ``example_id`` for post-processing.
        unprocessed: If true, indexing returns raw examples instead of
            tokenized features.
        max_length: Maximum number of tokens per feature.
        stride: Token overlap between consecutive features of a long context.
        model_checkpoint: Name or path passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, train=True, unprocessed=False, max_length=384, stride=128, model_checkpoint='bert-base-uncased'):
        super(SpokenSquadDataset, self).__init__()

        self.train = train
        self.unprocessed = unprocessed

        if self.train:
            self.data_path = '/content/squad/train-v2.0.json'
        else:
            self.data_path = '/content/squad/test-v2.0.json'

        self.max_length = max_length
        self.stride = stride
        self.model_checkpoint = model_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

        # Keep context, question, answers and id index-aligned.
        contexts, questions, answers, ids = self.read_data()
        self.examples = {'context': contexts, 'question': questions, 'answers': answers, 'id': ids}

        self.encodings = self.preprocess_examples()
        print(len(self.encodings['input_ids']))

    def __getitem__(self, idx):
        """Return the raw example (``unprocessed``) or tokenized feature at ``idx``."""
        source = self.examples if self.unprocessed else self.encodings
        return {key: val[idx] for key, val in source.items()}

    def __len__(self):
        """Number of items served by ``__getitem__`` in the current mode."""
        if self.unprocessed:
            # One raw example can yield several features, so the two counts differ.
            return len(self.examples['id'])
        return len(self.encodings['input_ids'])

    def read_data(self):
        """Parse the SQuAD-format JSON at ``self.data_path``.

        Returns:
            Four index-aligned lists ``(contexts, questions, answers, ids)``.
            In training mode each answer is ``{'text': str, 'answer_start': int}``
            built from the first reference answer; otherwise it is
            ``{'text': [str, ...], 'answer_start': [int, ...]}`` with every
            reference answer. Questions without any answer are skipped so the
            four lists always stay the same length.
        """
        with open(self.data_path, encoding='utf-8') as f:
            data = json.load(f)['data']

        contexts, questions, answers, ids = [], [], [], []

        for title in data:
            for paragraph in title['paragraphs']:
                context = paragraph['context']
                for qas in paragraph['qas']:
                    references = qas['answers']
                    if not references:
                        # Appending the question without an answer would shift
                        # every later answer onto the wrong question.
                        continue

                    if self.train:
                        first = references[0]
                        answers.append({'text': first['text'], 'answer_start': first['answer_start']})
                    else:
                        answers.append({
                            'text': [ref['text'] for ref in references],
                            'answer_start': [ref['answer_start'] for ref in references],
                        })

                    contexts.append(context)
                    questions.append(qas['question'])
                    ids.append(qas['id'])

        return contexts, questions, answers, ids

    def preprocess_examples(self):
        """Tokenize ``self.examples`` into model features.

        Long contexts are split into overlapping features. In training mode
        the answer's character span is converted into token indices
        (``start_positions`` / ``end_positions``), using ``(0, 0)`` when the
        answer is not fully contained in the feature. In evaluation mode the
        offsets of non-context tokens are replaced by ``None`` and each
        feature records the id of the example it came from.
        """
        questions = [q.strip() for q in self.examples["question"]]
        inputs = self.tokenizer(
            questions,
            self.examples["context"],
            max_length=self.max_length,
            truncation="only_second",
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        if self.train:
            offset_mapping = inputs.pop("offset_mapping")
            sample_map = inputs.pop("overflow_to_sample_mapping")
            answers = self.examples["answers"]
            start_positions = []
            end_positions = []

            for i, offset in enumerate(offset_mapping):
                answer = answers[sample_map[i]]
                start_char = answer["answer_start"]
                end_char = start_char + len(answer["text"])
                sequence_ids = inputs.sequence_ids(i)

                # Locate the first and last token that belong to the context
                # (sequence id 1).
                idx = 0
                while sequence_ids[idx] != 1:
                    idx += 1
                context_start = idx
                while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                    idx += 1
                context_end = idx - 1

                # If the answer is not fully inside the context, label is (0, 0)
                if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                    start_positions.append(0)
                    end_positions.append(0)
                else:
                    # Last token starting at or before the answer start.
                    idx = context_start
                    while idx <= context_end and offset[idx][0] <= start_char:
                        idx += 1
                    start_positions.append(idx - 1)

                    # First token ending at or after the answer end.
                    idx = context_end
                    while idx >= context_start and offset[idx][1] >= end_char:
                        idx -= 1
                    end_positions.append(idx + 1)

            inputs["start_positions"] = start_positions
            inputs["end_positions"] = end_positions
            return inputs

        sample_map = inputs.pop("overflow_to_sample_mapping")
        example_ids = []

        for i in range(len(inputs["input_ids"])):
            example_ids.append(self.examples["id"][sample_map[i]])

            # Keep offsets only for context tokens so that post-processing can
            # reject spans that touch the question or special tokens.
            sequence_ids = inputs.sequence_ids(i)
            offset = inputs["offset_mapping"][i]
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]

        inputs["example_id"] = example_ids
        return inputs

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
# The recorded run resolved accelerate 0.18.0.
%pip install -q accelerate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.18.0


In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator, BertForQuestionAnswering, get_scheduler
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm

# Fine-tune BertForQuestionAnswering on the Spoken-SQuAD training split.
# This cell leaves `model`, `accelerator` and `trainset` at module level.

# create spoken squad dataset and dataloader
# (all constructor defaults: training split, class-default tokenizer checkpoint)
trainset = SpokenSquadDataset()
trainloader = DataLoader(
    trainset, 
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8
)

# load model and choose optimizer
model_checkpoint = 'bert-base-uncased'
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(), lr=2e-5)

# utilize hugging face accelerator to ensure that all tensors are on the correct device
accelerator = Accelerator(mixed_precision='no')

model, optimizer, trainloader = accelerator.prepare(
    model, optimizer, trainloader
)

# The step count is taken from the *prepared* loader, i.e. after
# accelerator.prepare() has replaced `trainloader`.
num_train_epochs = 3
num_update_steps_per_epoch = len(trainloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear schedule without warmup. Note the scheduler itself is not passed
# through accelerator.prepare().
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# train model
progress_bar = tqdm(range(num_training_steps))

# NOTE(review): `output_dir` is assigned but not used anywhere in this cell.
output_dir = 'checkpoints'

for epoch in range(num_train_epochs):
    model.train()
    for step, batch in enumerate(trainloader):
        # The batch carries start/end position labels, so the model output
        # exposes a loss.
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        
        # One optimizer and scheduler update per batch, then clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

  0%|          | 0/13992 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model and its tokenizer to a single directory.
# NOTE(review): the evaluation cells load from
# '/content/drive/MyDrive/checkpoints_bert_uncased' — confirm this directory
# is the one they are expected to read.
model_path = '/content/drive/MyDrive/checkpoints'

# Make sure every process has finished training before anything is written.
accelerator.wait_for_everyone()

# `accelerator.prepare` may wrap the model; only the unwrapped model is
# guaranteed to expose `save_pretrained`, so save through it exactly once.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_path, save_function=accelerator.save)
trainset.tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/checkpoints/tokenizer_config.json',
 '/content/drive/MyDrive/checkpoints/special_tokens_map.json',
 '/content/drive/MyDrive/checkpoints/vocab.txt',
 '/content/drive/MyDrive/checkpoints/added_tokens.json',
 '/content/drive/MyDrive/checkpoints/tokenizer.json')

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
# The recorded run resolved evaluate 0.4.0.
%pip install -q evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  

In [None]:
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering, default_data_collator
from datasets import load_dataset
import evaluate
import collections
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator

def preprocess_squad_validation_examples(
    examples,
    model_checkpoint='bert-base-uncased',
    max_length=384,
    stride=128,
    tokenizer=None,
):
    """Tokenize a batch of SQuAD-style validation examples for evaluation.

    Args:
        examples: Mapping with index-aligned ``"question"``, ``"context"`` and
            ``"id"`` sequences (the batch format of a batched ``Dataset.map``).
        model_checkpoint: Checkpoint whose tokenizer is loaded when
            ``tokenizer`` is not given.
        max_length: Maximum number of tokens per feature. The default matches
            the value used for training in this notebook.
        stride: Token overlap between consecutive features of a long context.
            The default matches the value used for training in this notebook.
        tokenizer: Optional ready-made tokenizer. Passing one avoids
            re-loading the tokenizer on every call.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` removed,
        ``offset_mapping`` set to ``None`` for every non-context token, and an
        added ``example_id`` list naming the source example of each feature.
    """
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        example_ids.append(examples["id"][sample_map[i]])

        # Keep offsets only for context tokens (sequence id 1) so that answer
        # extraction can skip spans touching the question or special tokens.
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

# define a function to compute f1 score
def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=30, metric=None):
    """Turn start/end logits into text answers and score them.

    Args:
        start_logits: Per-feature sequence of start logits, indexable as
            ``start_logits[feature_index][token_index]``.
        end_logits: Per-feature sequence of end logits, same layout.
        features: Iterable of tokenized features; each provides
            ``"example_id"`` and ``"offset_mapping"`` (with ``None`` for
            tokens outside the context). Must be in the same order as the
            logits.
        examples: Iterable of raw examples providing ``"id"``, ``"context"``
            and ``"answers"``. It is iterated twice.
        n_best: How many top start and top end positions to combine per
            feature.
        max_answer_length: Maximum answer length in tokens.
        metric: Object with a ``compute(predictions=..., references=...)``
            method. Defaults to ``evaluate.load("squad")``; pass one in to
            avoid re-loading it on every call.

    Returns:
        Whatever ``metric.compute`` returns for the predicted answers.
    """
    if metric is None:
        metric = evaluate.load("squad")

    # Group feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Indices of the n_best highest logits, best first.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    answers.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Select the answer with the best score; fall back to an empty
        # prediction when no valid span exists.
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

def _collect_logits(model, eval_loader, accelerator, num_features):
    """Run ``model`` over ``eval_loader`` and return stacked start/end logits."""
    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(eval_loader):
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Gathering can return more rows than there are features, so cut the
    # result back to the real feature count.
    start_logits = np.concatenate(start_logits)[:num_features]
    end_logits = np.concatenate(end_logits)[:num_features]
    return start_logits, end_logits


def main(
    squad_model_checkpoint_path='checkpoints/squad',
    spoken_squad_model_checkpoint_path='/content/drive/MyDrive/checkpoints_bert_uncased',
):
    """Evaluate two fine-tuned models on the Spoken-SQuAD test set.

    The first model is loaded from ``squad_model_checkpoint_path``, the
    second from ``spoken_squad_model_checkpoint_path``. For each model the
    metrics returned by ``compute_metrics`` are printed.

    Args:
        squad_model_checkpoint_path: Directory of the first model.
            NOTE(review): the default is the path from the previously
            commented-out loading line; confirm where that checkpoint lives.
        spoken_squad_model_checkpoint_path: Directory of the second model.
    """
    # Tokenized test features. The tokenizer checkpoint is passed explicitly
    # so the result does not depend on which default the currently defined
    # SpokenSquadDataset class carries.
    # NOTE(review): assumes both checkpoints are uncased BERT models.
    spoken_squad_validation_dataset = SpokenSquadDataset(
        train=False, model_checkpoint='bert-base-uncased'
    )
    features = [
        spoken_squad_validation_dataset[i]
        for i in range(len(spoken_squad_validation_dataset))
    ]

    # Raw examples (id / context / question / answers) for compute_metrics.
    raw = spoken_squad_validation_dataset.examples
    spoken_squad_raw = [
        {"id": ex_id, "context": ctx, "question": question, "answers": answers}
        for ex_id, ctx, question, answers in zip(
            raw["id"], raw["context"], raw["question"], raw["answers"]
        )
    ]

    # The collator cannot turn `example_id` strings or `offset_mapping`
    # entries containing None into tensors, so the model only receives the
    # remaining columns.
    model_inputs = [
        {k: v for k, v in feature.items() if k not in ("example_id", "offset_mapping")}
        for feature in features
    ]

    squad_model = BertForQuestionAnswering.from_pretrained(squad_model_checkpoint_path)
    spoken_squad_model = BertForQuestionAnswering.from_pretrained(spoken_squad_model_checkpoint_path)

    # No shuffling: logits must stay in the same order as `features`.
    spoken_squad_eval_loader = DataLoader(
        model_inputs,
        collate_fn=default_data_collator,
        batch_size=8,
    )

    accelerator = Accelerator(mixed_precision='no')
    squad_model, spoken_squad_model, spoken_squad_eval_loader = accelerator.prepare(
        squad_model, spoken_squad_model, spoken_squad_eval_loader
    )

    # first model, then second model, both on the spoken squad test dataset
    for model in (squad_model, spoken_squad_model):
        accelerator.print("Evaluation!")
        start_logits, end_logits = _collect_logits(
            model, spoken_squad_eval_loader, accelerator, len(features)
        )
        metrics = compute_metrics(
            start_logits, end_logits, features, spoken_squad_raw
        )
        print(metrics)
    


In [None]:
from transformers import pipeline

# Interactive comparison of two question-answering pipelines on two versions
# of the same passage.
model_checkpoint_spoken = '/content/drive/MyDrive/checkpoints_bert_uncased'
model_checkpoint = '/content/drive/MyDrive/bert_base_uncased-custom'
question_answerer_spoken = pipeline("question-answering", model=model_checkpoint_spoken)
question_answerer = pipeline("question-answering", model=model_checkpoint)

# Variant of the passage without the emoji and with "1" in place of "one".
context_spoken = """Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with 1 before loading them for inference with the other."""

context = """🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

# Keep asking until the user submits a blank line (or input is closed), so the
# cell can finish without interrupting the kernel.
while True:
    try:
        question = input("Please input a question:")
    except EOFError:
        break
    if not question.strip():
        break

    # Same question against each passage, answered by both pipelines.
    for passage in (context_spoken, context):
        answer_spoken = question_answerer_spoken(question=question, context=passage)
        answer = question_answerer(question=question, context=passage)
        print(f'spoken answer: {answer_spoken}')
        print(f'answer: {answer}')

Please input a question:Which deep learning libraries back 🤗 Transformers?
spoken answer: {'score': 0.035526860505342484, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.07184605300426483, 'start': 49, 'end': 102, 'answer': 'deep learning libraries — Jax, PyTorch and TensorFlow'}
spoken answer: {'score': 0.11756867915391922, 'start': 32, 'end': 50, 'answer': 'three most popular'}
answer: {'score': 0.07318601757287979, 'start': 51, 'end': 104, 'answer': 'deep learning libraries — Jax, PyTorch and TensorFlow'}
Please input a question:what are the three most popular deep learning libraries?
spoken answer: {'score': 0.6253482699394226, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.1522851288318634, 'start': 92, 'end': 102, 'answer': 'TensorFlow'}
spoken answer: {'score': 0.570705771446228, 'start': 77, 'end': 104, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.26196354627609253, 'start': 94, 'end': 

Let's try the `bert-base-cased` model.


In [None]:
class SpokenSquadDataset(Dataset):
    """Spoken-SQuAD question-answering data, tokenized for extractive QA.

    The JSON file is read into four index-aligned lists (context, question,
    answers, id) stored in ``self.examples``; the tokenized features are
    stored in ``self.encodings``. This definition defaults to the
    ``bert-base-cased`` tokenizer.

    Args:
        train: If true, read the training file and attach
            ``start_positions`` / ``end_positions`` labels to every feature.
            Otherwise read the test file and keep ``offset_mapping`` (masked
            outside the context) and ``example_id`` for post-processing.
        unprocessed: If true, indexing returns raw examples instead of
            tokenized features.
        max_length: Maximum number of tokens per feature.
        stride: Token overlap between consecutive features of a long context.
        model_checkpoint: Name or path passed to
            ``AutoTokenizer.from_pretrained``.
    """

    def __init__(self, train=True, unprocessed=False, max_length=384, stride=128, model_checkpoint='bert-base-cased'):
        super(SpokenSquadDataset, self).__init__()

        self.train = train
        self.unprocessed = unprocessed

        if self.train:
            self.data_path = '/content/squad/train-v2.0.json'
        else:
            self.data_path = '/content/squad/test-v2.0.json'

        self.max_length = max_length
        self.stride = stride
        self.model_checkpoint = model_checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

        # Keep context, question, answers and id index-aligned.
        contexts, questions, answers, ids = self.read_data()
        self.examples = {'context': contexts, 'question': questions, 'answers': answers, 'id': ids}

        self.encodings = self.preprocess_examples()
        print(len(self.encodings['input_ids']))

    def __getitem__(self, idx):
        """Return the raw example (``unprocessed``) or tokenized feature at ``idx``."""
        source = self.examples if self.unprocessed else self.encodings
        return {key: val[idx] for key, val in source.items()}

    def __len__(self):
        """Number of items served by ``__getitem__`` in the current mode."""
        if self.unprocessed:
            # One raw example can yield several features, so the two counts differ.
            return len(self.examples['id'])
        return len(self.encodings['input_ids'])

    def read_data(self):
        """Parse the SQuAD-format JSON at ``self.data_path``.

        Returns:
            Four index-aligned lists ``(contexts, questions, answers, ids)``.
            In training mode each answer is ``{'text': str, 'answer_start': int}``
            built from the first reference answer; otherwise it is
            ``{'text': [str, ...], 'answer_start': [int, ...]}`` with every
            reference answer. Questions without any answer are skipped so the
            four lists always stay the same length.
        """
        with open(self.data_path, encoding='utf-8') as f:
            data = json.load(f)['data']

        contexts, questions, answers, ids = [], [], [], []

        for title in data:
            for paragraph in title['paragraphs']:
                context = paragraph['context']
                for qas in paragraph['qas']:
                    references = qas['answers']
                    if not references:
                        # Appending the question without an answer would shift
                        # every later answer onto the wrong question.
                        continue

                    if self.train:
                        first = references[0]
                        answers.append({'text': first['text'], 'answer_start': first['answer_start']})
                    else:
                        answers.append({
                            'text': [ref['text'] for ref in references],
                            'answer_start': [ref['answer_start'] for ref in references],
                        })

                    contexts.append(context)
                    questions.append(qas['question'])
                    ids.append(qas['id'])

        return contexts, questions, answers, ids

    def preprocess_examples(self):
        """Tokenize ``self.examples`` into model features.

        Long contexts are split into overlapping features. In training mode
        the answer's character span is converted into token indices
        (``start_positions`` / ``end_positions``), using ``(0, 0)`` when the
        answer is not fully contained in the feature. In evaluation mode the
        offsets of non-context tokens are replaced by ``None`` and each
        feature records the id of the example it came from.
        """
        questions = [q.strip() for q in self.examples["question"]]
        inputs = self.tokenizer(
            questions,
            self.examples["context"],
            max_length=self.max_length,
            truncation="only_second",
            stride=self.stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )

        if self.train:
            offset_mapping = inputs.pop("offset_mapping")
            sample_map = inputs.pop("overflow_to_sample_mapping")
            answers = self.examples["answers"]
            start_positions = []
            end_positions = []

            for i, offset in enumerate(offset_mapping):
                answer = answers[sample_map[i]]
                start_char = answer["answer_start"]
                end_char = start_char + len(answer["text"])
                sequence_ids = inputs.sequence_ids(i)

                # Locate the first and last token that belong to the context
                # (sequence id 1).
                idx = 0
                while sequence_ids[idx] != 1:
                    idx += 1
                context_start = idx
                while idx < len(sequence_ids) and sequence_ids[idx] == 1:
                    idx += 1
                context_end = idx - 1

                # If the answer is not fully inside the context, label is (0, 0)
                if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                    start_positions.append(0)
                    end_positions.append(0)
                else:
                    # Last token starting at or before the answer start.
                    idx = context_start
                    while idx <= context_end and offset[idx][0] <= start_char:
                        idx += 1
                    start_positions.append(idx - 1)

                    # First token ending at or after the answer end.
                    idx = context_end
                    while idx >= context_start and offset[idx][1] >= end_char:
                        idx -= 1
                    end_positions.append(idx + 1)

            inputs["start_positions"] = start_positions
            inputs["end_positions"] = end_positions
            return inputs

        sample_map = inputs.pop("overflow_to_sample_mapping")
        example_ids = []

        for i in range(len(inputs["input_ids"])):
            example_ids.append(self.examples["id"][sample_map[i]])

            # Keep offsets only for context tokens so that post-processing can
            # reject spans that touch the question or special tokens.
            sequence_ids = inputs.sequence_ids(i)
            offset = inputs["offset_mapping"][i]
            inputs["offset_mapping"][i] = [
                o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
            ]

        inputs["example_id"] = example_ids
        return inputs

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import default_data_collator, BertForQuestionAnswering, get_scheduler
from torch.optim import AdamW
from accelerate import Accelerator
from tqdm.auto import tqdm

# Fine-tune BertForQuestionAnswering on the Spoken-SQuAD training features.
# Every name assigned here is a notebook global; later cells reuse `model`,
# `accelerator` and `trainset`.

# Build the dataset with its default arguments (presumably the training split —
# confirm against the SpokenSquadDataset definition) and a shuffled loader.
trainset = SpokenSquadDataset()
trainloader = DataLoader(
    trainset, 
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8
)

# Load the pretrained encoder with a question-answering head and pick the optimizer.
model_checkpoint = 'bert-base-cased'
model = BertForQuestionAnswering.from_pretrained(model_checkpoint)
optimizer = AdamW(model.parameters(), lr=2e-5)

# Let the Hugging Face Accelerator handle device placement of the model,
# optimizer and batches; mixed precision is switched off.
accelerator = Accelerator(mixed_precision='no')

model, optimizer, trainloader = accelerator.prepare(
    model, optimizer, trainloader
)

# The step count is taken from the *prepared* loader, so the schedule length
# matches the number of batches this process actually iterates.
num_train_epochs = 3
num_update_steps_per_epoch = len(trainloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warm-up steps, spanning the whole run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# One progress-bar tick per optimizer step.
progress_bar = tqdm(range(num_training_steps))

# NOTE(review): `output_dir` is not referenced anywhere else in this cell.
output_dir = 'checkpoints'

for epoch in range(num_train_epochs):
    model.train()
    # `step` is not used inside the loop body.
    for step, batch in enumerate(trainloader):
        # The batch carries start/end positions, so the model returns the loss directly.
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)
        
        # Apply the update, advance the learning-rate schedule, then clear gradients.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
        

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and a

  0%|          | 0/14061 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model and its tokenizer to Google Drive.
model_path = '/content/drive/MyDrive/checkpoints_bert_base_cased'

# Make sure every process has finished training before the checkpoint is written.
accelerator.wait_for_everyone()

# Save through the unwrapped model only: `accelerator.prepare` may have wrapped
# `model` (e.g. for distributed training), and such a wrapper has no
# `save_pretrained`. Saving the prepared model first also wrote the same
# checkpoint twice.
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(model_path, save_function=accelerator.save)

# Store the tokenizer next to the weights so the directory can be loaded as a
# self-contained checkpoint; as the last expression, the saved file paths are
# shown as the cell output.
trainset.tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/checkpoints_bert_base_cased/tokenizer_config.json',
 '/content/drive/MyDrive/checkpoints_bert_base_cased/special_tokens_map.json',
 '/content/drive/MyDrive/checkpoints_bert_base_cased/vocab.txt',
 '/content/drive/MyDrive/checkpoints_bert_base_cased/added_tokens.json',
 '/content/drive/MyDrive/checkpoints_bert_base_cased/tokenizer.json')

In [None]:
import torch
from transformers import AutoTokenizer, BertForQuestionAnswering, default_data_collator
from datasets import load_dataset
import evaluate
import collections
import numpy as np
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator

def preprocess_squad_validation_examples(examples):
    """Tokenize a batch of SQuAD validation examples into fixed-length features.

    Each question is stripped of surrounding whitespace and paired with its
    context. Only the context may be truncated; the overflow is returned as
    additional features, so one example can yield several features.

    Args:
        examples: batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output with two changes: "offset_mapping" entries of
        tokens that do not belong to the context are replaced by ``None``, and
        an "example_id" list names the source example of every feature.
        "overflow_to_sample_mapping" is removed.
    """
    checkpoint_name = 'bert-base-cased'
    bert_tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

    # Feature length and window overlap; the model was trained with the same values.
    window_tokens = 384
    window_overlap = 128

    encoded = bert_tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=window_tokens,
        truncation="only_second",
        stride=window_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Position p of this list is the index of the example feature p was cut from.
    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    num_features = len(encoded["input_ids"])

    encoded_example_ids = [
        examples["id"][feature_to_example[pos]] for pos in range(num_features)
    ]

    for pos in range(num_features):
        token_sources = encoded.sequence_ids(pos)
        # Keep character offsets only for context tokens (sequence id 1), so
        # that post-processing can tell context tokens from everything else.
        context_only_offsets = []
        for token_pos, span in enumerate(encoded["offset_mapping"][pos]):
            if token_sources[token_pos] == 1:
                context_only_offsets.append(span)
            else:
                context_only_offsets.append(None)
        encoded["offset_mapping"][pos] = context_only_offsets

    encoded["example_id"] = encoded_example_ids
    return encoded

# Turn start/end logits into answer strings and score them with the SQuAD metric
def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=30):
    """Turn start/end logits into answer strings and score them with the SQuAD metric.

    For every example, each (start, end) pair drawn from the ``n_best`` highest
    start logits and the ``n_best`` highest end logits of each of its features
    is a candidate span. The candidate with the largest ``start + end`` logit
    sum becomes the prediction; an example with no valid candidate is
    predicted as the empty string.

    Args:
        start_logits: indexable by feature position; entry ``i`` holds the
            per-token start logits of feature ``i``.
        end_logits: same layout as ``start_logits``, for the end logits.
        features: iterable and indexable collection of features, each with an
            "example_id" and an "offset_mapping" whose entries are ``None``
            for tokens that must not be part of an answer.
        examples: re-iterable collection of examples with "id", "context" and
            "answers" (it is iterated twice).
        n_best: how many top start and top end positions to combine per feature.
        max_answer_length: longest accepted answer, counted in tokens.

    Returns:
        The result of ``compute`` on the "squad" metric loaded through ``evaluate``.
    """
    metric = evaluate.load("squad")

    # Group feature positions by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Positions of the n_best largest logits, best first.
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # The offsets are character positions, so the answer text
                    # is sliced straight out of the original context.
                    answers.append(
                        {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": start_logit[start_index] + end_logit[end_index],
                        }
                    )

        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)



In [None]:
def _collate_model_inputs(features):
    """Batch features for the model, leaving out the post-processing-only keys.

    "offset_mapping" holds ``None`` for non-context tokens, so it cannot be
    converted to a tensor, and "example_id" is not a model input.
    """
    model_features = [
        {
            key: value
            for key, value in feature.items()
            if key not in ("example_id", "offset_mapping")
        }
        for feature in features
    ]
    return default_data_collator(model_features)


def _collect_logits(model, eval_loader, accelerator, num_features):
    """Run ``model`` over ``eval_loader`` and return (start_logits, end_logits) arrays."""
    model.eval()
    start_logits = []
    end_logits = []
    for batch in tqdm(eval_loader):
        with torch.no_grad():
            outputs = model(**batch)

        start_logits.append(accelerator.gather(outputs.start_logits).cpu().numpy())
        end_logits.append(accelerator.gather(outputs.end_logits).cpu().numpy())

    # Keep exactly one row per feature in case gathering returned extra rows.
    start_logits = np.concatenate(start_logits)[:num_features]
    end_logits = np.concatenate(end_logits)[:num_features]
    return start_logits, end_logits


def main(
    spoken_squad_model_checkpoint_path='/content/drive/MyDrive/checkpoints_bert_uncased',
    squad_model_checkpoint_path=None,
    batch_size=8,
):
    """Main evaluation loop: score each available model on each dataset.

    The datasets are the text SQuAD validation split and the Spoken-SQuAD test
    split. The Spoken-SQuAD-trained model is always evaluated; a SQuAD-trained
    model is evaluated as well when ``squad_model_checkpoint_path`` is given.
    The metrics of every (model, dataset) pair are printed.

    Args:
        spoken_squad_model_checkpoint_path: checkpoint of the model fine-tuned
            on Spoken-SQuAD.
        squad_model_checkpoint_path: optional checkpoint of a model fine-tuned
            on text SQuAD; ``None`` skips that model.
        batch_size: evaluation batch size.
    """
    # NOTE(review): preprocess_squad_validation_examples tokenizes with
    # 'bert-base-cased' while the default checkpoint path is named "uncased" —
    # confirm that the checkpoint and the tokenizer match.

    # Text SQuAD: raw validation examples and their tokenized features.
    squad_raw = load_dataset('squad')['validation']
    squad_features = squad_raw.map(
        preprocess_squad_validation_examples,
        batched=True,
        remove_columns=squad_raw.column_names,
    )

    # Spoken-SQuAD: tokenized test features and the unprocessed examples that
    # compute_metrics needs for the contexts and reference answers.
    spoken_squad_features = SpokenSquadDataset(train=False)
    spoken_squad_raw = SpokenSquadDataset(train=False, unprocessed=True)

    eval_sets = [
        ("SQuAD validation", squad_features, squad_raw),
        ("Spoken-SQuAD test", spoken_squad_features, spoken_squad_raw),
    ]

    checkpoints = []
    if squad_model_checkpoint_path is not None:
        checkpoints.append(("SQuAD-trained model", squad_model_checkpoint_path))
    checkpoints.append(("Spoken-SQuAD-trained model", spoken_squad_model_checkpoint_path))

    models = [BertForQuestionAnswering.from_pretrained(path) for _, path in checkpoints]

    # No shuffling: compute_metrics pairs logits with features by position, so
    # the loader has to visit the features in dataset order.
    # NOTE(review): assumes every SpokenSquadDataset(train=False) item is a
    # mapping that includes "example_id" and "offset_mapping", as
    # compute_metrics already requires — confirm against the class definition.
    loaders = [
        DataLoader(features, collate_fn=_collate_model_inputs, batch_size=batch_size)
        for _, features, _ in eval_sets
    ]

    accelerator = Accelerator(mixed_precision='no')
    num_models = len(models)
    prepared = accelerator.prepare(*models, *loaders)
    models = prepared[:num_models]
    loaders = prepared[num_models:]

    # Evaluate the performance of each model on each dataset.
    for (model_name, _), model in zip(checkpoints, models):
        for (dataset_name, features, raw_examples), eval_loader in zip(eval_sets, loaders):
            accelerator.print(f"Evaluation! {model_name} on {dataset_name}")
            start_logits, end_logits = _collect_logits(
                model, eval_loader, accelerator, len(features)
            )
            metrics = compute_metrics(start_logits, end_logits, features, raw_examples)
            print(metrics)
    


In [None]:
from transformers import pipeline

# Interactive comparison of the two question-answering pipelines on a
# "spoken-style" passage and on the original passage.
# NOTE(review): both paths below are identical, so both pipelines load the same
# checkpoint and always return the same answers — confirm the intended paths.
model_checkpoint_spoken = '/content/drive/MyDrive/checkpoints_bert_base_cased'
model_checkpoint = '/content/drive/MyDrive/checkpoints_bert_base_cased'
question_answerer_spoken = pipeline("question-answering", model=model_checkpoint_spoken)
question_answerer = pipeline("question-answering", model=model_checkpoint)

# Variant of the passage without the emoji and with "one" written as "1".
context_spoken = """Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with 1 before loading them for inference with the other."""

context = """🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""

while True:
    question = input("Please input a question:")

    # A blank question ends the session: the loop previously had no exit, and
    # the pipeline rejects an empty question with an error.
    if not question.strip():
        break

    # Ask both pipelines about the spoken-style passage first, then the original one.
    for passage in (context_spoken, context):
        answer_spoken = question_answerer_spoken(question=question, context=passage)
        answer = question_answerer(question=question, context=passage)
        print(f'spoken answer: {answer_spoken}')
        print(f'answer: {answer}')

  

Please input a question:Which deep learning libraries back 🤗 Transformers?
spoken answer: {'score': 0.3543950617313385, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.3543950617313385, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
spoken answer: {'score': 0.3538737893104553, 'start': 77, 'end': 104, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.3538737893104553, 'start': 77, 'end': 104, 'answer': 'Jax, PyTorch and TensorFlow'}
Please input a question:what are the three most popular deep learning libraries?
spoken answer: {'score': 0.5048927664756775, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.5048927664756775, 'start': 75, 'end': 102, 'answer': 'Jax, PyTorch and TensorFlow'}
spoken answer: {'score': 0.47822535037994385, 'start': 77, 'end': 104, 'answer': 'Jax, PyTorch and TensorFlow'}
answer: {'score': 0.47822535037994385, 'start': 77, 'end': 104, 'answer': 'Jax, PyTorch