In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers

# Needed for Kaggle - import datasets from Drive - (links are probably still available)


## Import Datasets from Drive

In [None]:
# just import datasets from my drive (source: https://www.youtube.com/watch?v=rzz0uaNKAJY&ab_channel=ChafikBlm)
! conda install -y gdown # needed for kaggle

/bin/bash: conda: command not found


## import SQuAD

In [None]:
# for kaggle
!gdown 1bgkt-gBTVhM8LgvTvkZ_tItyiEGEJXOI  # get the data from drive (check the video above)
!unzip "./SQuAD.zip"



## import TriviaQA 

In [None]:
!gdown 1fTjdYW_1JO0XyO8Sq-SoCZOXwpiLSNm9
!unzip "./TriviaQA.zip"

## import NQ

In [None]:
!gdown 16PKZN2Bx7vrgHSL1hg-n2P5XWfSndcR7
!unzip "./NQ.zip"

## import QuAC

In [None]:
!gdown 1_Mkd_huzYSiIvv73e_t2Vj04Cb5ko-eo
!unzip "./QuAC.zip"

## import NewsQA

In [None]:
!gdown  1WHEAPldWLWhbh1J0ypZN6ECj7YpgLeSp
!unzip "./NewsQA.zip"

# Paths for datasets

In [2]:
# SQuAD
Location_squad_train = '/content/drive/MyDrive/AI_2/project4/Squad2.0/train-v2.0.json'
Location_squad_dev = '/content/drive/MyDrive/AI_2/project4/Squad2.0/dev-v2.0.json'


# TriviaQA
Location_trivia_train = '/content/drive/MyDrive/AI_2/project4/TrivialQA/triviaqa_train.json'
Location_trivia_dev = '/content/drive/MyDrive/AI_2/project4/TrivialQA/triviaqa_dev.json'


# Natural Questions
Location_nq_train = '/content/drive/MyDrive/AI_2/project4/NQ/nq_train.json'
Location_nq_dev = '/content/drive/MyDrive/AI_2/project4/NQ/nq_dev.json'


# QuAC
Location_quac_train = '/content/drive/MyDrive/AI_2/project4/QuAC/quac_train.json'
Location_quac_dev = '/content/drive/MyDrive/AI_2/project4/QuAC/quac_dev.json'


# NewsQA
Location_newsqa_train = '/content/drive/MyDrive/AI_2/project4/NewsQA/newsqa_train.json'
Location_newsqa_dev = '/content/drive/MyDrive/AI_2/project4/NewsQA/newsqa_dev.json'

# Import necessary libraries and set seeds

In [3]:
import torch
from tqdm.notebook import tqdm

# from transformers import BertTokenizer
# from torch.utils.data import TensorDataset
# from transformers import BertForSequenceClassification

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
import pandas as pd
import random
import numpy as np
import os
import time # time module 

import json



def set_seed(seed = 1234):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed(23456)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use 'cuda' if available else 'cpu'
print('Working on:', device)

Working on: cuda


# Prepare train and test set

In [4]:
# num_groups_prec will tell us how much data we take from dataset 
# if num_groups_precent = -1 then we take all the data
# if remove_no_answer == False, we will train model with all questions otherwise we will remove the questions without answer


def read_dataset(path, num_groups_percent=-1, remove_no_answers = False):
    num_groups = -1

    # open JSON file and load intro dictionary
    with open(path, 'rb') as f:
        squad_dict = json.load(f)
        
    no_answer = 0

    # initialize lists for contexts, questions, anLd answers
    contexts = []
    questions = []
    answers = []
    
    if num_groups_percent != -1:
        num_groups = len(squad_dict['data']) * num_groups_percent   # take the selected number of the whole dataset
    
    
    # iterate through all data in squad data
    for group in squad_dict['data']:
        
        if num_groups <= 0 and num_groups != -1:   # take only the selected groups if there are
          break 
        

        for passage in group['paragraphs']:
            context = passage['context']

            for qa in passage['qas']:
                question = qa['question']
                # check if we need to be extracting from 'answers' or 'plausible_answers'
                if 'plausible_answers' in qa.keys():
                    access = 'plausible_answers'
                else:
                    access = 'answers'
                
                if qa[access] != []:             # question with answers
                    for answer in qa[access]:
                        # append data to lists
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
                        
                elif qa[access] == []:             # questions without answers
                    if remove_no_answers == True:                             # questios without answers
                            no_answer += 1
                    elif remove_no_answers == False:
                            # append data to lists
                            contexts.append(context)
                            questions.append(question)
                            answers.append({'answer_start': 0, 'text': '-', 'answer_end': 1})
                    

        if num_groups != -1:
          num_groups -= 1
        
    
    # return formatted data lists
    if remove_no_answers == True:
        return contexts, questions, answers, no_answer
    
    return contexts, questions, answers
        

## Indicate the start and the end of each answer within the context

In [5]:
def add_end_idx(answers, contexts):
    # loop through each answer-context pair
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer['text']
        # we already know the start index
        start_idx = answer['answer_start']
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        # ...however, sometimes squad answers are off by a character or two
        if context[start_idx:end_idx] == gold_text:
            # if the answer is not off :)
            answer['answer_end'] = end_idx
        else:
            # this means the answer is off by 1-2 tokens
            for n in [1, 2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    answer['answer_start'] = start_idx - n
                    answer['answer_end'] = end_idx - n
            

## Tokenization function for Bert 

In [6]:
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    for i in range(len(answers)):

        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) # each number has a word from answer
        end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))

        # print(i, ') ', answers[i], 'start_positions: ', start_positions[-1], 'end_position: ', end_positions[-1], ' -- ', answers[i]['answer_end'], answers[i]['answer_start'])

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length

        # if i == 1541:
        shift = 1
        # end position cannot be found, char_to_token found space, so shift position until found ( little cheat :P )  
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] - shift)
            shift += 1

 
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

## Initializing the Dataset Squad 

In [7]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Bert model

## Tokenization with BertTokenizer and Encoding the data

In [8]:
from transformers import AutoTokenizer, BertForQuestionAnswering

# initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                                      do_lower_case=True)

# f1_score and exact_score from https://rajpurkar.github.io/SQuAD-explorer/

In [9]:
# Function for exc
def extract_tokens(compressed_tokens, positions):
  tokens = dict()

  sp = positions[0]     # start positions for each answer
  ep = positions[1]     # end positions for each answer (it's not actually the start position, is the first token for the answer)
  
  num_answers = len(sp)
  index = 0
  max_length = len(compressed_tokens['input_ids'][index])   # this is the max_length from tokenizer !
  
  for start, end in zip(sp, ep):  
    tokens[index] = []          # we will fill this list with tokens for each question
    
    start = int(start)
    end = int(end)

    if start > end:
      # print('index: ', index)
      if end <= max_length:    # we have to check it if we have argument max_length in tokenizer
        tokens[index].append(tokenizer.decode(compressed_tokens['input_ids'][index][end]))
      if start <= max_length :
        tokens[index].append(tokenizer.decode(compressed_tokens['input_ids'][index][start]))
    else:
      for i in range(start, end+1):
        # print(i)
        if i <= max_length:
          tokens[index].append(tokenizer.decode(compressed_tokens['input_ids'][index][i]))
    index += 1

  return tokens

In [10]:
import argparse
import collections
import json
import numpy as np
import os
import re
import string
import sys


def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

  
def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()



def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
      # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
      return int(gold_toks == pred_toks)
    if num_same == 0:
      return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1


def compute_f1_score(truth_answers, pred_answers):
    num_answers = len(truth_answers)   # number of answers for batch ( == batch_size)
    total_f1_score = 0

    for curr_answer in range(0, num_answers):    # for each answer compute f1 score
      a_gold = ' '.join(truth_answers[curr_answer])     # get the current truth answer
      a_pred = ' '.join(pred_answers[curr_answer])      # get the current predicted answer

      total_f1_score += compute_f1(a_gold, a_pred)

    return total_f1_score / num_answers


def compute_exact(a_gold, a_pred):
  return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_exact_score(truth_answers, pred_answers):
    num_answers = len(truth_answers)   # number of answers for batch ( == batch_size)
    total_exact_score = 0

    for curr_answer in range(0, num_answers):    # for each answer compute f1 score
      a_gold = ' '.join(truth_answers[curr_answer])     # get the current truth answer
      a_pred = ' '.join(pred_answers[curr_answer])      # get the current predicted answer

      total_exact_score += compute_exact(a_gold, a_pred)

    return total_exact_score / num_answers



## Define Bert-model

In [11]:
from transformers import BertForQuestionAnswering
from torch.utils.data import DataLoader



model_Bert = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

# Evaluate Function

In [12]:
from torch.utils.data import DataLoader


# returns validation losses, accuracy, exact, f1 scores
def evaluate(model, dataloader_val):
    test_loss = []
    acc_train = []
    acc_test = []
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model.eval()  # switch model out of training mode
    for batch in tqdm(dataloader_val):
        # we don't need to calculate gradients as we're not training
        with torch.no_grad():
            # pull all the tensor batches required for training
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # we will use true positions for accuracy calc
            start_true = batch['start_positions'].to(device)
            end_true = batch['end_positions'].to(device)


            # train model on batch and return outputs (incl. loss)
            outputs = model(input_ids, attention_mask=attention_mask,
                                start_positions=start_true,
                                end_positions=end_true)
            
            # extract loss
            loss = outputs[0]

            # pull prediction tensors out and argmax to get predicted tokens
            start_pred = torch.argmax(outputs['start_logits'], dim=1)
            end_pred = torch.argmax(outputs['end_logits'], dim=1)

            # ~~~ Compute F1 score with tokens ~~~~
            # extract truth tokens
            truth_positions = [start_true.detach().cpu(), end_true.detach().cpu()]
            truth_tokens = extract_tokens(batch, truth_positions)

            # extract predicted tokens
            pred_positions = [start_pred, end_pred]
            pred_tokens = extract_tokens(batch, pred_positions)

            f1_scores.append (compute_f1_score(truth_tokens, pred_tokens))
            exact_scores.append(compute_exact_score(truth_tokens, pred_tokens))


                      
            test_loss.append(loss.item())



            # calculate accuracy for both and append to accuracy list
            acc_test.append(((start_pred == start_true).sum()/len(start_pred)).item())
            acc_test.append(((end_pred == end_true).sum()/len(end_pred)).item())
    
    # calculate the average test loss
    test_loss_temp = np.mean(test_loss)

    # calculate average accuracy in total
    acc_test_temp = np.mean(acc_test)
    f1_score_temp = np.mean(f1_scores)
    exact_score_temp = np.mean(exact_scores)
    
    return test_loss_temp, acc_test_temp, exact_score_temp, f1_score_temp 




# Evaluate the current pre-trained model with dev in given location
def Evaluate_on(model, Location_datasetName_dev, remove_no_answers=True):
    
    # Prepare the validation data from dataset in Location_datasetName_dev
    num_groups = -1  # getting all data
    
    if remove_no_answers == True:
      val_contexts, val_questions, val_answers, no_answers_counter = read_dataset(Location_datasetName_dev, num_groups, remove_no_answers) # convert data to lists
    else:
      val_contexts, val_questions, val_answers = read_dataset(Location_datasetName_dev, num_groups, remove_no_answers) # convert data to lists
    
    add_end_idx(val_answers, val_contexts)   # indicate each answer
    val_encodings = tokenizer(val_contexts,  # encoding context and questions
                              val_questions, 
                              truncation=True, 
                              max_length=324,
                              padding=True)

    add_token_positions(val_encodings, val_answers) # find the answer and keep the start and end position
    val_dataset = SquadDataset(val_encodings)

    BATCH_SIZE = 8
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE) # create the batches for validation set

    # Get  status
    test_loss, Val_Acc, exact_score, f1_score = evaluate (model, val_loader)  #  evaluate the model and get scores
    
    return test_loss, Val_Acc, exact_score, f1_score
    

# SQuAD

## Fine-tuning Bert model with SQuAD dataset

In [13]:
# Prepare the train data from squad
train_contexts, train_questions, train_answers = read_dataset(Location_squad_train, 
                                                              num_groups_percent= -1, 
                                                              remove_no_answers=False) # get all the data into to lists

print(len(train_contexts))  # from https://arxiv.org/pdf/2004.03490.pdf (table 1) this should be 130,319


add_end_idx(train_answers, train_contexts) #indicate each answer
train_encodings = tokenizer(train_contexts, # encoding context and questions
                          train_questions, 
                          truncation=True, 
                          max_length=324,
                          padding=True)
add_token_positions(train_encodings, train_answers)
train_dataset = SquadDataset(train_encodings)

BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


130319


In [None]:
from tqdm import tqdm

# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = torch.optim.AdamW(model_Bert.parameters(), lr=2e-5)

epochs = 2


train_losses = []
test_losses = []

for epoch in range(epochs):
    
    train_loss = []
    test_loss = []
    acc_train = []
    acc_test = []

    # for compute f1
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model_Bert.train()   # set model to train mode
    with torch.set_grad_enabled(True):
      # setup loop (we use tqdm for the progress bar)
      progress_bar = tqdm(train_loader, f"Epoch: {epoch+1}")
      for batch in progress_bar:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()

          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # print('start_positions', start_positions)
          
          # train model on batch and return outputs (incl. loss)
          outputs = model_Bert(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # get the predictions
          start_pred = torch.argmax(outputs['start_logits'], dim=1)
          end_pred = torch.argmax(outputs['end_logits'], dim=1)

          
          # extract loss
          loss = outputs[0]

          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          train_loss.append(loss.item())

          # calculate accuracy for both and append to accuracy list
          acc_train.append(((start_pred == start_positions).sum()/len(start_pred)).item())
          acc_train.append(((end_pred == end_positions).sum()/len(end_pred)).item())

          # print relevant info to progress bar
          # progress_bar.set_description(f'Epoch {epoch+1}')
          progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(train_loss))})
    
    # get the average loss
    train_loss_temp = np.mean(train_loss)
    train_losses.append(train_loss_temp)
    
    acc_train_temp = np.mean(acc_train)   # average accuracy of the train set


    tqdm.write(f'''Epoch: {epoch+1} | Train Loss: {train_loss_temp:.3f} | Train Acc: {acc_train_temp*100:.2f}%''')


Epoch: 1: 100%|██████████| 16290/16290 [1:24:00<00:00,  3.23it/s, training_loss=1.343]


Epoch: 1 | Train Loss: 1.343 | Train Acc: 61.77%


Epoch: 2: 100%|██████████| 16290/16290 [1:24:19<00:00,  3.22it/s, training_loss=0.833]

Epoch: 2 | Train Loss: 0.833 | Train Acc: 74.14%





In [None]:
# # Save model 
# import torch
# torch.save(model_Bert.state_dict(), './model_SQuAD.pth')

<a href="./model_TriviaQA"> Download model</a>

## Load fine-tuned model from Drive

In [112]:
# I have upload model in my drive
# !gdown 1F9a5vrT5YJHfJoae2OpmnOF-aWjqpdh3    # for kaggle
! unzip "/content/drive/MyDrive/AI_2/project4/Models/model_SQuAD.zip"

Archive:  /content/drive/MyDrive/AI_2/project4/Models/model_SQuAD.zip
  inflating: model_SQuAD.pth         


In [113]:
# load model
from collections import OrderedDict
import collections
model_check = torch.load('./model_SQuAD.pth')
model_Bert.load_state_dict(model_check)

<All keys matched successfully>

## Evaluate with SQuAD 

In [115]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_squad_dev, remove_no_answers=True)

  0%|          | 0/3281 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 1.261 
Val_acc:	 66.56%  
Exact score:	 59.31%  
F1 score:	 75.38% 


## Evaluate with TriviaQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_trivia_dev, remove_no_answers=True)

  0%|          | 0/1230 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.445 
Val_acc:	 28.35%  
Exact score:	 33.95%  
F1 score:	 43.39% 


## Evaluate with Natural Questions

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_nq_dev, remove_no_answers=True)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 2.310 
Val_acc:	 47.16%  
Exact score:	 37.39%  
F1 score:	 57.61% 


## Evaluate with QuAC

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=True)

  0%|          | 0/734 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.941 
Val_acc:	 16.11%  
Exact score:	 5.21%  
F1 score:	 18.32% 


## Evaluate with NewsQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_newsqa_dev, remove_no_answers=True)

  0%|          | 0/646 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.639 
Val_acc:	 27.55%  
Exact score:	 27.28%  
F1 score:	 41.53% 


# TriviaQA

## Fine-tuning Bert model with TriviaQA dataset

In [None]:
# Prepare the train data from triviaqa
train_contexts, train_questions, train_answers = read_dataset(Location_trivia_train, 
                                                              num_groups_percent= -1, 
                                                              remove_no_answers=True) # get all the data into to lists
print(len(train_contexts))  # from https://arxiv.org/pdf/2004.03490.pdf (table 1) this should be 110,647 ( i removed questions without answers because my ram cannot handle all this data)

add_end_idx(train_answers, train_contexts) #indicate each answer
train_encodings = tokenizer(train_contexts, # encoding context and questions
                          train_questions, 
                          truncation=True, 
                          max_length=324,
                          padding=True)
add_token_positions(train_encodings, train_answers)
train_dataset = SquadDataset(train_encodings)

BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


In [None]:
from tqdm import tqdm

# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = torch.optim.AdamW(model_Bert.parameters(), lr=2e-5)

epochs = 2


train_losses = []
test_losses = []

for epoch in range(epochs):
    
    train_loss = []
    test_loss = []
    acc_train = []
    acc_test = []

    # for compute f1
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model_Bert.train()   # set model to train mode
    with torch.set_grad_enabled(True):
      # setup loop (we use tqdm for the progress bar)
      progress_bar = tqdm(train_loader, f"Epoch: {epoch+1}")
      for batch in progress_bar:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()

          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # print('start_positions', start_positions)
          
          # train model on batch and return outputs (incl. loss)
          outputs = model_Bert(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # get the predictions
          start_pred = torch.argmax(outputs['start_logits'], dim=1)
          end_pred = torch.argmax(outputs['end_logits'], dim=1)

          
          # extract loss
          loss = outputs[0]

          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          train_loss.append(loss.item())

          # calculate accuracy for both and append to accuracy list
          acc_train.append(((start_pred == start_positions).sum()/len(start_pred)).item())
          acc_train.append(((end_pred == end_positions).sum()/len(end_pred)).item())

          # print relevant info to progress bar
          # progress_bar.set_description(f'Epoch {epoch+1}')
          progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(train_loss))})
    
    # get the average loss
    train_loss_temp = np.mean(train_loss)
    train_losses.append(train_loss_temp)
    
    acc_train_temp = np.mean(acc_train)   # average accuracy of the train set


    tqdm.write(f'''Epoch: {epoch+1} | Train Loss: {train_loss_temp:.3f} | Train Acc: {acc_train_temp*100:.2f}%''')

Epoch: 1: 100%|██████████| 9636/9636 [49:40<00:00,  3.23it/s, training_loss=1.824]


Epoch: 1 | Train Loss: 1.824 | Train Acc: 46.20%


Epoch: 2: 100%|██████████| 9636/9636 [49:43<00:00,  3.23it/s, training_loss=1.076]


Epoch: 2 | Train Loss: 1.076 | Train Acc: 60.45%


In [None]:
# # Save model 
# import torch
# torch.save(model_Bert.state_dict(), './model_TriviaQA.pth')

<a href="./model_TriviaQA.pth"> Download model</a>

## Load fine-tuned model

In [None]:
# Load model from Drive
# !gdown 1pobs6Bp2ZeXWzOY3qcT97CqWyh2mBcbN  # for kaggle
!unzip "/content/drive/MyDrive/AI_2/project4/Models/model_TriviaQA.zip"


Archive:  /content/drive/MyDrive/AI_2/project4/Models/model_TriviaQA.zip
replace model_TriviaQA.pth? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
from collections import OrderedDict
import collections
model_check = torch.load('./model_TriviaQA.pth')
model_Bert.load_state_dict(model_check)

<All keys matched successfully>

## Evaluate with TriviaQA dataset

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_trivia_dev, remove_no_answers=True)

  0%|          | 0/1230 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 1.412 
Val_acc:	 55.92%  
Exact score:	 47.05%  
F1 score:	 53.08% 


## Evaluate with SQuAD dataset

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_squad_dev, remove_no_answers=True)

  0%|          | 0/3279 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.351 
Val_acc:	 23.10%  
Exact score:	 23.39%  
F1 score:	 35.69% 


## Evaluate with Natural Questions


In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_nq_dev, remove_no_answers=True)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 6.094 
Val_acc:	 22.16%  
Exact score:	 20.21%  
F1 score:	 36.79% 


## Evaluate with QuAC

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=True)

  0%|          | 0/734 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ') #  :(

Test loss:	 8.373 
Val_acc:	 8.84%  
Exact score:	 1.21%  
F1 score:	 9.40% 


## Evaluate wtih NewsQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_newsqa_dev, remove_no_answers=True)

  0%|          | 0/646 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.394 
Val_acc:	 26.02%  
Exact score:	 16.30%  
F1 score:	 26.08% 


# Natural Questions

## Fine-tuning Bert model with TriviaQA dataset

In [18]:
# Prepare the train data from triviaqa
train_contexts, train_questions, train_answers, no_answers_counter = read_dataset(Location_nq_train, 
                                                              num_groups_percent= -1, 
                                                              remove_no_answers=True) # get all the data into to lists

print(len(train_answers) , '+',  no_answers_counter, '=', len(train_answers) +  no_answers_counter)  # from paper should be 110,857 (2004)  - we use this dataset in 2022 :), they have probably added contexts.
# We will train our model only with 72K contexts, we removed the questions without answers 

add_end_idx(train_answers, train_contexts) #indicate each answer
train_encodings = tokenizer(train_contexts, # encoding context and questions
                          train_questions, 
                          truncation=True, 
                          max_length=324,
                          padding=True)
add_token_positions(train_encodings, train_answers)
train_dataset = SquadDataset(train_encodings)

BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


74218 + 36647 = 110865


In [None]:
from tqdm import tqdm

# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = torch.optim.AdamW(model_Bert.parameters(), lr=2e-5)

epochs = 2


train_losses = []
test_losses = []

for epoch in range(epochs):
    
    train_loss = []
    test_loss = []
    acc_train = []
    acc_test = []

    # for compute f1
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model_Bert.train()   # set model to train mode
    with torch.set_grad_enabled(True):
      # setup loop (we use tqdm for the progress bar)
      progress_bar = tqdm(train_loader, f"Epoch: {epoch+1}")
      for batch in progress_bar:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()

          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # print('start_positions', start_positions)
          
          # train model on batch and return outputs (incl. loss)
          outputs = model_Bert(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # get the predictions
          start_pred = torch.argmax(outputs['start_logits'], dim=1)
          end_pred = torch.argmax(outputs['end_logits'], dim=1)

          
          # extract loss
          loss = outputs[0]

          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          train_loss.append(loss.item())

          # calculate accuracy for both and append to accuracy list
          acc_train.append(((start_pred == start_positions).sum()/len(start_pred)).item())
          acc_train.append(((end_pred == end_positions).sum()/len(end_pred)).item())

          # print relevant info to progress bar
          # progress_bar.set_description(f'Epoch {epoch+1}')
          progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(train_loss))})
    
    # get the average loss
    train_loss_temp = np.mean(train_loss)
    train_losses.append(train_loss_temp)
    
    acc_train_temp = np.mean(acc_train)   # average accuracy of the train set


    tqdm.write(f'''Epoch: {epoch+1} | Train Loss: {train_loss_temp:.3f} | Train Acc: {acc_train_temp*100:.2f}%''')

Epoch: 1: 100%|██████████| 9278/9278 [47:48<00:00,  3.23it/s, training_loss=1.470]


Epoch: 1 | Train Loss: 1.470 | Train Acc: 58.41%


Epoch: 2: 100%|██████████| 9278/9278 [47:49<00:00,  3.23it/s, training_loss=0.894]

Epoch: 2 | Train Loss: 0.894 | Train Acc: 72.35%





In [None]:
# # Save model 
# import torch
# torch.save(model_Bert.state_dict(), './model_NQ.pth')

<a href="./model_NQ.pth"> Download model</a>

## Load fine-tuned Bert model

In [None]:
# Load model from Drive
# !gdown 1Bo1BFGtrKi2DRPVGROgJhpqmZmS6XyEf
!unzip "/content/drive/MyDrive/AI_2/project4/Models/model_NQ.zip"

Archive:  /content/drive/MyDrive/AI_2/project4/Models/model_NQ.zip
  inflating: model_NQ.pth            


In [None]:
import collections
from collections import OrderedDict

model_check = torch.load('./model_NQ.pth')
model_Bert.load_state_dict(model_check)

<All keys matched successfully>

## Evaluate with Natural Questions

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_nq_dev, remove_no_answers=True)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 1.402 
Val_acc:	 60.79%  
Exact score:	 50.14%  
F1 score:	 70.34% 


## Evaluate with SQuAD dataset

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_squad_dev, remove_no_answers=True)

  0%|          | 0/3279 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 1.956 
Val_acc:	 49.37%  
Exact score:	 40.74%  
F1 score:	 59.93% 


## Evaluate with TriviaQA dataset

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_trivia_dev, remove_no_answers=True)

  0%|          | 0/1230 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.170 
Val_acc:	 25.33%  
Exact score:	 29.29%  
F1 score:	 40.46% 


## Evaluate with QuAC dataset

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=True)

  0%|          | 0/734 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.480 
Val_acc:	 8.36%  
Exact score:	 2.81%  
F1 score:	 14.36% 


## Evaluate with NewsQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=True)

  0%|          | 0/734 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 2.971 
Val_acc:	 24.61%  
Exact score:	 6.81%  
F1 score:	 21.35% 


# QuAC


## Fine-tuning Bert model with QuAC dataset

In [19]:
# Prepare the train data from triviaqa
train_contexts, train_questions, train_answers = read_dataset(Location_quac_train, 
                                                              num_groups_percent= -1, 
                                                              remove_no_answers=False) # get all the data into to lists

print(len(train_contexts), len(train_answers), len(train_questions)) # from https://arxiv.org/pdf/2004.03490.pdf (pg.2 -> table 1) this should be 83,568

add_end_idx(train_answers, train_contexts) #indicate each answer
train_encodings = tokenizer(train_contexts, # encoding context and questions
                          train_questions, 
                          truncation=True, 
                          max_length=324,
                          padding=True)
add_token_positions(train_encodings, train_answers)
train_dataset = SquadDataset(train_encodings)

BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)


83568 83568 83568


In [None]:
from tqdm import tqdm

# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = torch.optim.AdamW(model_Bert.parameters(), lr=2e-5)

epochs = 2


train_losses = []
test_losses = []

for epoch in range(epochs):
    
    train_loss = []
    test_loss = []
    acc_train = []
    acc_test = []

    # for compute f1
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model_Bert.train()   # set model to train mode
    with torch.set_grad_enabled(True):
      # setup loop (we use tqdm for the progress bar)
      progress_bar = tqdm(train_loader, f"Epoch: {epoch+1}")
      for batch in progress_bar:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()

          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # print('start_positions', start_positions)
          
          # train model on batch and return outputs (incl. loss)
          outputs = model_Bert(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # get the predictions
          start_pred = torch.argmax(outputs['start_logits'], dim=1)
          end_pred = torch.argmax(outputs['end_logits'], dim=1)

          
          # extract loss
          loss = outputs[0]

          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          train_loss.append(loss.item())

          # calculate accuracy for both and append to accuracy list
          acc_train.append(((start_pred == start_positions).sum()/len(start_pred)).item())
          acc_train.append(((end_pred == end_positions).sum()/len(end_pred)).item())

          # print relevant info to progress bar
          # progress_bar.set_description(f'Epoch {epoch+1}')
          progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(train_loss))})
    
    # get the average loss
    train_loss_temp = np.mean(train_loss)
    train_losses.append(train_loss_temp)
    
    acc_train_temp = np.mean(acc_train)   # average accuracy of the train set


    tqdm.write(f'''Epoch: {epoch+1} | Train Loss: {train_loss_temp:.3f} | Train Acc: {acc_train_temp*100:.2f}%''')

Epoch: 1: 100%|██████████| 10446/10446 [53:40<00:00,  3.24it/s, training_loss=2.966]


Epoch: 1 | Train Loss: 2.966 | Train Acc: 25.59%


Epoch: 2: 100%|██████████| 10446/10446 [53:53<00:00,  3.23it/s, training_loss=2.442]

Epoch: 2 | Train Loss: 2.442 | Train Acc: 31.60%





In [None]:
# # Save model 
# import torch
# torch.save(model_Bert.state_dict(), './model_QuAC.pth')

<a href="./model_QuAC"> Download model</a>

## Load fine-tuned Bert model

In [None]:
# Load model from Drive
# !gdown 1c9j5vP8tbEp3b0mLWKKkTjhbUirlu5WN

!unzip "/content/drive/MyDrive/AI_2/project4/Models/model_QuAC.zip"

Archive:  /content/drive/MyDrive/AI_2/project4/Models/model_QuAC.zip
  inflating: model_QuAC.pth          


In [None]:
from collections import OrderedDict
import collections
model_check = torch.load('./model_QuAC.pth')
model_Bert.load_state_dict(model_check)

<All keys matched successfully>

## Evaluate with QuAC

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=True)

  0%|          | 0/734 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 2.971 
Val_acc:	 24.61%  
Exact score:	 6.81%  
F1 score:	 21.35% 


## Evaluate with SQuAD

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_squad_dev, remove_no_answers=True)

  0%|          | 0/3279 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.523 
Val_acc:	 9.79%  
Exact score:	 2.38%  
F1 score:	 13.52% 


## Evaluate with TriviaQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_trivia_dev, remove_no_answers=True)

  0%|          | 0/1230 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.410 
Val_acc:	 11.41%  
Exact score:	 2.73%  
F1 score:	 10.45% 


## Evaluate with Natural Questions

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_nq_dev, remove_no_answers=True)

  0%|          | 0/457 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 3.993 
Val_acc:	 16.19%  
Exact score:	 3.50%  
F1 score:	 21.58% 


## Evaluate with NewsQA

In [None]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_newsqa_dev, remove_no_answers=True)

  0%|          | 0/646 [00:00<?, ?it/s]

In [None]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.971 
Val_acc:	 13.03%  
Exact score:	 4.90%  
F1 score:	 17.99% 


# NewsQA

## Fine-tuning Bert model with QuAC dataset

In [None]:
# Prepare the train data from triviaqa
num_groups = -1 #getting all data
train_contexts, train_questions, train_answers = read_dataset(Location_newsqa_train, 
                                                              num_groups_percent= 0.95, 
                                                              remove_no_answers=False) #get the 95 % of the data, that's how can hold the RAM from kaggle
print(len(train_answers))

add_end_idx(train_answers, train_contexts) #indicate each answer
train_encodings = tokenizer(train_contexts, # encoding context and questions
                          train_questions, 
                          truncation=True, 
                          max_length=324,
                          padding=True)
add_token_positions(train_encodings, train_answers)
train_dataset = SquadDataset(train_encodings)

BATCH_SIZE = 8
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)

87922


In [None]:
from tqdm import tqdm

# initialize adam optimizer with weight decay (reduces chance of overfitting)
optim = torch.optim.AdamW(model_Bert.parameters(), lr=2e-5)

epochs = 2


train_losses = []
test_losses = []

for epoch in range(epochs):
    
    train_loss = []
    test_loss = []
    acc_train = []
    acc_test = []

    # for compute f1
    pred_start = []
    pred_end = []
    true_start = []
    true_end = []
    f1_scores = []
    exact_scores = []

    model_Bert.train()   # set model to train mode
    with torch.set_grad_enabled(True):
      # setup loop (we use tqdm for the progress bar)
      progress_bar = tqdm(train_loader, f"Epoch: {epoch+1}")
      for batch in progress_bar:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()

          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # print('start_positions', start_positions)
          
          # train model on batch and return outputs (incl. loss)
          outputs = model_Bert(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # get the predictions
          start_pred = torch.argmax(outputs['start_logits'], dim=1)
          end_pred = torch.argmax(outputs['end_logits'], dim=1)

          
          # extract loss
          loss = outputs[0]

          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          train_loss.append(loss.item())

          # calculate accuracy for both and append to accuracy list
          acc_train.append(((start_pred == start_positions).sum()/len(start_pred)).item())
          acc_train.append(((end_pred == end_positions).sum()/len(end_pred)).item())

          # print relevant info to progress bar
          progress_bar.set_postfix({'training_loss': '{:.3f}'.format(np.mean(train_loss))})
    
    # get the average loss
    train_loss_temp = np.mean(train_loss)
    train_losses.append(train_loss_temp)
    
    acc_train_temp = np.mean(acc_train)   # average accuracy of the train set


    tqdm.write(f'''Epoch: {epoch+1} | Train Loss: {train_loss_temp:.3f} | Train Acc: {acc_train_temp*100:.2f}%''')

Epoch: 1: 100%|██████████| 10991/10991 [56:34<00:00,  3.24it/s, training_loss=2.002]


Epoch: 1 | Train Loss: 2.002 | Train Acc: 44.87%


Epoch: 2: 100%|██████████| 10991/10991 [56:37<00:00,  3.24it/s, training_loss=1.369]

Epoch: 2 | Train Loss: 1.369 | Train Acc: 56.65%





In [None]:
# # Save model 
# import torch
# torch.save(model_Bert.state_dict(), './model_NewsQA.pth')

<a href="./model_NewsQA"> Download model</a>

## Load fine-tuned Bert model

In [14]:
# Load model from Drive
# !gdown path
!unzip "/content/drive/MyDrive/AI_2/project4/Models/model_NewsQA.zip"

Archive:  /content/drive/MyDrive/AI_2/project4/Models/model_NewsQA.zip
  inflating: model_NewsQA.pth        


In [15]:
from collections import OrderedDict
import collections
model_check = torch.load('./model_NewsQA.pth')
model_Bert.load_state_dict(model_check)

<All keys matched successfully>

## Evaluate with NewsQA

In [23]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_newsqa_dev, remove_no_answers=False)

  0%|          | 0/646 [00:00<?, ?it/s]

In [22]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 1.843 
Val_acc:	 50.33%  
Exact score:	 37.04%  
F1 score:	 50.47% 


## Evaluate with SQuAD

In [24]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_squad_dev, remove_no_answers=False)   # getting all the data

  0%|          | 0/3281 [00:00<?, ?it/s]

In [25]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.831 
Val_acc:	 37.03%  
Exact score:	 41.13%  
F1 score:	 60.75% 


## Evaluate with TriviaQA

In [26]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_trivia_dev, remove_no_answers=False)

  0%|          | 0/1779 [00:00<?, ?it/s]

In [27]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 4.519 
Val_acc:	 27.33%  
Exact score:	 20.24%  
F1 score:	 28.83% 


## Evaluate with Natural Questions

In [32]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ')

Test loss:	 5.664 
Val_acc:	 30.88%  
Exact score:	 33.34%  
F1 score:	 53.67% 


## Evaluate with QuAC

In [28]:
test_loss, Val_Acc, exact_score, f1_score = Evaluate_on(model_Bert, Location_quac_dev, remove_no_answers=False)

  0%|          | 0/920 [00:00<?, ?it/s]

In [29]:
print(f'Test loss:\t {test_loss:.3f} \nVal_acc:\t {Val_Acc*100:.2f}%  \nExact score:\t {exact_score*100:.2f}%  \nF1 score:\t {f1_score*100:.2f}% ') # :(

Test loss:	 7.850 
Val_acc:	 8.52%  
Exact score:	 2.32%  
F1 score:	 12.20% 
