# Question Answering with Bert
*by Nefeli Tavoulari*

#### In this notebook, I fine-tune a pre-trained BERT model to answer questions.

## Install Dependencies

In [1]:
%%capture
!pip install transformers
!pip install datasets

## Import Packages

In [2]:
%matplotlib inline
import io
import re
import csv
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score

import transformers
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertForQuestionAnswering
from datasets import load_dataset
import logging

# Seed PyTorch (CPU and CUDA) so that runs are repeatable.
# NOTE(review): numpy and Python's `random` module are not seeded in this cell.
SEED = 1234
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True
# Root logger at INFO; the transformers library only reports errors.
logging.basicConfig(level=logging.INFO)
transformers.logging.set_verbosity_error()

## Use GPU for faster processing

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Available device:", device)

Available device: cuda


## Load Bert tokenizer and model

In [4]:
# Load the tokenizer and a BERT question-answering model from the 'bert-base-uncased'
# checkpoint, then move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

# Alternative checkpoint (name suggests it is already fine-tuned on SQuAD).
# NOTE(review): BertTokenizer is not among the names imported in the import cell;
# import it before enabling these lines.
#tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

## Helper functions for evaluation (text normalization, Exact Match, F1)

In [5]:
def normalize_text(s):
    """Normalize an answer string before comparing it with another one.

    Steps, applied in this order: lower-case, remove every character found in
    `string.punctuation`, replace the stand-alone words "a", "an" and "the"
    with a space, and collapse all runs of whitespace into single spaces.

    Args:
        s: the raw text.

    Returns:
        The normalized text (possibly an empty string).
    """
    lowered = s.lower()

    # strip punctuation character by character
    punctuation = set(string.punctuation)
    without_punct = "".join(filter(lambda ch: ch not in punctuation, lowered))

    # blank out English articles (whole words only)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)

    # split/join also trims leading and trailing whitespace
    return " ".join(without_articles.split())
    
def compute_exact_match(prediction, truth):
    """Return 1 when prediction and truth are equal after `normalize_text`, otherwise 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted answer and a reference answer.

    Both strings are normalized with `normalize_text` and split on whitespace.
    The overlap is a multiset intersection, so a token that occurs several
    times is counted as often as it appears in both answers.

    Args:
        prediction: the predicted answer text.
        truth: the reference answer text.

    Returns:
        The F1 score in [0, 1]. When either side has no tokens the result is
        the int 1 if both are empty and 0 otherwise; 0 is also returned when
        no token is shared.
    """
    from collections import Counter  # local import keeps this cell self-contained

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps the minimum count per token; a plain set would
    # count every repeated token once and deflate precision and recall.
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(overlap.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

## SQuAD

### Upload dataset - Create and Clean dataframes

In [6]:
# Load the 'train' and 'validation' splits of the 'squad_v2' dataset.
# The names end in _df, but the DataFrame conversion happens in the next cell.
train_df, dev_df = load_dataset('squad_v2', split=['train', 'validation'])



  0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
# convert to dataframes
# (rebinds train_df / dev_df from the loaded splits to pandas DataFrames)
train_df = pd.DataFrame(train_df)
dev_df = pd.DataFrame(dev_df)

In [8]:
# train_df = train_df.iloc[0:10,:]
# dev_df = dev_df.iloc[0:10,:]

In [9]:
# Drop rows that have no question, then remove the 'id' and 'title' columns,
# which are not read anywhere later in the notebook.
train_df.dropna(subset=["question"], inplace=True)
train_df.drop(columns=["id", "title"], inplace=True)

dev_df.dropna(subset=["question"], inplace=True)
dev_df.drop(columns=["id", "title"], inplace=True)

In [10]:
def get_dataframe(df):
  """Flatten SQuAD-style rows into parallel lists, one entry per (question, answer) pair.

  Every row must provide 'context', 'question' and 'answers', where 'answers'
  holds the parallel lists 'text' and 'answer_start' (character offsets).
  A row with several answers produces one entry per answer; a row without
  answers produces a single entry whose answer fields are all None.

  The end offset is start + len(answer) (exclusive). When the context slice
  does not reproduce the answer, the span is retried shifted left by 1 and
  by 2 characters; if neither matches, (0, 0) is stored.

  Returns:
    dict with keys 'context', 'question', 'answer' (a dict with 'text',
    'answer_start', 'answer_end') and 'all_answers' (every answer text of
    the originating row, or None for unanswerable rows).
  """
  contexts, questions = [], []
  texts, starts, ends = [], [], []
  plausible = []

  for _, row in df.iterrows():
    passage = row['context']
    gold = row['answers']

    if not gold['text']:
      # unanswerable question: keep the pair, mark every answer field as None
      contexts.append(passage)
      questions.append(row['question'])
      texts.append(None)
      plausible.append(None)
      starts.append(None)
      ends.append(None)
      continue

    for pos, ans in enumerate(gold['text']):
      begin = gold['answer_start'][pos]
      finish = begin + len(ans)

      if passage[begin:finish] != ans:
        # the annotated offset can be off by 1 or 2 characters;
        # the last matching shift wins, (0, 0) when none matches
        located = (0, 0)
        for shift in (1, 2):
          if passage[begin - shift:finish - shift] == ans:
            located = (begin - shift, finish - shift)
        begin, finish = located

      starts.append(begin)
      ends.append(finish)
      contexts.append(passage)
      questions.append(row['question'])
      texts.append(ans)
      plausible.append(gold['text'])      # all plausible answers of this row

  return {
    'context': contexts,
    'question': questions,
    'answer': {'text': texts, 'answer_start': starts, 'answer_end': ends},
    'all_answers': plausible,
  }

# From here on train_df / dev_df are plain dicts of lists (see get_dataframe), not DataFrames.
train_df = get_dataframe(train_df)
dev_df = get_dataframe(dev_df)

In [11]:
# get answers' token positions using answers' character positions
def add_token_positions(encoding, answer, answer_start, answer_end):
    """Convert character-level answer spans into token-level start/end positions.

    Args:
        encoding: tokenizer output providing `char_to_token(batch_index, char_index)`.
        answer: dict with the parallel lists 'text', 'answer_start' and
            'answer_end' (character offsets, end exclusive; None for
            unanswerable questions).
        answer_start: unused; the offsets are read from `answer`. Kept so
            existing callers keep working.
        answer_end: unused; see `answer_start`.

    Returns:
        (start, end): two lists of token indices, one entry per example.
        `end` is exclusive, i.e. it points at the token right after the
        answer, which matches the `[start:end]` slice used when decoding
        predictions. Unanswerable examples get (0, 0). A position that cannot
        be mapped to a token is set to `tokenizer.model_max_length`.
    """
    start = []
    end = []
    for i in range(len(answer["text"])):
        char_start = answer['answer_start'][i]
        if char_start is None:
            # no answer
            start.append(0)
            end.append(0)
            continue
        char_end = answer['answer_end'][i]

        token_start = encoding.char_to_token(i, char_start)
        if token_start is None:
            token_start = tokenizer.model_max_length

        # char_end is the first character AFTER the answer. When that character
        # belongs to a token, that token is already the exclusive end.
        token_end = encoding.char_to_token(i, char_end)
        if token_end is None:
            # The character after the answer has no token (lookup returned None).
            # Take the token of the last answer character and step one past it,
            # so the end stays exclusive instead of silently becoming inclusive.
            last_token = encoding.char_to_token(i, char_end - 1)
            if last_token is not None:
                token_end = last_token + 1
        if token_end is None:
            token_end = tokenizer.model_max_length

        start.append(token_start)
        end.append(token_end)
    return start, end

In [12]:
# create dataset class to gather and organize all info
class SquadDataset(torch.utils.data.Dataset):
  """Dataset wrapper around tokenizer encodings.

  `encodings` must offer `.items()` yielding (name, per-example sequence)
  pairs and an `input_ids` attribute whose length is the number of examples.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    # one tensor per encoding field for the requested example
    return {name: torch.tensor(column[idx]) for name, column in self.encodings.items()}

  def __len__(self):
    return len(self.encodings.input_ids)

In [13]:
def get_dataloaders(train_df, dev_df, BATCH_SIZE):
  """Tokenize both splits and wrap them in DataLoaders.

  Args:
    train_df: dict with 'context', 'question' and 'answer' for training.
    dev_df: same structure, for validation.
    BATCH_SIZE: batch size used by both loaders.

  Returns:
    (train_dataloader, validation_dataloader); only the training loader shuffles.
  """
  # same tokenizer settings for both splits: (context, question) pairs,
  # truncated / padded to 400 tokens
  tokenizer_options = dict(truncation=True,
                           padding="max_length", max_length=400,
                           return_attention_mask=True)

  train_encoding = tokenizer(train_df["context"], train_df["question"], **tokenizer_options)
  dev_encoding = tokenizer(dev_df["context"], dev_df["question"], **tokenizer_options)

  # attach the token-level answer spans to each encoding
  for split, split_encoding in ((train_df, train_encoding), (dev_df, dev_encoding)):
    gold = split["answer"]
    token_starts, token_ends = add_token_positions(split_encoding, gold, gold["answer_start"], gold["answer_end"])
    split_encoding['start_positions'] = token_starts
    split_encoding['end_positions'] = token_ends

  train_dataloader = torch.utils.data.DataLoader(SquadDataset(train_encoding), batch_size=BATCH_SIZE, shuffle=True)
  validation_dataloader = torch.utils.data.DataLoader(SquadDataset(dev_encoding), batch_size=BATCH_SIZE)

  return train_dataloader, validation_dataloader

In [14]:
# Number of examples per batch for both the training and the validation loader.
BATCH_SIZE = 16
train_dataloader, validation_dataloader = get_dataloaders(train_df, dev_df, BATCH_SIZE)

### Configurations

In [15]:
#Define Hyperparameters
learning_rate = 1e-5

#Initialize optimizer
# Adam over all model parameters; weight decay is currently disabled (commented out).
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#, weight_decay=0.001)

# Gradient-clipping threshold. The only reference to it in train_model is commented out,
# so it currently has no effect.
clip = 2

#model

### Training-Validation

In [16]:
torch.cuda.empty_cache()

def train_model(train_dataloader):
    """Train the global `model` for one pass over `train_dataloader`.

    Each batch must contain 'input_ids', 'attention_mask', 'token_type_ids',
    'start_positions' and 'end_positions'. The global `optimizer` is stepped
    once per batch.

    Args:
        train_dataloader: iterable of training batches (must support `len`).

    Returns:
        The mean of the per-batch losses (sum of batch losses divided by the
        number of batches), as a tensor without gradient history.
    """
    batch_losses = []

    # sets the mode to train
    model.train()
    for batch in train_dataloader:  # for every batch

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        optimizer.zero_grad()

        output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)

        # when labels are passed, the loss is the first element of the output
        loss = output[0]
        loss.backward()
        # Keep only the value: storing the attached loss would hold on to
        # autograd history for every batch until the end of the epoch.
        batch_losses.append(loss.detach())
        #nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    return sum(batch_losses)/len(train_dataloader)

def validate_model(validation_dataloader):
    """Evaluate the global `model` on the validation set.

    For every example the most likely start and end token are taken from the
    logits, the span `input_ids[start:end]` is decoded, and the prediction is
    scored against every reference answer in `dev_df['all_answers']`, keeping
    the best Exact Match and F1. For unanswerable examples (reference is
    None) the prediction counts as correct only when start == end == 0.
    Predictions whose start lies beyond the number of real tokens, or after
    the end, score 0.

    The references are looked up by dataset position, so the loader must
    iterate `dev_df` in its original order (i.e. without shuffling).

    Args:
        validation_dataloader: iterable of batches with 'input_ids',
            'attention_mask' and 'token_type_ids'.

    Returns:
        (f1, exact_match), each averaged over the number of examples;
        (0.0, 0.0) when the loader yields no examples.
    """
    exact_match = 0
    f1_score = 0
    num_examples = 0    # examples seen so far == dataset index of the next batch's first example

    # sets the mode to testing
    model.eval()
    with torch.no_grad():
        for batch in validation_dataloader:

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            start_logits, end_logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)

            start_logits = start_logits.detach().cpu()
            end_logits = end_logits.detach().cpu()

            # number of non-padding tokens per example, computed once per batch
            token_counts = attention_mask.sum(dim=1).tolist()

            batch_offset = num_examples
            num_examples += len(start_logits)

            for idx, (start, end) in enumerate(zip(start_logits, end_logits)):
                ans_start = torch.argmax(start).tolist()
                ans_end = torch.argmax(end).tolist()
                num_tokens = token_counts[idx]

                # start beyond the real tokens, or after the end token: no credit
                if ans_start > num_tokens or ans_start > ans_end:
                    continue

                if ans_end < num_tokens:
                    span = input_ids[idx][ans_start:ans_end]
                else:
                    span = input_ids[idx][ans_start:]
                prediction = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span))

                # idx is only the position inside this batch; the references
                # are indexed by position in the whole validation set.
                references = dev_df['all_answers'][batch_offset + idx]

                if references is None:
                    if ans_start == 0 and ans_end == 0:
                        exact_match += 1
                        f1_score += 1
                    continue

                exact_match += max(compute_exact_match(prediction, answer) for answer in references)
                f1_score += max(compute_f1(prediction, answer) for answer in references)

    if num_examples == 0:
        return 0.0, 0.0

    # average per example (not per batch) so both metrics stay in [0, 1]
    return f1_score/num_examples, exact_match/num_examples

In [None]:
# Number of full passes over the training data.
EPOCHS = 1
for epoch in range(EPOCHS):
  # one training pass, then an evaluation on the validation set
  train_loss = train_model(train_dataloader)
  f1_score, exact_match = validate_model(validation_dataloader)
  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation F1 Score = {f1_score:.5f} | Validation Exact Match = {exact_match:.5f} ")

## TriviaQA

In [None]:
# Load the first 10% of each split of the 'rc' configuration of 'trivia_qa'.
# NOTE(review): test_df is not referenced again in the cells visible below.
train_df, dev_df, test_df = load_dataset("trivia_qa", 'rc' , split=['train[:10%]', 'validation[:10%]', 'test[:10%]'])

In [None]:
# convert to dataframes
# (rebinds train_df / dev_df from the loaded splits to pandas DataFrames)
train_df = pd.DataFrame(train_df)
dev_df = pd.DataFrame(dev_df)

In [None]:
# Drop rows without a question, then remove the columns this notebook does not use.
train_df.dropna(subset=["question"], inplace=True)
dev_df.dropna(subset=["question"], inplace=True)

train_df.drop(columns=['question_source', 'question_id', 'entity_pages', 'search_results'], inplace=True)

# dev_df loses the same four columns as train_df. 'question_source' must be
# dropped from dev_df here: dropping it from train_df a second time raises a
# KeyError and leaves the column in dev_df.
dev_df.drop(columns=['question_source', 'question_id', 'entity_pages', 'search_results'], inplace=True)

In [None]:
# get the information we need from data
def get_data_triviaqa(df):
  """Collect questions and answers from TriviaQA rows into parallel lists.

  Every row must provide 'question' and 'answer'.
  Assumes 'answer' is a dict with the canonical answer under 'value' and the
  accepted variants under 'aliases' (Hugging Face `trivia_qa` layout) —
  TODO confirm against the loaded data.

  No context passage is attached here, so character offsets are unknown and
  'answer_start' / 'answer_end' are filled with None to keep all lists aligned.

  Returns:
    dict with keys 'question', 'answer' (a dict with 'text', 'answer_start',
    'answer_end') and 'all_answers' (list of accepted answers per question).
  """
  question = []
  answer = []
  answer_start = []
  answer_end = []
  all_answers = []

  for index, row in df.iterrows():                        # for each row - pair of question-answers
    answer_info = row['answer']
    value = answer_info['value']

    question.append(row['question'])
    answer.append(value)
    # fall back to the canonical answer when no aliases are listed
    all_answers.append(list(answer_info.get('aliases') or [value]))
    answer_start.append(None)
    answer_end.append(None)

  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'question': question, 'answer': answer_dict, 'all_answers': all_answers}

# From here on train_df / dev_df are plain dicts of lists (see get_data_triviaqa), not DataFrames.
train_df = get_data_triviaqa(train_df)
dev_df = get_data_triviaqa(dev_df)

In [None]:
# Inspect one answer entry.
# NOTE(review): after the previous cell train_df['answer'] is a dict keyed by
# 'text' / 'answer_start' / 'answer_end', so the integer index 2 looks like it
# only works while train_df is still a DataFrame — confirm the intended cell order.
train_df['answer'][2]

## QuAC

In [None]:
# Load the 'train' and 'validation' splits of 'quac'.
# The chained assignment also binds the returned pair of splits to `dataset`.
train_df, dev_df = dataset = load_dataset("quac" , split=['train', 'validation'])

In [None]:
# convert to dataframes
# (rebinds train_df / dev_df from the loaded splits to pandas DataFrames)
train_df = pd.DataFrame(train_df)
dev_df = pd.DataFrame(dev_df)

In [None]:
# Drop rows without questions, then remove the metadata columns that are not used below.
train_df.dropna(subset=["questions"], inplace=True)
dev_df.dropna(subset=["questions"], inplace=True)

train_df.drop(
    columns=['dialogue_id', 'wikipedia_page_title', 'background', 'section_title',
             'turn_ids', 'followups', 'yesnos'],
    inplace=True,
)

dev_df.drop(
    columns=['dialogue_id', 'wikipedia_page_title', 'background', 'section_title',
             'turn_ids', 'followups', 'yesnos'],
    inplace=True,
)

In [None]:
# get the information we need from data
def get_data_quac(df):
  """Flatten QuAC dialogues into parallel lists, one entry per question turn.

  Every row must provide 'context' (the passage, a string), 'questions'
  (list of questions) and 'orig_answers'.
  Assumes 'orig_answers' is a dict with the parallel lists 'texts' and
  'answer_starts', one item per question, and that the optional 'answers'
  column holds, under 'texts', the list of reference answers of each turn
  (Hugging Face `quac` layout) — TODO confirm against the loaded data.

  The end offset is start + len(answer) (exclusive). When the context slice
  does not reproduce the answer, the span is retried shifted left by 1 and
  by 2 characters; if neither matches, (0, 0) is stored.

  Returns:
    dict with keys 'context', 'question', 'answer' (a dict with 'text',
    'answer_start', 'answer_end') and 'all_answers' (reference answers per
    turn; falls back to the single original answer when no references exist).
  """
  context = []
  question = []
  answer = []
  answer_start = []
  answer_end = []
  all_answers = []

  for index, row in df.iterrows():                        # for each row - one dialogue
      passage = row['context']
      orig = row['orig_answers']
      references = row['answers']['texts'] if 'answers' in row else None

      for i, (text, start_idx) in enumerate(zip(orig['texts'], orig['answer_starts'])):
        end_idx = start_idx + len(text)
        # compare against the passage itself (a string), not passage[i], which is one character
        if passage[start_idx:end_idx] != text:
          start = 0
          end = 0
          for j in [1, 2]:                                  # maybe the answer is off by one or two chars
            if passage[start_idx-j:end_idx-j] == text:
                start = start_idx - j
                end = end_idx - j
          start_idx, end_idx = start, end
        answer_start.append(start_idx)
        answer_end.append(end_idx)
        answer.append(text)                                 # exactly one answer per turn
        if references is not None and i < len(references):
          all_answers.append(list(references[i]))
        else:
          all_answers.append([text])

      for q in row['questions']:
        question.append(q)
        context.append(passage)

  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'context': context, 'question': question, 'answer': answer_dict, 'all_answers': all_answers}

# From here on train_df / dev_df are plain dicts of lists (see get_data_quac), not DataFrames.
train_df = get_data_quac(train_df)
dev_df = get_data_quac(dev_df)

## NewsQA

In [None]:
# Load the 'train' and 'validation' splits of the "combined-csv" configuration of 'newsqa'.
# NOTE(review): no further NewsQA processing is visible after this cell.
train_df, dev_df = load_dataset("newsqa", "combined-csv" , split=['train', 'validation'])