# Question Answering with BERT
*by Nefeli Tavoulari*

#### In this notebook, I fine-tune a pre-trained BERT model for extractive question answering and evaluate it on SQuAD v2, TriviaQA, QuAC and NewsQA.

## Install Dependencies

In [None]:
%%capture
!pip install transformers
!pip install datasets

## Import Packages

In [None]:
%matplotlib inline
import io
import re
import csv
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import files, drive

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split

import transformers
from transformers import AutoTokenizer, BertModel, BertForSequenceClassification, BertForQuestionAnswering
from datasets import load_dataset
import logging

# Seed every RNG the notebook relies on so runs are repeatable:
# NumPy (used by scikit-learn's train_test_split when no random_state is given)
# and PyTorch on CPU and GPU.
SEED = 1234
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Notebook-level logging at INFO; silence transformers' own warnings.
logging.basicConfig(level=logging.INFO)
transformers.logging.set_verbosity_error()

## Use GPU for faster processing

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Available device:", device)

Available device: cuda


## Load BERT tokenizer and model

In [None]:
# Load the tokenizer and the question-answering model from the same
# 'bert-base-uncased' checkpoint and move the model to the selected device.
# This single `model` instance is shared by train()/validate() below, so every
# dataset section that calls train() keeps updating the same weights.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased').to(device)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

## Useful functions for all datasets

In [None]:
def normalize_text(s):
    """Normalize an answer string for SQuAD-style exact-match / F1 scoring.

    Steps, in order: drop literal ``[CLS]`` / ``[SEP]`` markers, lowercase,
    strip ASCII punctuation, remove the articles a/an/the, collapse whitespace.

    Args:
        s: the raw prediction or ground-truth string.

    Returns:
        The normalized string (possibly empty).
    """

    def remove_special_tokens(text):
        # Must run before lowercasing/punctuation stripping: afterwards the
        # markers would survive as the ordinary words "cls" / "sep".
        # (Written as "[CLS]" inside the article regex they were character
        # classes, not literals, and never matched anything useful.)
        return re.sub(r"\[(?:CLS|SEP)\]", " ", text)

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(remove_special_tokens(s)))))
    
def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after ``normalize_text``, else 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a ground-truth answer.

    Both strings are normalized with ``normalize_text`` and split on whitespace.
    The overlap is counted as a multiset intersection, so a token that occurs
    twice in both answers contributes twice (a plain set intersection would
    under-count repeated tokens).

    Args:
        prediction: predicted answer text.
        truth: reference answer text.

    Returns:
        1 or 0 when either side is empty after normalization (1 only if both
        are empty), 0 when no tokens overlap, otherwise the F1 as a float.
    """
    from collections import Counter  # local import keeps this cell self-contained

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

In [None]:
# get answers' token positions using answers' character positions
def add_token_positions(encoding, answer, answer_start, answer_end, max_length=None):
    """Convert character-level answer spans into token-level start/end indices.

    Args:
        encoding: tokenizer output exposing ``char_to_token(batch_index, char_index)``.
        answer: dict of parallel lists ``"text"``, ``"answer_start"`` and
            ``"answer_end"``; ``answer_end`` is exclusive (start + length).
            ``None`` offsets mark an example without an answer span.
        answer_start: unused; kept so existing call sites keep working.
        answer_end: unused; kept so existing call sites keep working.
        max_length: index stored when a span cannot be mapped to a token
            (e.g. the answer was truncated away). Defaults to the notebook
            tokenizer's ``model_max_length``.

    Returns:
        ``(start, end)``: two lists of token indices, one entry per example.
        Examples without an answer get ``(0, 0)``.
    """
    unmapped = tokenizer.model_max_length if max_length is None else max_length

    start = []
    end = []
    for i in range(len(answer["text"])):
        start_char = answer["answer_start"][i]
        end_char = answer["answer_end"][i]
        if start_char is None or end_char is None:
            start.append(0)
            end.append(0)
            continue

        start_token = encoding.char_to_token(i, start_char)

        # answer_end is exclusive, so the last answer character sits at end - 1.
        # Looking up `end` itself can land on the token that follows the answer.
        # max() keeps degenerate (0, 0) spans from producing a negative offset.
        last_char = max(end_char - 1, start_char)
        end_token = encoding.char_to_token(i, last_char)
        if end_token is None and last_char - 1 >= start_char:
            # e.g. the last character is whitespace: step back once more
            end_token = encoding.char_to_token(i, last_char - 1)

        start.append(unmapped if start_token is None else start_token)
        end.append(unmapped if end_token is None else end_token)
    return start, end

In [None]:
# create dataset class to gather and organize all info
class QADataset(torch.utils.data.Dataset):
  """Torch dataset over a tokenizer encoding (a mapping of column name -> list of rows).

  Works with any mapping that has an ``"input_ids"`` entry and supports
  ``.items()``, e.g. a tokenizer output or a plain dict.
  """

  def __init__(self, encodings):
    self.encodings = encodings

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of tensors, one per encoding column."""
    return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

  def __len__(self):
    """Number of examples, taken from the ``input_ids`` column."""
    return len(self.encodings["input_ids"])

In [None]:
def get_dataloaders(train_df, dev_df, BATCH_SIZE, max_length=400):
  """Tokenize the train/dev splits and wrap them in DataLoaders.

  Args:
    train_df: dict with ``"context"``, ``"question"`` and ``"answer"`` entries
      (``"answer"`` holds the ``text`` / ``answer_start`` / ``answer_end`` lists).
    dev_df: dict with at least ``"context"`` and ``"question"``.
    BATCH_SIZE: batch size for both loaders.
    max_length: length every (context, question) pair is padded/truncated to.

  Returns:
    ``(train_dataloader, validation_dataloader, encoding_dev)``. Only the
    training encoding receives ``start_positions`` / ``end_positions``; the
    training loader is shuffled, the validation loader is not.
  """
  def encode(split):
    # context first, question second: answer offsets refer to the first sequence
    return tokenizer(split["context"], split["question"],
                     truncation = True,
                     padding = "max_length", max_length = max_length,
                     return_attention_mask = True)

  # training data
  encoding = encode(train_df)

  # validation data
  encoding_dev = encode(dev_df)

  train_answers = train_df["answer"]
  encoding['start_positions'], encoding['end_positions'] = add_token_positions(
      encoding, train_answers, train_answers["answer_start"], train_answers["answer_end"])

  train_dataset = QADataset(encoding)
  validation_dataset = QADataset(encoding_dev)

  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE)
  return train_dataloader, validation_dataloader, encoding_dev

## Configurations for all datasets

In [None]:
#Define Hyperparameters
# Learning rate passed to the Adam optimizer below.
learning_rate = 1e-5

#Initialize optimizer
# A single optimizer over all model parameters; it is reused by every call to train().
optimizer = optim.Adam(model.parameters(), lr=learning_rate)#, weight_decay=0.001)

# Max gradient norm; currently only referenced by the commented-out
# clip_grad_norm_ call inside train().
clip = 2

#model

## Training and Validation

In [None]:
# Ask PyTorch to release its cached GPU memory before training starts.
torch.cuda.empty_cache()

def train(train_dataloader):
    """Run one training epoch and return the mean batch loss.

    Uses the notebook-level ``model``, ``optimizer`` and ``device``.

    Args:
        train_dataloader: iterable of batches, each a dict with ``input_ids``,
            ``attention_mask``, ``token_type_ids``, ``start_positions`` and
            ``end_positions`` tensors.

    Returns:
        float: mean of the per-batch losses, or 0.0 when there are no batches.
    """
    batch_losses = []

    # sets the mode to train
    model.train()
    for batch in train_dataloader:  # for every batch
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        optimizer.zero_grad()

        output = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, start_positions=start_positions, end_positions=end_positions)

        # the loss is the first element of the model output when positions are given
        loss = output[0]
        loss.backward()
        # Keep a plain float: storing the tensor would keep one device tensor
        # alive per batch for the whole epoch.
        batch_losses.append(loss.item())
        # gradient clipping is disabled:
        #nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    if not batch_losses:
        return 0.0
    return sum(batch_losses) / len(batch_losses)

def validate(validation_dataloader, mode="dev"):
    """Return the mean F1 of the model's predicted answer spans.

    Uses the notebook-level ``model``, ``device``, ``tokenizer`` and
    ``compute_f1``. Ground truth is read from the notebook-level ``dev_df``
    when ``mode == "dev"`` and from ``test_df`` otherwise.

    The i-th example produced by the dataloader is scored against the i-th
    ground-truth entry, so the dataloader must not shuffle.

    Args:
        validation_dataloader: iterable of batches with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` tensors.
        mode: ``"dev"`` or anything else for the test split.

    Returns:
        float: summed F1 divided by the number of examples seen
        (0.0 when the dataloader is empty).
    """
    data = dev_df if mode == "dev" else test_df
    truths = data['answer']['text']
    has_alternatives = "all_answers" in data

    f1_total = 0.0
    num_examples = 0  # also the dataset index of the next example

    # sets the mode to testing
    model.eval()
    with torch.no_grad():
        for batch in validation_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)

            start_logits, end_logits = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, return_dict=False)

            pred_starts = torch.argmax(start_logits, dim=1).tolist()
            pred_ends = torch.argmax(end_logits, dim=1).tolist()
            seq_lengths = attention_mask.sum(dim=1).tolist()
            batch_ids = batch['input_ids'].tolist()

            for row, (ans_start, ans_end) in enumerate(zip(pred_starts, pred_ends)):
                # dataset-level index; the position inside the batch is `row`
                idx = num_examples
                num_examples += 1
                truth = truths[idx]

                # when no answer: correct only if the model points at position 0
                if truth is None:
                    if ans_start == 0 and ans_end == 0:
                        f1_total += 1.0
                    continue

                # start inside the padding, or start after end: counts as F1 = 0
                if ans_start >= seq_lengths[row] or ans_start > ans_end:
                    continue

                # rebuild the answer text from wordpieces, stopping at [SEP]
                row_tokens = tokenizer.convert_ids_to_tokens(batch_ids[row])
                prediction = row_tokens[ans_start]
                for tok in row_tokens[ans_start + 1:ans_end + 1]:
                    if tok == '[SEP]':
                        break
                    if tok[0:2] == '##':
                        prediction += tok[2:]
                    else:
                        prediction += ' ' + tok

                if has_alternatives:
                    f1_total += max(compute_f1(prediction, answer) for answer in data['all_answers'][idx])
                else:
                    f1_total += compute_f1(prediction, truth)

    if num_examples == 0:
        return 0.0
    return f1_total / num_examples

## SQuAD

### Load dataset - Create and clean dataframes

In [None]:
# Load the first 60% of the SQuAD v2 train and validation splits.
train_df, dev_df = load_dataset('squad_v2', split=['train[:60%]', 'validation[:60%]'])

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# turn both SQuAD splits into pandas DataFrames
train_df, dev_df = pd.DataFrame(train_df), pd.DataFrame(dev_df)

In [None]:
# drop rows without a question, then the columns the notebook never uses
for squad_frame in (train_df, dev_df):
  squad_frame.dropna(subset = ["question"], inplace=True)

for squad_frame in (train_df, dev_df):
  for unused_column in ('id', 'title'):
    squad_frame.drop([unused_column], axis = 1, inplace = True)

In [None]:
def get_squad(df):
  """Flatten SQuAD rows into parallel context / question / answer lists.

  Every annotated answer of a row becomes its own example (context and
  question are repeated). Rows without answers yield one example whose
  answer fields are all ``None``.

  Args:
    df: DataFrame with ``context``, ``question`` and ``answers`` columns,
      where ``answers`` holds parallel ``text`` and ``answer_start`` sequences.

  Returns:
    dict with ``context``, ``question``, ``all_answers`` lists and an
    ``answer`` dict of parallel ``text`` / ``answer_start`` / ``answer_end``
    lists (``answer_end`` is exclusive).
  """
  def locate(ctx, ans, start_idx):
    """Return (start, end) of ``ans`` at ``start_idx``, allowing the annotation
    to be 1 or 2 characters too far right; (0, 0) when it cannot be aligned."""
    for shift in (0, 1, 2):
      begin = start_idx - shift
      if begin < 0:            # a negative index would wrap to the end of ctx
        break
      if ctx[begin:begin + len(ans)] == ans:
        return begin, begin + len(ans)
    return 0, 0

  context = []
  question = []
  answer = []
  answer_start = []
  answer_end = []
  all_answers = []

  for _, row in df.iterrows():                              # for each instance
    texts = row['answers']['text']
    if len(texts) > 0:                                      # if there is an answer (works for lists and arrays)
      for ans, start_idx in zip(texts, row['answers']['answer_start']):
        start, end = locate(row['context'], ans, start_idx)
        answer_start.append(start)
        answer_end.append(end)
        context.append(row['context'])
        question.append(row['question'])
        answer.append(ans)
        all_answers.append(texts)                           # save all plausible answers
    else:                                                   # no answer -> set to None every column regarding the answers
      context.append(row['context'])
      question.append(row['question'])
      answer.append(None)
      all_answers.append(None)
      answer_start.append(None)
      answer_end.append(None)
  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'context': context, 'question': question, 'answer': answer_dict, 'all_answers': all_answers}
# replace both DataFrames with their flattened dict form
train_df, dev_df = get_squad(train_df), get_squad(dev_df)

In [None]:
# Build the SQuAD loaders; encoding_dev is kept so the validation tokens can be rebuilt in the next cell.
BATCH_SIZE = 16
train_dataloader_squad, validation_dataloader_squad, encoding_dev = get_dataloaders(train_df, dev_df, BATCH_SIZE)

In [None]:
# validation example index -> list of wordpiece tokens for that example
tokens = {
  example_idx: tokenizer.convert_ids_to_tokens(example_ids)
  for example_idx, example_ids in enumerate(encoding_dev['input_ids'])
}

### Training-Validation

In [None]:
# Fine-tune on SQuAD, then score the dev split after each epoch.
# validate() reads notebook-level globals (e.g. dev_df), so they must still
# hold the SQuAD data when this cell runs.
EPOCHS = 1
for epoch in range(EPOCHS):
  train_loss = train(train_dataloader_squad)
  f1_score = validate(validation_dataloader_squad)
  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation F1 Score = {f1_score:.5f} ")

## TriviaQA

### Load dataset - Create and clean dataframes

In [None]:
# Load the first 1% of the train, validation and test splits of the TriviaQA 'rc' configuration.
train_df, dev_df, test_df = load_dataset("trivia_qa", 'rc' , split=['train[:1%]', 'validation[:1%]', 'test[:1%]'])

In [None]:
# turn the three TriviaQA splits into pandas DataFrames
train_df, dev_df, test_df = pd.DataFrame(train_df), pd.DataFrame(dev_df), pd.DataFrame(test_df)

In [None]:
# drop rows without a question, then the columns the notebook never uses
for trivia_frame in (train_df, dev_df, test_df):
  trivia_frame.dropna(subset = ["question"], inplace=True)

for trivia_frame in (train_df, dev_df, test_df):
  for unused_column in ('question_source', 'question_id', 'search_results'):
    trivia_frame.drop([unused_column], axis = 1, inplace = True)

In [None]:
def get_triviaqa(df, context_chars=70):
  """Flatten TriviaQA rows into parallel context / question / answer lists.

  The context of a row is the first ``context_chars`` characters of its first
  wiki page ("" when the row has no wiki page). Each answer alias becomes its
  own example; when a context exists, aliases that do not occur inside that
  snippet are skipped. Rows without aliases yield one example whose answer
  fields are all ``None``.

  All six output lists always have the same length, so index ``i`` refers to
  the same example everywhere.

  Args:
    df: DataFrame with ``question``, ``answer`` (holding ``aliases``) and
      ``entity_pages`` (holding ``wiki_context``) columns.
    context_chars: number of leading characters of the wiki page to keep.

  Returns:
    dict with ``context``, ``question``, ``all_answers`` lists and an
    ``answer`` dict of parallel ``text`` / ``answer_start`` / ``answer_end``
    lists (``answer_end`` is exclusive).
  """
  context = []
  question = []
  answer = []
  all_answers = []
  answer_start = []
  answer_end = []

  for _, row in df.iterrows():                              # for each instance
    aliases = row['answer']['aliases']
    pages = row['entity_pages']['wiki_context']
    has_context = len(pages) > 0                            # works for lists and arrays
    snippet = pages[0][:context_chars] if has_context else ""

    if len(aliases) > 0:                                    # if there is an answer
      for ans in aliases:                                   # for each answer
        if has_context:                                     # if there is a context
          start_idx = snippet.find(ans)
          if start_idx == -1:                               # answer not in context -> skip this alias
            continue
          # find() already returns the exact offset, no alignment fix-up needed
          answer_start.append(start_idx)
          answer_end.append(start_idx + len(ans))
        else:
          answer_start.append(None)
          answer_end.append(None)
        context.append(snippet)
        question.append(row['question'])
        answer.append(ans)
        all_answers.append(aliases)                         # save all plausible answers
    else:                                                   # no answer -> set to None every column regarding the answers
      context.append(snippet)
      question.append(row['question'])
      answer.append(None)
      all_answers.append(None)
      # keep the offset lists aligned with the text list
      answer_start.append(None)
      answer_end.append(None)
  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'context': context, 'question': question, 'answer': answer_dict, 'all_answers': all_answers}

# replace the three DataFrames with their flattened dict form
train_df, dev_df, test_df = get_triviaqa(train_df), get_triviaqa(dev_df), get_triviaqa(test_df)

In [None]:
def get_dataloaders_triviaqa(train_df, dev_df, test_df, BATCH_SIZE, max_length=100):
  """Tokenize the train/dev/test splits and wrap them in DataLoaders.

  Args:
    train_df: dict with ``"context"``, ``"question"`` and ``"answer"`` entries
      (``"answer"`` holds the ``text`` / ``answer_start`` / ``answer_end`` lists).
    dev_df: dict with at least ``"context"`` and ``"question"``.
    test_df: dict with at least ``"context"`` and ``"question"``.
    BATCH_SIZE: batch size for all three loaders.
    max_length: length every (context, question) pair is padded/truncated to.

  Returns:
    ``(train_dataloader, validation_dataloader, test_dataloader,
    encoding_dev, encoding_test)``. Only the training encoding receives
    ``start_positions`` / ``end_positions``; only the training loader is shuffled.
  """
  def encode(split):
    # context first, question second: answer offsets refer to the first sequence
    return tokenizer(split["context"], split["question"],
                     truncation = True,
                     padding = "max_length", max_length = max_length,
                     return_attention_mask = True)

  # training data
  encoding = encode(train_df)

  # validation data
  encoding_dev = encode(dev_df)

  # test data
  encoding_test = encode(test_df)

  train_answers = train_df["answer"]
  encoding['start_positions'], encoding['end_positions'] = add_token_positions(
      encoding, train_answers, train_answers["answer_start"], train_answers["answer_end"])

  train_dataset = QADataset(encoding)
  validation_dataset = QADataset(encoding_dev)
  test_dataset = QADataset(encoding_test)

  train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=BATCH_SIZE)
  test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE)

  return train_dataloader, validation_dataloader, test_dataloader, encoding_dev, encoding_test

In [None]:
# Build the TriviaQA loaders; the dev/test encodings are kept so their tokens can be rebuilt in the next cell.
BATCH_SIZE = 16
train_dataloader_triviaqa, validation_dataloader_triviaqa, test_dataloader_triviaqa, encoding_dev, encoding_test = get_dataloaders_triviaqa(train_df, dev_df, test_df, BATCH_SIZE)

In [None]:
# example index -> list of wordpiece tokens, for the dev and the test split
tokens_dev = {
  example_idx: tokenizer.convert_ids_to_tokens(example_ids)
  for example_idx, example_ids in enumerate(encoding_dev['input_ids'])
}

tokens_test = {
  example_idx: tokenizer.convert_ids_to_tokens(example_ids)
  for example_idx, example_ids in enumerate(encoding_test['input_ids'])
}

### Training-Validation

In [None]:
# Point the notebook-level `tokens` at the TriviaQA dev tokens, so any code
# that looks tokens up through that name does not see the previous section's data.
tokens = tokens_dev

# Fine-tune on TriviaQA, then score the dev split after each epoch.
EPOCHS = 1
for epoch in range(EPOCHS):
  train_loss = train(train_dataloader_triviaqa)
  f1_score = validate(validation_dataloader_triviaqa)
  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation F1 Score = {f1_score:.5f} ")

In [None]:
# Point the notebook-level `tokens` at the TriviaQA test tokens before scoring
# the test split, so token lookups through that name match the test examples.
tokens = tokens_test
f1_score = validate(test_dataloader_triviaqa, "test")
print(f"Test F1 Score = {f1_score:.5f} ")

## QuAC

### Load dataset - Create and clean dataframes

In [None]:
# Load the full QuAC train and validation splits.
# NOTE(review): the chained assignment also binds `dataset` to the returned pair;
# nothing else in this notebook reads `dataset`, so it looks accidental.
train_df, dev_df = dataset = load_dataset("quac" , split=['train', 'validation'])

Downloading:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.73k [00:00<?, ?B/s]

Downloading and preparing dataset quac/plain_text (download: 73.47 MiB, generated: 62.51 MiB, post-processed: Unknown size, total: 135.99 MiB) to /root/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/68.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.93M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset quac downloaded and prepared to /root/.cache/huggingface/datasets/quac/plain_text/1.1.0/4170258e7e72d7c81bd6441b3f3489ea1544f0ff226ce61e22bb00c6e9d01fb6. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# turn both QuAC splits into pandas DataFrames
train_df, dev_df = pd.DataFrame(train_df), pd.DataFrame(dev_df)

In [None]:
# drop rows without questions, then the columns the notebook never uses
for quac_frame in (train_df, dev_df):
  quac_frame.dropna(subset = ["questions"], inplace=True)

for quac_frame in (train_df, dev_df):
  for unused_column in ('dialogue_id', 'wikipedia_page_title', 'background',
                        'section_title', 'turn_ids', 'followups', 'yesnos'):
    quac_frame.drop([unused_column], axis = 1, inplace = True)

In [None]:
def get_quac(df):
  """Flatten QuAC dialogues into parallel context / question / answer lists.

  Each row holds one context and several questions; every question becomes
  its own example with the row's context repeated. The first annotated
  answer of a question is used as the training target, all of its answers
  are kept in ``all_answers``.

  Args:
    df: DataFrame with ``context``, ``questions`` and ``answers`` columns.
      ``answers`` is a dict whose values are zipped per question; the first
      value is read as the answer texts and the second as their start
      offsets (assumes the dict is ordered texts, then starts — TODO confirm
      against the dataset schema).

  Returns:
    dict with ``context``, ``question``, ``all_answers`` lists and an
    ``answer`` dict of parallel ``text`` / ``answer_start`` / ``answer_end``
    lists (``answer_end`` is exclusive).
  """
  def locate(ctx, ans, start_idx):
    """Return (start, end) of ``ans`` at ``start_idx``, allowing the annotation
    to be 1 or 2 characters too far right; (0, 0) when it cannot be aligned."""
    for shift in (0, 1, 2):
      begin = start_idx - shift
      if begin < 0:            # a negative index would wrap to the end of ctx
        break
      if ctx[begin:begin + len(ans)] == ans:
        return begin, begin + len(ans)
    return 0, 0

  context = []
  question = []
  answer = []
  answer_start = []
  answer_end = []
  all_answers = []

  for _, row in df.iterrows():                              # for each instance
    question.extend(row['questions'])
    context.extend(row['context'] for _ in row['questions'])  # one context copy per question

    for per_question in zip(*row['answers'].values()):      # for each question's answers
      texts, starts = per_question[0], per_question[1]
      first_answer = texts[0]
      answer.append(first_answer)
      all_answers.append(texts)                             # save all plausible answers
      start, end = locate(row['context'], first_answer, starts[0])
      answer_start.append(start)
      answer_end.append(end)

  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'context': context, 'question': question, 'answer': answer_dict, 'all_answers': all_answers}

# replace both DataFrames with their flattened dict form
train_df, dev_df = get_quac(train_df), get_quac(dev_df)

In [None]:
# Build the QuAC loaders; encoding_dev is kept so the validation tokens can be rebuilt in the next cell.
BATCH_SIZE = 16
train_dataloader_quac, validation_dataloader_quac, encoding_dev = get_dataloaders(train_df, dev_df, BATCH_SIZE)

In [None]:
# validation example index -> list of wordpiece tokens for that example
tokens = {
  example_idx: tokenizer.convert_ids_to_tokens(example_ids)
  for example_idx, example_ids in enumerate(encoding_dev['input_ids'])
}

### Training-Validation

In [None]:
# Fine-tune on QuAC, then score the dev split after each epoch.
# validate() reads notebook-level globals (e.g. dev_df), so they must still
# hold the QuAC data when this cell runs.
EPOCHS = 1
for epoch in range(EPOCHS):
  train_loss = train(train_dataloader_quac)
  f1_score = validate(validation_dataloader_quac)
  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation F1 Score = {f1_score:.5f} ")

## NewsQA

### Load dataset - Create and clean dataframes

In [None]:
# followed the instructions from https://github.com/Maluuba/newsqa
# Mount Google Drive (Colab) so the combined NewsQA CSV can be read from it below.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Read the combined NewsQA CSV from the mounted Drive. The path is absolute
# (matching the mount point above) so the cell does not depend on the
# current working directory.
df = pd.read_csv('/content/gdrive/MyDrive/combined-newsqa-data-v1.csv')
df = df.iloc[0:10000,:] # get subset of dataset

In [None]:
# 80/20 train/dev split; seeded with the notebook SEED so the partition is the same on every run.
train_df, dev_df = train_test_split(df, test_size=0.2, random_state=SEED)

In [None]:
# remove empty instances / extra columns
# New frames are built instead of mutating in place: train_df / dev_df come out
# of train_test_split, and in-place edits on such frames can trigger pandas'
# SettingWithCopyWarning.
newsqa_unused_columns = ['story_id', 'is_answer_absent', 'is_question_bad', 'validated_answers']

train_df = train_df.dropna(subset = ["question"]).drop(newsqa_unused_columns, axis = 1)
dev_df = dev_df.dropna(subset = ["question"]).drop(newsqa_unused_columns, axis = 1)

In [None]:
def get_newsqa(df):
  """Turn NewsQA rows into parallel context / question / answer lists.

  ``answer_char_ranges`` is read as ``|``-separated entries, each either the
  string ``None`` or one or more comma-separated ``start:end`` ranges. Only
  the first range of the first entry is used. Rows whose first entry is
  ``None``, or whose ``answer_char_ranges`` is not a string (e.g. a missing
  CSV cell), are treated as having no answer.

  Args:
    df: DataFrame with ``story_text``, ``question`` and ``answer_char_ranges`` columns.

  Returns:
    dict with ``context`` and ``question`` lists and an ``answer`` dict of
    parallel ``text`` / ``answer_start`` / ``answer_end`` lists. There is no
    ``all_answers`` entry.
  """
  context = []
  question = []
  answer = []
  answer_start = []
  answer_end = []

  for _, row in df.iterrows():                              # for each instance
    story = row["story_text"]
    context.append(story)
    question.append(row['question'])

    ranges = row['answer_char_ranges']
    # a missing cell is read by pandas as a float NaN, not a string
    first_entry = ranges.split('|')[0] if isinstance(ranges, str) else 'None'

    if first_entry == 'None':
      answer_start.append(None)
      answer_end.append(None)
      answer.append(None)
    else:
      start_text, end_text = first_entry.split(',')[0].split(':')
      start, end = int(start_text), int(end_text)
      answer_start.append(start)
      answer_end.append(end)
      answer.append(story[start:end])

  answer_dict = {'text': answer, 'answer_start': answer_start, 'answer_end': answer_end}
  return {'context': context, 'question': question, 'answer': answer_dict}

# replace both DataFrames with their flattened dict form
train_df, dev_df = get_newsqa(train_df), get_newsqa(dev_df)

In [None]:
# Build the NewsQA loaders; encoding_dev is kept so the validation tokens can be rebuilt in the next cell.
BATCH_SIZE = 16
train_dataloader_newsqa, validation_dataloader_newsqa, encoding_dev = get_dataloaders(train_df, dev_df, BATCH_SIZE)

In [None]:
# validation example index -> list of wordpiece tokens for that example
tokens = {
  example_idx: tokenizer.convert_ids_to_tokens(example_ids)
  for example_idx, example_ids in enumerate(encoding_dev['input_ids'])
}

### Training-Validation

In [None]:
# Fine-tune on NewsQA, then score the dev split after each epoch.
# The training call had been commented out, so the printed "Train Loss" was
# whatever value the previous section left behind (or a NameError when this
# section is run on its own).
EPOCHS = 1
for epoch in range(EPOCHS):
  train_loss = train(train_dataloader_newsqa)
  f1_score = validate(validation_dataloader_newsqa)
  print(f"Epoch {epoch:3}: | Train Loss = {train_loss:.5f} | Validation F1 Score = {f1_score:.5f} ")