In [22]:
# Imports here
import json
import numpy as np
import torch
import zipfile
import os
!pip install transformers
from transformers import BertForQuestionAnswering
from transformers import AutoTokenizer
from transformers import BertTokenizerFast
from transformers import DistilBertForQuestionAnswering
from transformers import DistilBertTokenizerFast
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm



from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Open the json training-data and store them in arrays

In [None]:
# Store the paths to the training file and the wikipedia paragraphs (evidence)
path = r'./gdrive/MyDrive/Information_Retrieval_Project'
wikiTrain = path + '/qa/wikipedia-train.json'
evidencePath = path + '/evaluation/wikipedia.zip'
verifiedPath = path + '/qa/verified-wikipedia-dev.json'

In [None]:
# Open the training file
with open(wikiTrain, 'r', encoding='utf-8') as jsonFile:
    data = json.load(jsonFile)

In [None]:
# Test if the file opens
answer = data["Data"][0]["Answer"]
question = data["Data"][0]["Question"]
answerFileName = data["Data"][0]["EntityPages"][1]["Filename"]
answerFileName

'Judi_Dench.txt'

In [None]:
archive = zipfile.ZipFile(evidencePath, 'r')
dataArray = np.array(data["Data"])

questionArray = np.zeros(len(dataArray), dtype='object')
answerArray = np.zeros(len(dataArray), dtype='object')
answerFileArray = np.zeros(len(dataArray), dtype='object')

i = 0
# Read the Question, Answers + Paragraphs and store them in an array each
for item in dataArray:
    questionArray[i] = item["Question"]
    answerArray[i] = item["Answer"]["Value"]
    filePath = item["EntityPages"][0]["Filename"]
    
    # Read the Paragraph (if the File exists)
    try:        
      paragraph = archive.read('wikipedia/' + filePath).decode("utf-8")
      answerFileArray[i] = paragraph
    except KeyError:
      answerFileArray[i] = "NAN"   
    i += 1       

**3 Methods to:**
1. get a pre-trained model
2. answer a question given a model (from 1.) + tokenizer (from 1.) + question + paragraph which contains the answer
3. use 2. but can take multiple paragraphs (paragraphArray) per question as parameter

Test the pre-trained model

In [None]:
# get a pre-trained model
def getModel():
    #Model
    model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    #Tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

    return model,tokenizer

In [None]:
# Get the pretained model + tokenizer
model, tokenizer = getModel()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
def questionAnswering(model, tokenizer, question, paragraph, numOverflow = 3, returnStartEndValues = False):
    encoding = tokenizer.encode_plus(question, paragraph, max_length=512, truncation='only_second', return_overflowing_tokens=True, add_special_tokens=True)

    # get the type of bert model to be able to compare later
    bertModel = type(BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad'))

    try:
        # gives a TypeError when we only have one token (no overflow)
        # The only time this might be a problem is when we would have only one single token and
        # only one value inside this token, which is impossible because of the words that Bert adds
        # while tokenizing the question and the paragraph
        lenEntries = len(encoding['input_ids'][0])

        # OverflowTokens Exist -> Change the length
        numberTokens = len(encoding['input_ids'])
    except TypeError:
        # Set the tokens to one to use the foor loop only once
        numberTokens = 1

    # Limit number of overflowTokens to reduce computing time
    if numberTokens > numOverflow:
        numberTokens = numOverflow

    # empty Arrays to store overflow scoring values
    startValues = np.empty(0)
    endValues = np.empty(0)

    # go through every token, calculate the appropriate scores and append them to start/endValues
    # Bert is limited to one token with 512 entries at a time
    for i in range(numberTokens):
        if numberTokens != 1:
            tokens = encoding['input_ids'][i]
            sentence_embedding = encoding['token_type_ids'][i]
            attention_mask = encoding['attention_mask'][i]
        else:
            tokens = encoding['input_ids'][0]
            sentence_embedding = encoding['token_type_ids'][0]
            attention_mask = encoding['attention_mask'][0]

        if isinstance(model, bertModel):
          output = model(input_ids=torch.tensor([tokens]), attention_mask=torch.tensor([attention_mask]), token_type_ids=torch.tensor([sentence_embedding]))
          startValues = np.append(startValues, output.start_logits.detach().numpy()[0])
          endValues = np.append(endValues, output.end_logits.detach().numpy()[0])          
        else:
          output = model(input_ids=torch.tensor([tokens]), attention_mask=torch.tensor([attention_mask]))
          startValues = np.append(startValues, output[0].detach().numpy()[0])
          endValues = np.append(endValues, output[1].detach().numpy()[0])

    # find the absolute position of start and end index
    start_index = np.argmax(startValues)
    startValue = np.amax(startValues)
    end_index = np.argmax(endValues)
    endValue = np.amax(endValues)
    

    # if overflow tokens exist:
    if numberTokens != 1:
        # Find the correct start and end tokens (only the tokens not the position)
        overflowNrStart = int(start_index / 512)
        overflowNrEnd = int(end_index / 512)

        # Find the correct position in the token
        answerStart = start_index % 512
        answerEnd = end_index % 512

        # get the answer from the corresponding tokens and format for better reading
        answer = ' '.join(tokenizer.convert_ids_to_tokens(encoding['input_ids'][overflowNrStart][answerStart:answerEnd + 1]))
        answer = answer.replace(' ##', '')
        if returnStartEndValues == True:
          return (answer, startValue, endValue)
        else:    
          return (answer)
    else:
        # get the answer from the corresponding tokens and format for better reading
        answer = ' '.join(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0][start_index:end_index + 1]))
        answer = answer.replace(' ##', '')
        if returnStartEndValues == True:
          return (answer, startValue, endValue)
        else:    
          return (answer)

In [None]:
question = "What are machine learning models based on?"
paragraph= """
Machine learning (ML) is the study of computer algorithms that improve automatically through experience. 
It is seen as a part of artificial intelligence.
Machine learning algorithms build a model based on sample data,  as "training data", 
in order to make predictions or decisions without being explicitly programmed to do so. 
Machine learning algorithms are used in a wide variety of applications, 
such as email filtering and computer vision, 
where it is difficult or unfeasible to develop conven algorithms to perform the needed tasks.
"""
questionAnswering(model, tokenizer, question, paragraph)

'sample data'

Create a function that can work with multiple paragraphs (files) per question

In [None]:
# give multiple paragraphs for the same question and return the answer with the highest score
def multipleQuestionAnswering(model, tokenizer, question, paragraphArray, numOverflow = 3, returnStartEndValues = False):
  # check if paragraphArray is in fact an array or just a single paragraph
  # if only a single paragraph: change nothing from normal questionAnswering
  if isinstance(paragraphArray, str):
      return (questionAnswering(model, tokenizer, question, paragraphArray, numOverflow, returnStartEndValues))       
  
  paragraphs = len(paragraphArray)
  answerArray = np.zeros(paragraphs, dtype="object")
  startValuesArray = np.zeros(paragraphs)
  endValuesArray = np.zeros(paragraphs)
  i = 0
  # answer the same question for each of the paragraphs
  for paragraph in paragraphArray:
    # check if another paragraph exists. 0 = no more paragraphs
    if paragraph == 0:
      break
    answer, startValue, endValue = questionAnswering(model, tokenizer, question, paragraph, numOverflow, returnStartEndValues=True)
    answerArray[i] = answer
    startValuesArray[i] = startValue
    endValuesArray[i] = endValue
    i += 1

  #print(startValuesArray)
  #print(endValuesArray)
  # it doesn't matter if searching in the start- or endvalues array
  # return the answer with the highest score
  if returnStartEndValues == True:
    return (answerArray[np.argmax(startValuesArray)], np.amax(startValuesArray), np.amax(endValuesArray))
  else:    
    return (answerArray[np.argmax(startValuesArray)])          

Store multiple files per question in a separate array

In [None]:
# Change numberFileArray to be able to multiple paragraphs per question/answer
# Limit number of paragraphs to 5 per question/answer
maxParagraphs = 5
# create first entry in multipleAnswerFileArray 
# each entry has to be the same length (maxParagraphs) in order to 
# store a 1D-Array in a 2D-Array
multipleAnswerFileArray = np.zeros((len(dataArray) , maxParagraphs), dtype='object')

j = 0
for item in dataArray:
  currentParagraphs = np.zeros(maxParagraphs, dtype='object')
  i = 0
  for file in (item["EntityPages"]):
    # Read the Paragraph (if the File exists) and save in array
    try:        
      multipleAnswerFileArray[j][i] = archive.read('wikipedia/' + file["Filename"]).decode("utf-8")
    except KeyError:
      multipleAnswerFileArray[j][i] = "NAN"  


    i += 1
    # if maximum no of paragraphs is reached:
    # do not continue inner for loop
    if i == maxParagraphs:
      break
  # stack the array with the current paragraphs with all others      
  j += 1

#show the seconds paragraph (textfile) of the first question
multipleAnswerFileArray[0][1][:100]

'Dame Judith Olivia "Judi" Dench,  (born 9 December 1934)  is an English actress and author.  Dench m'

Fine-tune a pretrained Bert model (DistilBert)

In [None]:
# Train own model

# Create another array to store both the Start and End of the Answer inside each Answer
numberAnswers = len(answerArray)
answerStartEndArray = np.empty((numberAnswers, 3), dtype=object)
deleteArray = np.empty(0, dtype=np.int32)

counter = 0
for i in range(numberAnswers):
  paragraph = answerFileArray[i].lower()
  answer = answerArray[i].lower().strip()
  answerStart = paragraph.find(" " + answer + " ")
  answerEnd = answerStart + len(" " + answer + " ")
  if answerStart < 0:
    counter += 1 
    deleteArray = np.append(deleteArray, i)
  else:
    answerStartEndArray[i][0] = answer
    answerStartEndArray[i][1] = answerStart
    answerStartEndArray[i][2] = answerEnd

print("Removed entries: " + str(counter))
#remove entries, where the answer was not found in the text from all three arrays
answerStartEndArray = np.delete(answerStartEndArray, deleteArray, 0)
questionDelArray = np.delete(questionArray, deleteArray)
multipleAnswerFileDelArray = np.delete(multipleAnswerFileArray, deleteArray, 0)
answerFileDelArray = np.delete(answerFileArray, deleteArray)

Removed entries: 19494


In [None]:
# When using Bert as above, Colab does not have enough RAM (the kernel closes after 20 seconds or so)
def getDistilBertModel():
  model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad', torchscript=True)
  tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
  return model, tokenizer

Function to:
1. tokenize the paragraph + question
2. Find for each question the bucket (with 512 tokens) that contains the answer
   to the question
   returns tokenized questions-paragraphs to use for model training

In [None]:
def tokenizeRightBucket(tokenizer, paragraphs, questions, answers):

  # tokenize the paragraph + question
  # train_encodings               -> 1 bucket for each question, truncated to 512 tokens
  # train_encodings_overflow      -> x buckets for each question, each bucket truncated to 512 tokens
  # train_encodings_non_truncated -> 1 bucket for each question, x amount of tokens (non-truncated)
  train_encodings = tokenizer(paragraphs.tolist(), questions.tolist(), padding=True, truncation=True)
  train_encodings_Overflow = tokenizer(paragraphs.tolist(), questions.tolist(), padding=True, truncation=True, max_length=512, return_overflowing_tokens=True)
  train_encodings_non_Truncated = tokenizer(paragraphs.tolist(), questions.tolist(), padding=False)

  
  # save the length of each question (to know where to start)
  lastBucket = 0
  maxLen = len(train_encodings_Overflow['input_ids'])

  # Go through every question and save only the bucket (the paragraph) with the answer to the question
  for i in range(len(train_encodings['input_ids'])):

    # Special Tokens that occur in each bucket: 101 = CLS, 102 = SEP
    # Tokens for Paragraph = Find the first use of the SEP
    SepToken = train_encodings['input_ids'][i].index(102)
    # Get the total Number of Paragraph Tokens (without the SEP/CLS tokens and without the question)
    NoTokens = train_encodings_non_Truncated['input_ids'][i].index(102) - 1
    # Get the total Number of buckets needed (to be later added to lastBucket)
    NoBuckets = int(NoTokens / SepToken)
    
    # Search the token inside the whole paragraph which represents the answer
    # tokenize the answer to know which token to look for
    # Searching for only the first token is not enough
    # Example: Right answer = Joan Rivers, but seraching only for Joan
    # Joan Alexandra is found, therefore search for all answer tokens
    tokenizedAnswer = tokenizer(answers[i][0], answers[i][0])['input_ids']
    # tokenized Answer looks something like this then: [CLS] joan rivers [SEP] joan rivers [SEP]
    # search for the tokens on position 1 and stop on the [SEP] token (number 102)
    # If there are multiple tokens in the answer they have to be exactly 1 position away from each other
    # in the non_truncated list in order to ensure that the correct answer is found and not some partly
    # correct string
    # Offset to give a start where to look for the answer
    answerSepPosition = tokenizedAnswer.index(102)
    tokenOffset = 0
    currentStart = 0 
    j = 1
    while j < answerSepPosition:
        # check if end is reached
        if j + 1 == answerSepPosition:
          break
        # save the start Position  
        if j == 1:
          currentStart = tokenOffset
        currentPosition = train_encodings_non_Truncated['input_ids'][i].index(tokenizedAnswer[j], tokenOffset)
        #Set tokenOffset to the current Position to not find any previous mentions of the next position
        tokenOffset = currentPosition + 1
        nextPosition = train_encodings_non_Truncated['input_ids'][i].index(tokenizedAnswer[j + 1], tokenOffset)
        # if the next answer part does not follow directly the first part: wrong answer found
        if not currentPosition + 1 == nextPosition:
          j = 1
        else:  
          j += 1
    # set Offset to the start of the answer token
    tokenOffset = currentStart

    # get the index of the answer Token
    tokenPosition = train_encodings_non_Truncated['input_ids'][i].index(tokenizedAnswer[1], tokenOffset)
    bucketNo = int(tokenPosition / SepToken)
    tokenPositionCorrected = tokenPosition - bucketNo * SepToken
    tokenPositionCorrected = tokenPositionCorrected + bucketNo

    # check if the calculated bucket has indeed the right token(s)
    # Very rarely the bucket is slightly off
    correction = 0
    try:
      j = 1
      while j < answerSepPosition:
        test = train_encodings_Overflow['input_ids'][bucketNo + lastBucket].index(tokenizedAnswer[j])
        j += 1
    except ValueError:
      try:
        j = 1
        while j < answerSepPosition:
          test = train_encodings_Overflow['input_ids'][bucketNo + lastBucket].index(tokenizedAnswer[j])
          j += 1
        correction = -1    
      except ValueError:
        try:
          j = 1
          while j < answerSepPosition:
            test = train_encodings_Overflow['input_ids'][bucketNo + lastBucket].index(tokenizedAnswer[j])
            j += 1
          correction = 1  
        except ValueError:
          #do nothing for now (entry will be removed later)
          correction = 0
    
    # replace the first bucket with the bucket that contains the answer (for both the input_ids and attention_mask)
    #print(tokenizer.decode(train_encodings_Overflow['input_ids'][bucketNo + lastBucket][tokenPositionCorrected]))
    train_encodings['input_ids'][i] = train_encodings_Overflow['input_ids'][bucketNo + lastBucket + correction]
    train_encodings['attention_mask'][i] = train_encodings_Overflow['attention_mask'][bucketNo + lastBucket + correction]

    #Increase the bucket count accordingly
    lastBucket += NoBuckets + 1 

    # check if the calculated lastBucket is really the last Bucket (the calculation is very slightly off rarely which can can cause problems)
    # save the tokens for the current question and compare them with the tokens in the next bucket
    # if they are the same: they belong to the same question
    # if not: different (exit loop)
    if lastBucket < maxLen:
      # Because of the overflowing tokens, the separation token is not in the same spot for all of them
      sepTokenOverflow = train_encodings_Overflow['input_ids'][lastBucket - 1].index(102)
      currentQuestion = train_encodings_Overflow['input_ids'][lastBucket - 1][sepTokenOverflow:]
      endReached = False
      while not endReached:
        #Same for overflowBucket
        sepTokenOverflowNextBucket = train_encodings_Overflow['input_ids'][lastBucket].index(102)
        
        # Find the start of the padding token (and therefore the end of the question at one position before that)
        # This is only relevant for the nextBucket, because if a padding exists in the previous bucket, then
        # this bucket is definitely the end of the question and no further check is needed.
        try:
          questionEndNextBucket = train_encodings_Overflow['input_ids'][lastBucket].index(0)
        except ValueError:
          # it might be possible that there is no padding but still it is the end of the question
          # if the tokens take up exactly 512 slots in the bucket -> then the length of the bucket
          # is the end of the question
          questionEndNextBucket = len(train_encodings_Overflow['input_ids'][lastBucket])

        questionNextBucket = train_encodings_Overflow['input_ids'][lastBucket][sepTokenOverflowNextBucket:questionEndNextBucket]
        #questionNextBucket[0] -> Padding (discard)
        if currentQuestion == questionNextBucket and questionNextBucket[0] != 0:
          lastBucket += 1
        else: 
          endReached = True
  return train_encodings 

 Add the start and end position of the answer to the train encodings

In [None]:
#
def add_token_positions(encodings, answers):
    # initialize lists to contain the token indices of answer start/end
    start_positions = np.zeros(len(encodings['input_ids']), dtype=object)
    end_positions = np.zeros(len(encodings['input_ids']), dtype=object)
    deleteArray = []
    for i in range(len(answers)):
        # tokenize the answer to know which token to look for
        tokenizedAnswer = tokenizer(answers[i][0], answers[i][0])['input_ids']
        sepTokenAnswer = tokenizedAnswer.index(102)
        sepTokenQuestion = encodings['input_ids'][i].index(102)
        
        tokenOffset = 0
        currentStart = 0 
        j = 1
        try:
          while j < sepTokenAnswer:
              # check if end is reached
              if j + 1 == sepTokenAnswer:
                break
              if j == 1:
                currentStart = tokenOffset
              currentPosition = encodings['input_ids'][i].index(tokenizedAnswer[j], tokenOffset)
              #Set tokenOffset to the current Position to not find any previous mentions of the next position
              tokenOffset = currentPosition + 1

              # it may happen that nextPosition is not part of the bucket anymore
              # Since only one bucket can be used at a time -> rest of the answer has to be discarded
              if (j + tokenOffset + 1) >= sepTokenQuestion:
                break

              nextPosition = encodings['input_ids'][i].index(tokenizedAnswer[j + 1], tokenOffset)
              # if the next answer part does not follow directly the first part: wrong answer found
              if not currentPosition + 1 == nextPosition:
                j = 1
              else:  
                j += 1
          tokenOffset = currentStart
          # find out how many tokens are used for the answer
          answerNoTokens = sepTokenAnswer - 2
          
          # append start/end token position using char_to_token method
          start_positions[i] = encodings['input_ids'][i].index(tokenizedAnswer[1], tokenOffset)   #encodings.char_to_token(i, answers[i][1])

          # if greater than 512: return to 511 (maximum index)
          end_positions[i] = start_positions[i] + answerNoTokens
          if end_positions[i] >= sepTokenQuestion:
            end_positions[i] = sepTokenQuestion - 1
        except ValueError:
          # if it hasn't been found: discard this entry
          # this happen in about 10 - 20 cases per 5000 questions
          start_positions[i] = 0
          end_positions[i] = 0
          deleteArray.append(i)                 

    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions.tolist(), 'end_positions': end_positions.tolist()})
    # delete the entries where the question was not found in the assigned bucket
    # this happens roughly in 1 of 500 questions
    for value in deleteArray:
      del encodings['input_ids'][value]
      del encodings['attention_mask'][value]
      del encodings['start_positions'][value]
      del encodings['end_positions'][value]
      answers = np.delete(answers, value, 0)      
    return answers

Define a class Dataset for the tokenized Questions+Paragraphs

In [None]:
class QuestionAnswerDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids) 

Train the Distilbert with the correct token inside the 512 token length paragraph

In [None]:
def modelTraining(model, train_dataset, batchSize=32):
  # setup GPU/CPU
  device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
  # move model over to detected device
  model.to(device)
  # activate training mode of model
  model.train()
  # initialize adam optimizer with weight decay (reduces chance of overfitting)
  optim = AdamW(model.parameters(), lr=0.00001)

  # initialize data loader for training data
  train_loader = DataLoader(train_dataset, batch_size=batchSize, shuffle=True)

  for epoch in range(1):
      # set model to train mode
      model.train()# setup loop (we use tqdm for the progress bar)
      loop = tqdm(train_loader, leave=True)
      for batch in loop:
          # initialize calculated gradients (from prev step)
          optim.zero_grad()
          # pull all the tensor batches required for training
          input_ids = batch['input_ids'].to(device)
          attention_mask = batch['attention_mask'].to(device)
          start_positions = batch['start_positions'].to(device)
          end_positions = batch['end_positions'].to(device)

          # train model on batch and return outputs (incl. loss)
          outputs = model(input_ids, attention_mask=attention_mask,
                          start_positions=start_positions,
                          end_positions=end_positions)
          
          # extract loss
          loss = outputs[0]
          # calculate loss for every parameter that needs grad update
          loss.backward()
          # update parameters
          optim.step()
          # print relevant info to progress bar
          loop.set_description(f'Epoch {epoch}')
          loop.set_postfix(loss=loss.item())

Train the model using above functions

In [None]:
def trainModel (model, tokenizer, train_Paragraph, train_Question, train_Answer):

  # Tokenize the Question + Paragraphs
  train_encodings = tokenizeRightBucket(tokenizer, train_Paragraph, train_Question, train_Answer)

  # Search for the Token inside the tokenized Paragraphs (if they are not truncated)
  train_AnswerCorrected = add_token_positions(train_encodings, train_Answer)

  # build datasets for both our training and validation sets
  train_dataset = QuestionAnswerDataset(train_encodings)

  # train the model
  modelTraining(model, train_dataset)

In [None]:
size = 0.05
train_Question, test_Question = train_test_split(questionDelArray, test_size=size, shuffle=False)
train_Answer, test_Answer = train_test_split(answerStartEndArray, test_size=size, shuffle=False)
multiple_Train_Paragraph, multiple_Test_Paragraph = train_test_split(multipleAnswerFileDelArray, test_size=size, shuffle=False)
train_Paragraph, test_Paragraph = train_test_split(answerFileDelArray, test_size=size, shuffle=False)

In [None]:
# Switch to DistilBert
model, tokenizer = getDistilBertModel()

# Get the latest Model
model = DistilBertForQuestionAnswering.from_pretrained('./gdrive/MyDrive/Information_Retrieval_Project/bertQA/', local_files_only = True, torchscript=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=451.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=265481570.0, style=ProgressStyle(descri…




In [None]:
# because of RAM limitations only 5000 entries are safe to train at a time
# therefore train the model on 5000 entries, then the next 5000 and so on
# Also saves the  model after 5000 steps 
# (in case Colab disconnects you the amount of data lost in reduced)
stepSize = 5000
startTraining = 0
endTraining = startTraining + stepSize
numberQuestions = len(train_Paragraph)
while startTraining < numberQuestions:
  trainModel(model, tokenizer, train_Paragraph[startTraining:endTraining], train_Question[startTraining:endTraining], train_Answer[startTraining:endTraining])
  startTraining += stepSize
  endTraining += stepSize
  model.save_pretrained('./gdrive/MyDrive/Information_Retrieval_Project/bertQA')

Token indices sequence length is longer than the specified maximum sequence length for this model (17444 > 512). Running this sequence through the model will result in indexing errors
Epoch 0: 100%|██████████| 156/156 [24:51<00:00,  9.56s/it, loss=0.594]
Epoch 0: 100%|██████████| 156/156 [27:25<00:00, 10.55s/it, loss=0.0783]
Epoch 0: 100%|██████████| 156/156 [29:27<00:00, 11.33s/it, loss=0.257]
Epoch 0: 100%|██████████| 156/156 [32:39<00:00, 12.56s/it, loss=0.0621]
Epoch 0: 100%|██████████| 156/156 [33:54<00:00, 13.04s/it, loss=0.47]
Epoch 0: 100%|██████████| 156/156 [34:24<00:00, 13.23s/it, loss=0.518]
Epoch 0: 100%|██████████| 156/156 [33:48<00:00, 13.00s/it, loss=0.218]
Epoch 0: 100%|██████████| 156/156 [34:09<00:00, 13.14s/it, loss=0.535]
Epoch 0: 100%|██████████| 9/9 [01:54<00:00, 12.72s/it, loss=0.518]


**Test different models**

In [None]:
# Test the Bert with 1 Text-input per Question
# Show the first 10 Question / Answer Pairs when using 1 paragraph per question
# Since these are in the training Data: Just for testing purposes!!
bertModel = getModel()[0]
distilBertModel = model

for question in enumerate(questionArray[:10]):
    print("Question: " + str(question[1]))

    # Test the Bert with 1 Text-input per Question
    print("1 Input Answer: " + str(questionAnswering(bertModel, tokenizer, question[1], answerFileArray[question[0]])))

    # Test the Bert model with multiple files per question
    print("Multiple Inputs Answer: " + str(multipleQuestionAnswering(bertModel, tokenizer, question[1], multipleAnswerFileArray[question[0]])))

    # Test the fine-tuned distilbert
    print("Distilbert Answer: " + str(multipleQuestionAnswering(distilBertModel, tokenizer, question[1], multipleAnswerFileArray[question[0]])))

    print("Correct Answer: " + str(answerArray[question[0]]))
    print()

**Evaluation**

In [36]:
def evaluateAnswers(correctAnswer, givenAnswer, counterCorrect, counterPartlyCorrect, counterWrong):
  if givenAnswer == correctAnswer:
    counterCorrect += 1
  elif len(str(givenAnswer)) > 50:
    counterWrong += 1
  else:

    # Only if all strings in the correct answer are found in the given answer
    # the answer is viewed as correct
    found = True
    for correct in str(correctAnswer).split():
      if not correct in str(givenAnswer):
        found = False

    if found == True:
      counterPartlyCorrect += 1
    else:
    # Same with the all the strings in the given answer  
      found = True  
      for given in str(givenAnswer).split():
        if not given in str(correctAnswer):
          found = False     
      if found == True:
        counterPartlyCorrect += 1
      else:
        counterWrong += 1        

  return (counterCorrect, counterPartlyCorrect, counterWrong)
   

**Evaluate different models on test Data**


In [None]:
# 3 models:
# Bert with 1 text input
# Bert with multiple (max 5) text inputs
# Fine-tuned DistilBert (only tested with multiple inputs [sigle input would also be possilbe])

# get the 2 models
bertModel = getModel()[0]
distilBertModel, tokenizer = getDistilBertModel()

# specify how many paragraphs should be respected for each question
# higher: takes longer but respects more text per question 
# (answer might else be truncated)
numberParagraphs = 3
counterQuestions = 0


# make counters for each of them
counterCorrectOneInput = 0
counterPartlyCorrectOneInput = 0
counterWrongOneInput = 0

counterCorrectMultipleInput = 0
counterPartlyCorrectMultipleInput = 0
counterWrongMultipleInput = 0

counterCorrectDistilBert = 0
counterPartlyCorrectDistilBert = 0
counterWrongDistilBert = 0

for i in range(len(test_Question[:300])):
  question = test_Question[i]
  paragraphArray = multiple_Test_Paragraph[i]
  correctAnswer = test_Answer[i][0]
  #print(str(i) + " out of " + str(len(test_Question)))
  #print(correctAnswer)

  # Bert with 1 text input
  #givenAnswer = questionAnswering(bertModel, tokenizer, question, paragraphArray[0], numberParagraphs)
  #counterCorrectOneInput, counterPartlyCorrectOneInput, counterWrongOneInput = evaluateAnswers(correctAnswer, givenAnswer, counterCorrectOneInput, counterPartlyCorrectOneInput, counterWrongOneInput)
  #print(givenAnswer)

  # Bert with multiple text inputs
  #givenAnswer = multipleQuestionAnswering(bertModel, tokenizer, question, paragraphArray, numberParagraphs)
  #counterCorrectMultipleInput, counterPartlyCorrectMultipleInput, counterWrongMultipleInput = evaluateAnswers(correctAnswer, givenAnswer, counterCorrectMultipleInput, counterPartlyCorrectMultipleInput, counterWrongMultipleInput)

  # DistilBert (multiple text inputs always seem to work better)
  givenAnswer = multipleQuestionAnswering(distilBertModel, tokenizer, question, paragraphArray, numberParagraphs)
  counterCorrectDistilBert, counterPartlyCorrectDistilBert, counterWrongDistilBert = evaluateAnswers(correctAnswer, givenAnswer, counterCorrectDistilBert, counterPartlyCorrectDistilBert, counterWrongDistilBert)   

  counterQuestions += 1

#print("Correct answers 1 Input: " + str(counterCorrectOneInput))
#print("Partly correct answers 1 Input: " + str(counterPartlyCorrectOneInput))
#print("Wrong answers 1 Input: " + str(counterWrongOneInput))
#print()

#print("Correct answers multiple Inputs: " + str(counterCorrectMultipleInput))
#print("Partly correct answers multiple Inputs: " + str(counterPartlyCorrectMultipleInput))
#print("Wrong answers multiple Inputs: " + str(counterWrongMultipleInput))
#print()

print("Correct answers Distilbert: " + str(counterCorrectDistilBert))
print("Partly correct answers Distilbert: " + str(counterPartlyCorrectDistilBert))
print("Wrong answers Distilbert: " + str(counterWrongDistilBert))
print()

print("Total number of questions: " + str(counterQuestions))  

**Evaluate each Epoch of the Distilbert model**



In [None]:
numberParagraphs = 3

# create counters
counterCorrect = 0
counterPartlyCorrect = 0
counterWrong = 0

for i in range(6):

  counterCorrect = 0
  counterPartlyCorrect = 0
  counterWrong = 0

  #the time colab provides is not enough to run through all of them at once (comment the model to be used out)
  #and change the number of range above depending on the model used
  if i == 0:
    continue
    #model, tokenizer = getModel()
    #model, tokenizer = getDistilBertModel()
  else:  
    model = DistilBertForQuestionAnswering.from_pretrained('./gdrive/MyDrive/Information_Retrieval_Project/bertQAEpochs/Epoch ' + str(i) + '/', local_files_only = True, torchscript=True)
  
  for j in range(len(test_Question[:250])):
    question = test_Question[j]
    paragraphArray = multiple_Test_Paragraph[j]
    correctAnswer = test_Answer[j][0]

    # DistilBert
    givenAnswer = multipleQuestionAnswering(model, tokenizer, question, paragraphArray, numberParagraphs)
    counterCorrect, counterPartlyCorrect, counterWrong = evaluateAnswers(correctAnswer, givenAnswer, counterCorrect, counterPartlyCorrect, counterWrong)   

  print(str(i) + ". Epoch")
  #print("Pre-Trained Bert")
  print("Correct answers: " + str(counterCorrect))
  print("Partly correct answers: " + str(counterPartlyCorrect))
  print("Wrong answers: " + str(counterWrong))
  print()


Results from the test set:

Pre-Trained Bert:
0. 
Correct answers: 113,
Partly correct answers: 50,
Wrong answers: 87

Pre-Trained DistilBert:
0. Epoch
Correct answers: 80,
Partly correct answers: 43,
Wrong answers: 127

Trained DistilBert:
1. Epoch
Correct answers: 108,
Partly correct answers: 50,
Wrong answers: 92

2. Epoch
Correct answers: 109,
Partly correct answers: 35,
Wrong answers: 106

3. Epoch
Correct answers: 104,
Partly correct answers: 32,
Wrong answers: 114

4. Epoch
Correct answers: 105,
Partly correct answers: 29,
Wrong answers: 116

5. Epoch
Correct answers: 100,
Partly correct answers: 31,
Wrong answers: 119

**Evaluate different models on the validation set**

In [None]:
# Open the training file
with open(verifiedPath, 'r', encoding='utf-8') as jsonFile:
    verifiedData = json.load(jsonFile)
archive = zipfile.ZipFile(evidencePath, 'r')    

In [None]:
verifiedDataArray = np.array(verifiedData["Data"])
verifiedMaxParagraphs = 5

verifiedQuestionArray = np.zeros(len(verifiedDataArray), dtype='object')
verifiedAnswerArray = np.zeros(len(verifiedDataArray), dtype='object')
verifiedAnswerTextArray = np.zeros((len(verifiedDataArray), verifiedMaxParagraphs), dtype='object')


i = 0
# Read the Question, Answers + Paragraphs and store them in an array each
for item in verifiedDataArray:
    verifiedQuestionArray[i] = item["Question"]
    verifiedAnswerArray[i] = item["Answer"]["Value"]

    j = 0
    for file in item["EntityPages"]:
      try:
        verifiedAnswerTextArray[i][j] = archive.read('wikipedia/' + file["Filename"]).decode("utf-8")
      except KeyError:
        verifiedAnswerTextArray[i][j] = "NAN"
      j += 1
      # if maximum no of paragraphs is reached:
      # do not continue inner for loop
      if j == verifiedMaxParagraphs:
        break

    i += 1       

In [None]:
# Verify on validation data set
# 1. on the standard DistilBert
# 2. on the trained DistilBert
# 3. on the standard Bert


# specify how many paragraphs should be respected for each question
# higher: takes longer but respects more text per question 
# (answer might else be truncated)
numberParagraphs = 5

for j in range(5):

  # create counters
  counterCorrect = 0
  counterPartlyCorrect = 0
  counterWrong = 0
  counterQuestions = 0
  
  #the time colab provides is not enough to run through all of them at once (comment the model to be used out)
  #and change the number of range above depending on the model
  if j == 0:
    model, tokenizer = getDistilBertModel()
    text = "Standard DistilBert"
  elif j < 4:
    model = DistilBertForQuestionAnswering.from_pretrained('./gdrive/MyDrive/Information_Retrieval_Project/bertQAEpochs/Epoch ' + str(j) + '/', local_files_only = True, torchscript=True)
    text = "Epoch: " + str(j)
  elif j == 4:
    model, tokenizer = getModel()
    text = "Standard Bert"


  for i in range(len(verifiedQuestionArray)):
      counterQuestions += 1
      question = verifiedQuestionArray[i]
      paragraphArray = verifiedAnswerTextArray[i]
      correctAnswer = verifiedAnswerArray[i]
      
      givenAnswer = multipleQuestionAnswering(model, tokenizer, question, paragraphArray, numberParagraphs)
      counterCorrect, counterPartlyCorrect, counterWrong = evaluateAnswers(correctAnswer, givenAnswer, counterCorrect, counterPartlyCorrect, counterWrong)   

  print(text)
  print("Correct answers: " + str(counterCorrect))
  print("Partly correct answers: " + str(counterPartlyCorrect))
  print("Wrong answers: " + str(counterWrong))
  print("Number Questions: " + str(counterQuestions))
  print()

Results from the Validation Set: 

Standard Bert:
Correct answers: 6,
Partly correct answers: 8,
Wrong answers: 304

Standard DistilBert:
Correct answers: 4, 
Partly correct answers: 15, 
Wrong answers: 299

Epoch 1:
Correct answers: 6,
Partly correct answers: 31,
Wrong answers: 281

Epoch 2:
Correct answers: 5,
Partly correct answers: 24,
Wrong answers: 289

Epoch 3:
Correct answers: 5,
Partly correct answers: 26,
Wrong answers: 287

**See the time difference**

In [None]:
import time

# Normal Bert
model, tokenizer = getModel()
start = time.time()
multipleQuestionAnswering(model, tokenizer, questionArray[0], answerFileArray[0:1])
print("Standard Bert: --- %s seconds ---" % (time.time() - start))

# Distil Bert
model, tokenizer = getDistilBertModel()
start = time.time()
multipleQuestionAnswering(model, tokenizer, questionArray[0], answerFileArray[0:1])
print("Distil Bert: --- %s seconds ---" % (time.time() - start))

Standard Bert: --- 6.701009273529053 seconds ---
Distil Bert: --- 4.129925727844238 seconds ---


**Make the pipeline work**

In [37]:
# Open the training file
import pandas as pd

def answerFromCSV (path):
  df = pd.read_csv(path, sep=',')
  model, tokenizer = getDistilBertModel()
  model = DistilBertForQuestionAnswering.from_pretrained('./gdrive/MyDrive/Information_Retrieval_Project/bertQAEpochs/Epoch 1/', local_files_only = True, torchscript=True)
  question = df.iloc[0]['full_question']
  textArray = np.asarray(df.iloc[0:5]['full_doc'])

  answer = multipleQuestionAnswering(model, tokenizer, question, textArray)
  return answer, df['answer'][0]

In [39]:
predicted, correct = answerFromCSV('./gdrive/MyDrive/Information_Retrieval_Project/duoBERT/finaltest.csv')
print("Correct Answer: " + str(correct).lower())
print("Predicted Answer: " + str(predicted))

predicted, correct = answerFromCSV('./gdrive/MyDrive/Information_Retrieval_Project/duoBERT/finaltest2.csv')
print("Correct Answer: " + str(correct).lower())
print("Predicted Answer: " + str(predicted))

Correct Answer: dukan diet
Predicted Answer: dukan diet
Correct Answer: the daleks
Predicted Answer: 42
