In [None]:
%%capture
!pip install transformers

In [None]:
import torch
from transformers import AutoTokenizer,BertTokenizerFast, BertForQuestionAnswering

In [None]:
# Mount Google Drive at /content/drive (Colab only). The fine-tuned checkpoints
# loaded further down are read from /content/drive/MyDrive/QA/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Tokenizer used to encode every (question, context) pair in this notebook.
# NOTE(review): assumes both checkpoints were fine-tuned from bert-base-uncased — confirm.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# The checkpoints are loaded as whole model objects (they are called directly as
# model(**inputs) below), so torch.load has to unpickle arbitrary classes.
# weights_only=False is passed explicitly because PyTorch >= 2.6 defaults to
# weights_only=True, which refuses full-model pickles (the keyword needs torch >= 1.13).
# NOTE(security): unpickling can execute arbitrary code — only load checkpoints you trust.
modelSquad = torch.load("/content/drive/MyDrive/QA/finalFineTunedModelSquadV2",map_location=torch.device('cpu'),weights_only=False)
modelNewsQA = torch.load("/content/drive/MyDrive/QA/finalFineTunedModelNewsQA",map_location=torch.device('cpu'),weights_only=False)

# The notebook only runs inference: switch both models to evaluation mode so
# dropout is disabled and predictions are deterministic.
modelSquad.eval()
modelNewsQA.eval()

In [None]:
# Smoke test: ask the SQuAD model one question about a one-sentence context.
inputs=tokenizer.encode_plus('what is my name', 'my name is meet', return_tensors='pt')
# Inference only — no_grad avoids building an autograd graph for the forward pass.
with torch.no_grad():
  outputs=modelSquad(**inputs)
answer_start = torch.argmax(outputs[0]) # get the most likely beginning of answer with the argmax of the score
answer_end = torch.argmax(outputs[1]) + 1 # +1 so the slice below includes the end token
# Map the selected token ids back to text.
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
print(answer)

meet


## Functions to make predictions


Here I used some useful functions from the evaluation script of SQuAD dataset 2.0 so as to evaluate my fine-tuned model.

In [None]:
def predict(model,context,query):
  """Extract the answer to `query` from `context` with an extractive QA model.

  Uses the module-level `tokenizer` to encode the pair and to decode the
  selected span.

  Args:
    model: QA model, called as model(**inputs); index 0 of its output is used
      as the start scores and index 1 as the end scores. It is switched to
      evaluation mode as a side effect.
    context: passage in which to look for the answer.
    query: question text.

  Returns:
    The decoded text of the tokens between the highest-scoring start position
    and the highest-scoring end position (inclusive). The slice is empty when
    the end position precedes the start position; callers in this notebook
    treat "", "[CLS]" and "[SEP]" as "no answer".
  """
  # Encode as a (question, context) pair. Only the context may be truncated, so an
  # over-long passage is cut to the model's maximum length instead of crashing the model.
  inputs = tokenizer(query, context, truncation="only_second", return_tensors='pt')

  # Inference only: disable dropout and skip autograd bookkeeping.
  model.eval()
  with torch.no_grad():
    outputs = model(**inputs)

  answer_start = torch.argmax(outputs[0])  # most likely start token
  answer_end = torch.argmax(outputs[1]) + 1  # most likely end token, +1 for an inclusive slice

  span_ids = inputs['input_ids'][0][answer_start:answer_end]
  answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(span_ids))

  return answer

def normalize_text(s):
  """Canonicalise an answer string before scoring.

  Steps, in order: lowercase, delete ASCII punctuation, replace the standalone
  articles a/an/the with a space, collapse runs of whitespace to single spaces.
  """
  import string, re

  # 1) case-fold
  lowered = s.lower()

  # 2) drop every character listed in string.punctuation
  without_punctuation = lowered.translate(str.maketrans("", "", string.punctuation))

  # 3) blank out whole-word articles (word boundaries keep "there", "another" intact)
  without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)

  # 4) squeeze whitespace and trim both ends
  return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after normalize_text, otherwise 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1(prediction, truth):
  """Token-level F1 between a predicted answer and the ground-truth answer.

  Both strings are passed through normalize_text and split on whitespace.
  The overlap is a multiset intersection, so a token that occurs k times in
  both answers counts k times; this keeps the numerator consistent with the
  token counts used as denominators.

  Args:
    prediction: predicted answer text ("" for no answer).
    truth: ground-truth answer text ("" for no answer).

  Returns:
    1 or 0 (int) when either side has no tokens — 1 only if both are empty;
    0 (int) when no tokens are shared; otherwise the F1 score as a float.
  """
  from collections import Counter

  pred_tokens = normalize_text(prediction).split()
  truth_tokens = normalize_text(truth).split()

  # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
  if len(pred_tokens) == 0 or len(truth_tokens) == 0:
    return int(pred_tokens == truth_tokens)

  # Counter & Counter keeps, per token, the smaller of the two occurrence counts.
  # A plain set intersection would count repeated tokens once and under-score
  # answers such as "over ... species of ... over ... species of ...".
  shared_counts = Counter(pred_tokens) & Counter(truth_tokens)
  num_shared = sum(shared_counts.values())

  # if there are no common tokens then f1 = 0
  if num_shared == 0:
    return 0

  prec = num_shared / len(pred_tokens)
  rec = num_shared / len(truth_tokens)

  return 2 * (prec * rec) / (prec + rec)

In [None]:
def _score_prediction(prediction, answer):
  """Return (display_text, exact_match, f1) for one raw model prediction."""
  if prediction in ("", "[CLS]", "[SEP]"):
    # The model abstained. Score it as an empty answer so that it matches an
    # empty ground truth (unanswerable question); the placeholder text is for
    # display only and must not be compared against the true answer.
    return "No answer predicted!!", compute_exact_match("", answer), compute_f1("", answer)
  return prediction, compute_exact_match(prediction, answer), compute_f1(prediction, answer)

def give_an_answer(context,query,answer):
  """Run both fine-tuned models on one question and print predictions and scores.

  Uses the module-level models `modelNewsQA` and `modelSquad`.

  Args:
    context: passage the models should extract the answer from.
    query: question text.
    answer: expected answer; "" means the question has no answer in the context.

  Returns:
    None. The question, the true answer and, per model, the prediction with its
    exact-match and F1 scores are printed.
  """
  prediction_newsQA = predict(modelNewsQA,context,query)
  prediction_Squad = predict(modelSquad,context,query)

  prediction_newsQA, em_score_newsqa, f1_score_newsqa = _score_prediction(prediction_newsQA, answer)
  prediction_Squad, em_score_squad, f1_score_squad = _score_prediction(prediction_Squad, answer)

  print(f"Question: {query}")
  print(f"True Answer: {answer}")
  print("-----------------------")
  print(f"Prediction SQUAD: {prediction_Squad}")
  print(f"EM SQUAD: {em_score_squad}")
  print(f"F1 SQUAD: {f1_score_squad}")
  print("-----------------------")
  print(f"Prediction NewsQA: {prediction_newsQA}")
  print(f"EM NEWSQA: {em_score_newsqa}")
  print(f"F1 NEWSQA: {f1_score_newsqa}")
  print("\n")

## Testing

##### Both of our models perform well on the simple example below!

In [None]:
# Short first-person paragraph used as the context for the questions below.
context = "Hello! My name is Liam and I am 29 years old. I used to live in Oakville of Ontario, but now I moved to Burlington of Ontario. I enjoy exploring new places and trying new foods. I have a passion for photography and enjoy taking pictures of the beautiful landscapes around me."

# Questions and their expected answers, paired by position.
queries = ["What is my name?",
           "What is age of liam?",
           "Where does Liam live currently?",
           "What is liam's passion?"
          ]
answers = ["Liam",
           "29 years",
           "Burlington",
           "photography"
          ]

# Query both models with every (question, expected answer) pair and print the scores.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: What is my name?
True Answer: Liam
-----------------------
Prediction SQUAD: liam
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: liam
EM NEWSQA: 1
F1 NEWSQA: 1.0


Question: What is age of liam?
True Answer: 29 years
-----------------------
Prediction SQUAD: 29
EM SQUAD: 0
F1 SQUAD: 0.6666666666666666
-----------------------
Prediction NewsQA: 29 years old.
EM NEWSQA: 0
F1 NEWSQA: 0.8


Question: Where does Liam live currently?
True Answer: Burlington
-----------------------
Prediction SQUAD: burlington
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: No answer predicted!!
EM NEWSQA: 0
F1 NEWSQA: 0


Question: What is liam's passion?
True Answer: photography
-----------------------
Prediction SQUAD: photography
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: photography
EM NEWSQA: 1
F1 NEWSQA: 1.0




##### Here I took some content from Wikipedia pages to test my model. I observed that for questions that require an answer with more than one entity, where the entities are separated by commas in the context, the model returns only the first one (as in the question about the members of the band). Moreover, when I asked what kind of band they are, the model gave me the answer "British rock", even though I didn't ask about the origin of the band.

In [None]:
# Single-paragraph context about the Great Barrier Reef.
context = """ The Great Barrier Reef is the world's largest coral reef system, located in the Coral Sea, off the coast of Australia. It is composed of over 2,900 individual reefs and 900 islands, stretching for over 2,300 km. The Great Barrier Reef is home to thousands of species of marine life, including over 1,500 species of fish and over 600 species of hard and soft coral."""

# Questions and their expected answers, paired by position.
# NOTE(review): the expected answers write numbers as "2, 300" / "1, 500" (space after
# the comma) although the context has "2,300" / "1,500" — presumably to match how the
# decoded predictions are spaced; normalize_text removes the comma but keeps the space,
# so the two spellings tokenize differently. Confirm this is intentional.
queries = ["Where is the Great Barrier Reef located? ",
           "How long is great barrier reef?",
           "What type of marine life in reef?"
          ]
answers = ["Coral Sea, off the coast of Australia",
           "2, 300 km",
           "over 1, 500 species of fish and over 600 species of hard and soft coral"
          ]

# Query both models with every (question, expected answer) pair and print the scores.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: Where is the Great Barrier Reef located? 
True Answer: Coral Sea, off the coast of Australia
-----------------------
Prediction SQUAD: coral sea
EM SQUAD: 0
F1 SQUAD: 0.5
-----------------------
Prediction NewsQA: off the coast of australia.
EM NEWSQA: 0
F1 NEWSQA: 0.8


Question: How long is great barrier reef?
True Answer: 2, 300 km
-----------------------
Prediction SQUAD: 2, 300 km
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: over 2, 300 km.
EM NEWSQA: 0
F1 NEWSQA: 0.8571428571428571


Question: What type of marine life in reef?
True Answer: over 1, 500 species of fish and over 600 species of hard and soft coral
-----------------------
Prediction SQUAD: over 1, 500 species of fish and over 600 species of hard and soft coral
EM SQUAD: 1
F1 SQUAD: 0.7333333333333333
-----------------------
Prediction NewsQA: coral
EM NEWSQA: 0
F1 NEWSQA: 0.125




In [None]:
# Multi-line context about Mount Olympus; the line breaks and indentation are part
# of the string that is passed to the tokenizer.
context = """ Mount Olympus is the highest mountain in Greece. It is part of the Olympus massif near 
              the Gulf of Thérmai of the Aegean Sea, located in the Olympus Range on the border between 
              Thessaly and Macedonia, between the regional units of Pieria and Larissa, about 80 km (50 mi) 
              southwest from Thessaloniki. Mount Olympus has 52 peaks and deep gorges. The highest peak, 
              Mytikas, meaning "nose", rises to 2917 metres (9,570 ft). It is one of the 
              highest peaks in Europe in terms of topographic prominence. """

# Questions and their expected answers, paired by position.
queries = [
           "How many metres is Olympus?",
           "Where Olympus is near?",
           "How far away is Olympus from Thessaloniki?"
          ]
answers = [
           "2917",
           "Gulf of Thérmai of the Aegean Sea",
           "80 km (50 mi)"
          ]

# Query both models with every (question, expected answer) pair and print the scores.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: How many metres is Olympus?
True Answer: 2917
-----------------------
Prediction SQUAD: 2917
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: No answer predicted!!
EM NEWSQA: 0
F1 NEWSQA: 0


Question: Where Olympus is near?
True Answer: Gulf of Thérmai of the Aegean Sea
-----------------------
Prediction SQUAD: gulf of thermai of the aegean sea
EM SQUAD: 0
F1 SQUAD: 0.6666666666666666
-----------------------
Prediction NewsQA: gulf of thermai of the aegean sea,
EM NEWSQA: 0
F1 NEWSQA: 0.6666666666666666


Question: How far away is Olympus from Thessaloniki?
True Answer: 80 km (50 mi)
-----------------------
Prediction SQUAD: 80 km
EM SQUAD: 0
F1 SQUAD: 0.6666666666666666
-----------------------
Prediction NewsQA: about 80 km ( 50 mi )
EM NEWSQA: 0
F1 NEWSQA: 0.888888888888889




##### Here we give it a slightly longer paragraph, and it is still able to answer most questions, apart from the 6th one, where some multi-hop reasoning ("hopping") is required.

In [None]:
# Longer multi-sentence context about the human brain. Line breaks fall between
# words only: a break inside a word would split it into two separate tokens.
context = """ The human brain is one of the most complex and fascinating organs in the human body. It weighs about 3 pounds and is 
            responsible for everything we do, think, and feel. The brain is divided into several parts, each with a specific function. The cerebral cortex, which is the outer 
            layer of the brain, is responsible for consciousness, thought, and movement. The cerebellum, located at the base of the brain, controls movement and 
            balance. The brainstem, which connects the brain to the spinal cord, controls involuntary functions such as breathing and heart rate. The brain is composed of over 
            100 billion neurons, which communicate with each other through electrical and chemical signals. The neurons are connected by synapses, which are the gaps between them. 
            The synapses allow the neurons to transmit information quickly and efficiently. The brain is also responsible for creating and storing 
            memories, which are formed through the process of encoding, storage, and retrieval."""

# Questions and their expected answers, paired by position. The last question is
# not covered by the context, so its expected answer is the empty string.
queries = [
           "How much does the human brain weigh?",
           "What is the function of the cerebellum?",
           "What is the cerebral cortex responsible for?",
           "What do synapses do?",
           "How are memories formed? ",
           "where is the organ responsible for consciousness, thought, and movement located?",
           "what do peacocks eat?"
          ]
answers = [
           "about 3 pounds",
           "controls movement and balance",
           "consciousness, thought, and movement",
           "allow the neurons to transmit information quickly and efficiently.",
           "encoding, storage, and retrieval",
           "outer layer of the brain",
           ""
          ]

# Query both models with every (question, expected answer) pair and print the scores.
for q,a in zip(queries,answers):
  give_an_answer(context,q,a)

Question: How much does the human brain weigh?
True Answer: about 3 pounds
-----------------------
Prediction SQUAD: about 3 pounds
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: about 3 pounds
EM NEWSQA: 1
F1 NEWSQA: 1.0


Question: What is the function of the cerebellum?
True Answer: controls movement and balance
-----------------------
Prediction SQUAD: controls movement and balance
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: controls movement and balance.
EM NEWSQA: 1
F1 NEWSQA: 1.0


Question: What is the cerebral cortex responsible for?
True Answer: consciousness, thought, and movement
-----------------------
Prediction SQUAD: consciousness, thought, and movement
EM SQUAD: 1
F1 SQUAD: 1.0
-----------------------
Prediction NewsQA: consciousness, thought, and movement.
EM NEWSQA: 1
F1 NEWSQA: 1.0


Question: What do synapses do?
True Answer: allow the neurons to transmit information quickly and efficiently.
-----------------------
Pre