In [None]:
#Install following libraries before first run. For subsequent runs, you may comment these
!pip install transformers
!pip install torch

#Import libraries
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import torch
import numpy as np

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 15.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 54.6MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 55.6MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [None]:
# Load the BERT-large (uncased, whole-word-masking) checkpoint that is already fine-tuned on the
# SQuAD question-answering dataset. The first run downloads roughly 1.3 GB of weights, so it may take some time.
# Note: this is an uncased model, so all answers come back in lower case.

model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…




In [None]:
# Load the BERT tokenizer published under the same checkpoint name as the model loaded above,
# so that questions and passages are converted to the token ids that checkpoint was trained with.
tokenizer_for_bert = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
def bert_answering_machine(question, passage, max_len=512):
    '''Extract the answer to `question` from `passage` using the fine-tuned BERT QA model.

    Uses the notebook-level `tokenizer_for_bert` and `model` objects.

    Args:
        question: Question text; encoded as the first segment.
        passage: Text in which the answer is searched; encoded as the second segment.
        max_len: Maximum number of tokens (question + passage + special tokens) kept by the
            tokenizer. Anything longer is truncated. The value must not exceed 512.

    Returns:
        A tuple `(answer_start_index, answer_end_index, start_token_score, end_token_score, answer)`:
        the token positions with the highest start/end scores, those two scores rounded to
        2 decimals, and the answer text rebuilt from the tokens in between. The scores can be
        used to rank answers when the same question is asked against several passages.
        When the prediction looks like "no answer" (see the check at the end of the function),
        `answer` is replaced by an apology message; indices and scores are still returned.
    '''
    # Tokenize question and passage as a sentence pair. The tokenizer adds the special
    # tokens itself: [CLS] question [SEP] passage [SEP].
    input_ids = tokenizer_for_bert.encode(question, passage, max_length=max_len, truncation=True)

    # The first [SEP] closes the question segment. Ask the tokenizer for its id instead of
    # hard-coding 102, so the lookup stays correct for any vocabulary.
    first_sep_index = input_ids.index(tokenizer_for_bert.sep_token_id)
    len_question = first_sep_index + 1            # [CLS] + question tokens + first [SEP]
    len_passage = len(input_ids) - len_question   # passage tokens + final [SEP]

    # BERT needs segment ids to tell the two sentences apart: 0 for the question, 1 for the passage.
    segment_ids = [0] * len_question + [1] * len_passage

    # Token strings are needed later to rebuild the answer text.
    tokens = tokenizer_for_bert.convert_ids_to_tokens(input_ids)

    # Run the model ONCE and read both score vectors from the same output (element 0 holds the
    # start scores, element 1 the end scores). Gradients are not needed for inference, so
    # autograd tracking is switched off to save time and memory.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # Convert the score tensors to flat numpy arrays so that numpy functions can be used.
    start_token_scores = outputs[0].numpy().flatten()
    end_token_scores = outputs[1].numpy().flatten()

    # The answer span is delimited by the highest-scoring start and end positions.
    answer_start_index = np.argmax(start_token_scores)
    answer_end_index = np.argmax(end_token_scores)

    # Scores of the chosen start/end tokens, rounded to 2 decimal digits.
    start_token_score = np.round(start_token_scores[answer_start_index], 2)
    end_token_score = np.round(end_token_scores[answer_end_index], 2)

    # Rebuild the answer text. The tokenizer splits out-of-vocabulary words into pieces whose
    # continuation tokens start with '##'; glue those back onto the previous piece and
    # separate whole words with a single space.
    answer = tokens[answer_start_index]
    for token in tokens[answer_start_index + 1:answer_end_index + 1]:
        if token.startswith('##'):
            answer += token[2:]
        else:
            answer += ' ' + token

    # Patterns indicating that BERT did not find an answer in the passage: the span starts on
    # [CLS], the start score is negative, the answer is just [SEP], or the span is inverted.
    if (answer_start_index == 0) or (start_token_score < 0) or (answer == '[SEP]') or (answer_end_index < answer_start_index):
        answer = "Sorry!, I could not find  an answer in the passage."

    return (answer_start_index, answer_end_index, start_token_score, end_token_score, answer)


# Quick smoke test: the last expression of the cell is displayed, showing the full
# (start index, end index, start score, end score, answer) tuple.
bert_answering_machine(
    question="Which state john's friend lives",
    passage='My name is John. I live in San Jose, California. Rob is my friend. He lives in Seattle, Washington',
)

(32, 32, 6.13, 6.94, 'washington')

In [None]:
# BERT Question-Answer Sample 1
passage="John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis"

print('Passage:\n', passage)
print(f'Length of the passage: {len(passage.split())} words')

# Three questions of increasing difficulty for the same passage
question1 = "Who is John's sister"                     # BERT needs to apply some logic to answer this
question2 = "Which college does John's sister attend"  # BERT needs to answer the intermediate question (Question 1) first
question3 = "Who is the president of UC Davis"         # BERT can not answer this from this passage

# Ask every question in turn; only the answer text (last tuple element) is printed
for number, question in enumerate((question1, question2, question3), start=1):
    print(f'\nQuestion {number}:\n', question)
    _, _, _, _, ans = bert_answering_machine(question, passage)
    print('\nAnswer from BERT: ', ans, '\n')

Passage:
 John is a 10 year old boy. He is the son of Robert Smith.  Elizabeth Davis is Robert's wife. She teaches at UC Berkeley. Sophia Smith is Elizabeth's daughter. She studies at UC Davis
Length of the passage: 34 words

Question 1:
 Who is John's sister

Answer from BERT:  sophia smith 


Question 2:
 Which college does John's sister attend

Answer from BERT:  uc davis 


Question 3:
 Who is the president of UC Davis

Answer from BERT:  Sorry!, I could not find  an answer in the passage. 



In [None]:
# BERT Question-Answer Sample 2

passage = " Apple has told employees it'll provide them with paid time off to vote in the US presidential election on Nov. 3, according to a report. \
Workers, who wish to vote that Tuesday will be given up to four hours of pay, Bloomberg reported Friday citing an internal Apple memo. \
It follows Twitter in June making Election Day a paid holiday for US employees. For retail team members and hourly workers across the company, \
if you are scheduled to work this Election Day, we will be providing up to four hours of paid time off if you need it to get to the polls, \
said Deirdre O'Brien, Apple's senior vice president of retail and people, in the reported memo. Teams can also use this time to volunteer as an \
election worker at one of your local polling stations. Apple didn't immediately respond to a request for comment. \
Since Election Day in the US falls on a Tuesday, it can be difficult for people to find time outside of work to visit a polling place and vote "

print('Passage:\n', passage)
print(f'Length of the passage: {len(passage.split())} words')

# Questions asked against the news passage above
question1 = "On what date we have Election Day"
question2 = "What's the concern discussed here"
question3 = "Who is Senior VP at Apple mentioned in this passage "
question4 = "How Apple is addressing the issue "
question5 = "What's the alternate use of paid time off "

# Ask every question in turn; only the answer text (last tuple element) is printed
for number, question in enumerate(
        (question1, question2, question3, question4, question5), start=1):
    print(f'\nQuestion {number}:\n', question)
    _, _, _, _, ans = bert_answering_machine(question, passage)
    print('\nAnswer from BERT: ', ans, '\n')



Passage:
  Apple has told employees it'll provide them with paid time off to vote in the US presidential election on Nov. 3, according to a report. Workers, who wish to vote that Tuesday will be given up to four hours of pay, Bloomberg reported Friday citing an internal Apple memo. It follows Twitter in June making Election Day a paid holiday for US employees. For retail team members and hourly workers across the company, if you are scheduled to work this Election Day, we will be providing up to four hours of paid time off if you need it to get to the polls, said Deirdre O'Brien, Apple's senior vice president of retail and people, in the reported memo. Teams can also use this time to volunteer as an election worker at one of your local polling stations. Apple didn't immediately respond to a request for comment. Since Election Day in the US falls on a Tuesday, it can be difficult for people to find time outside of work to visit a polling place and vote 
Length of the passage: 175 words


In [None]:
# BERT Question-Answer Sample 3: a short, informal self-introduction
passage = '''hi my name is ALPESH, and i study at amity university Mumbai, i am 22 year old, i love nlp'''

print('Passage:\n', passage)
print(f'Length of the passage: {len(passage.split())} words')

question1 = "What is my name"
question2 = "How old ALPESH is"
question3 = "where ALPESH study"

# Print each question with its own number and text right before its answer. Looping over the
# questions keeps the printed label, the printed question and the question actually sent to
# BERT in sync (previously every stanza printed "Question 1" and the text of question1).
for number, question in enumerate((question1, question2, question3), start=1):
    print(f'\nQuestion {number}:\n', question)
    # Only the answer text (last tuple element) is needed here
    _, _, _, _, ans = bert_answering_machine(question, passage)
    print('\nAnswer from BERT: ', ans, '\n')

Passage:
 hi my name is ALPESH, and i study at amity university Mumbai, i am 22 year old, i love nlp
Length of the passage: 20 words

Question 1:
 What is my name

Answer from BERT:  alpesh 


Question 1:
 What is my name

Answer from BERT:  22 year old 


Question 1:
 What is my name

Answer from BERT:  amity university mumbai 

