In [16]:
#Import transformers library from hugging face
!pip install transformers



In [17]:
#Import the needed libraries
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
from flask import Flask, render_template, redirect, url_for,request
from flask import make_response
#Flask application object, created with this module's name
app = Flask(__name__)

In [6]:

#This mounts your Google Drive to the Colab VM
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

#Load the CoQA training set (JSON hosted by Stanford NLP) into a dataframe
coqa = pd.read_json('http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json')
#Preview the first rows
coqa.head()


Unnamed: 0,version,data
0,1,"{'source': 'wikipedia', 'id': '3zotghdk5ibi9ce..."
1,1,"{'source': 'cnn', 'id': '3wj1oxy92agboo5nlq4r7..."
2,1,"{'source': 'gutenberg', 'id': '3bdcf01ogxu7zdn..."
3,1,"{'source': 'cnn', 'id': '3ewijtffvo7wwchw6rtya..."
4,1,"{'source': 'gutenberg', 'id': '3urfvvm165iantk..."


In [7]:
###Clean the data by removing the version column from the dataset and creating a new dataframe
###with one row per question/answer pair, each attached to the story (paragraph) it belongs to

#Remove the version column from the dataset.
#Guarded so the cell can be re-run: an unconditional `del` raises KeyError once the column is gone.
if "version" in coqa.columns:
    del coqa["version"]

#required columns in our dataframe
cols = ["text","question","answer"]

#list of [story, question, answer] records to create our dataframe
comp_list = []
#Iterate over the "data" column directly; building a Series per row with iterrows() is not needed
for qa_entry in coqa["data"]:
    story_text = qa_entry["story"]
    answer_items = qa_entry["answers"]
    #questions and answers are paired by position, exactly as in the raw records
    for turn_idx, question_item in enumerate(qa_entry["questions"]):
        comp_list.append([
            story_text,
            question_item["input_text"],
            answer_items[turn_idx]["input_text"],
        ])

new_df = pd.DataFrame(comp_list, columns=cols)
#saving the dataframe to csv file for further loading
new_df.to_csv("CoQA_data.csv", index=False)


In [8]:
#Reload the flattened text/question/answer rows from the CSV file saved earlier in the notebook
clean_data = pd.read_csv("CoQA_data.csv")
#Preview the first rows
clean_data.head()

Unnamed: 0,text,question,answer
0,"The Vatican Apostolic Library (), more commonl...",When was the Vat formally opened?,It was formally established in 1475
1,"The Vatican Apostolic Library (), more commonl...",what is the library for?,research
2,"The Vatican Apostolic Library (), more commonl...",for what subjects?,"history, and law"
3,"The Vatican Apostolic Library (), more commonl...",and?,"philosophy, science and theology"
4,"The Vatican Apostolic Library (), more commonl...",what was started in 2014?,a project


In [9]:
#Report how many question/answer rows the flattened dataset contains
print("Number of question and answers: ", len(clean_data))

Number of question and answers:  108647


In [10]:
#Checkpoint shared by the model and its tokenizer (BERT-large, already fine-tuned on SQuAD)
MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

#return_dict=False makes the model return plain tuples, e.g. (start_scores, end_scores)
model = BertForQuestionAnswering.from_pretrained(MODEL_NAME, return_dict=False)
#return_dict is a model option and has no meaning for a tokenizer, so it is not passed here
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

Downloading: 100%|██████████| 443/443 [00:00<00:00, 221kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 1.34G/1.34G [02:02<00:00, 10.9MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 2.01MB/s]
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 13.8kB/s]


In [11]:
#Example question and answer pair for the imported model
random_num = np.random.randint(0,len(clean_data))
question = clean_data["question"][random_num]
text = clean_data["text"][random_num]
#Encode question and story as one pair. Only the story (second sequence) may be truncated, so the
#input never has more tokens than the model has position embeddings.
input_ids = tokenizer.encode(question, text, truncation="only_second",
                             max_length=model.config.max_position_embeddings)
print("The input has a total of {} tokens.".format(len(input_ids)))
tokens = tokenizer.convert_ids_to_tokens(input_ids)
#`token_id` rather than `id`, so the builtin id() is not overwritten for the rest of the session
for token, token_id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token,token_id))

The input has a total of 339 tokens.
[CLS]        101
does       2,515
woods      5,249
have       2,031
a          1,037
website    4,037
?          1,029
[SEP]        102
(          1,006
cnn       13,229
)          1,007
-          1,011
-          1,011
gill      12,267
##ette     7,585
said       2,056
saturday   5,095
it         2,009
was        2,001
"          1,000
limiting  14,879
"          1,000
golfer    20,601
tiger      6,816
woods      5,249
'          1,005
role       2,535
in         1,999
its        2,049
marketing   5,821
programs   3,454
to         2,000
give       2,507
him        2,032
the        1,996
privacy    9,394
he         2,002
needs      3,791
to         2,000
work       2,147
on         2,006
family     2,155
relationships   6,550
after      2,044
disclosure  19,380
##s        2,015
of         1,997
his        2,010
"          1,000
in         1,999
##fide    20,740
##lity    18,605
.          1,012
"          1,000
in         1,999
a          1,037
sta

In [12]:
# Locate the first `[SEP]` token in input_ids; it closes the question (segment A).
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment A runs up to and including that [SEP] token itself; segment B is everything after it.
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# Tag every position with its segment: 0 up to and including [SEP], 1 afterwards.
segment_ids = [0 if position <= sep_index else 1 for position in range(len(input_ids))]

# Sanity check: exactly one segment id per input token.
assert len(segment_ids) == len(input_ids)

In [13]:
# Run our example through the model.
# Inference only: no_grad() skips building the autograd graph for this forward pass.
# return_dict=False is passed explicitly because the tuple unpacking below relies on a tuple output.
with torch.no_grad():
    start_scores, end_scores = model(torch.tensor([input_ids]), # The tokens representing our input text.
                                     token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                                     return_dict=False)

In [14]:
# Find the tokens with the highest `start` and `end` scores.
answer_start = torch.argmax(start_scores)
answer_end = torch.argmax(end_scores)

# Combine the tokens in the answer and print it out.
# First start with the first token.
answer = tokens[answer_start]

# Select the remaining answer tokens and join them with whitespace.
for i in range(answer_start + 1, answer_end + 1):
    

    #Reverses wordpiece tokenization ==> If it's a subword token, then recombine it with the previous token.
    #Wordpiece tokenization ==> rare words get broken down into subword/pieces
    #                       ==> '##' used to delimit tokens that have been split
    if tokens[i][0:2] == '##':
        answer += tokens[i][2:]
    
    # Otherwise, add a space then the token.
    else:
        answer += ' ' + tokens[i]

print('Question: ' + question)
print('Answer: "' + answer + '"')

Question: Does Woods have a website?
Answer: "his web site friday"


In [15]:
@app.route('/processQuestion')
def question_answer():
    """Predict the answer to the notebook-level ``question`` from ``text`` with the BERT QA model.

    The question and text are encoded as one pair, run through ``model``, and the span between
    the highest-scoring start and end tokens is rebuilt into a string.

    Returns:
        str: the capitalized predicted answer, or a fallback message when no usable span is
        found. The same text is also printed.
    """
    # NOTE(review): the inputs are the module-level `question` and `text`, not values taken from
    # the incoming request -- confirm how the route is meant to receive them.

    #tokenize question and text as a pair; only the text may be truncated so the input
    #never has more tokens than the model has position embeddings
    input_ids = tokenizer.encode(question, text, truncation="only_second",
                                 max_length=model.config.max_position_embeddings)

    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    #segment IDs
    #first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    #number of tokens in segment A (question)
    num_seg_a = sep_idx+1
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a

    #list of 0s and 1s for segment embeddings
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    assert len(segment_ids) == len(input_ids)

    #model output using input_ids and segment_ids
    #return_dict=True is requested on the call because the model is loaded with
    #return_dict=False, and a plain tuple has no .start_logits / .end_logits attributes
    with torch.no_grad():
        output = model(torch.tensor([input_ids]),
                       token_type_ids=torch.tensor([segment_ids]),
                       return_dict=True)

    #reconstructing the answer
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    #default covers the case where the predicted end precedes the start
    #(previously `answer` was left unbound there)
    not_found = "Unable to find the answer to your question."
    answer = not_found
    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            #'##' marks a wordpiece continuation: glue it onto the previous token
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]

    #a span starting at [CLS] is treated as "no answer found"
    if answer.startswith("[CLS]"):
        answer = not_found

    result = answer.capitalize()
    print("\nPredicted answer:\n{}".format(result))
    #a Flask view must return a response; returning None makes Flask raise an error
    return result