In [150]:
from nltk import word_tokenize
import pandas as pd
import re
data = pd.read_csv('Prediction_References2.csv', index_col = 0)
# data = data.loc[:, ~data.columns.str.contains('^Unnamed')]  #drop unnamed columns

data.head()

Unnamed: 0,question,predictions,references,context
0,What are the reopening rules for bars and rest...,Bars and restaurants may currently open for dr...,[' The reopening of indoor dining spaces has b...,Bars and restaurants may currently open for dr...
1,What are the reopening rules for bars and rest...,Bars and restaurants may currently open for dr...,[' Restaurants offering service in outdoor are...,Bars and restaurants may currently open for dr...
2,What are the reopening rules for bars and rest...,Bars and restaurants may currently open for dr...,[' Restaurants may offer in-person dining serv...,Bars and restaurants may currently open for dr...
3,What are the required safety protocols or soci...,If feasible and consistent with social distan...,[' Comply with state and CDC guidelines to pro...,Elective surgeries are permitted to resume in ...
4,What are the required safety protocols or soci...,If feasible and consistent with social distan...,"["" Utilizing telemedicine to the greatest exte...",Elective surgeries are permitted to resume in ...


## Preprocessing

In [151]:
def remove_brackets(column):
    column = re.sub(r'\[\'\s*|\[\"\s*',"", column)  #front brackets
    return re.sub(r'\s*\'\]|\"\]',"", column) #end brackets


def remove_whitespace(column): #remove any trailing whitespace
    if type(column) == str:
        return column.strip()
    
    
data['references'] = data['references'].apply(remove_brackets)    
data['predictions'] = data['predictions'].apply(remove_whitespace)
data['predictions'].fillna(0,inplace = True)



## Data Averages

Average Context Token Ct:  462 <br>
Average Reference Token Ct:  66 <br>
Average Prediction Token Ct:  69 <br>
Average Question Token Ct:  10 <br>


In [152]:
import re
def pattern_search(pattern,text):
    correct_list= []
    if re.search(pattern,text):
        match = re.search(pattern,text)
        answer_start = match.start()
        correct_list.append(answer_start)
    elif re.search(re.escape(pattern), text):
        match = re.search(re.escape(pattern),text)
        answer_start = match.start()    #get answer_start
        correct_list.append(answer_start)    #append to a list
    else:
        correct_list.append('NaN1')  #couldn't find a match
    return correct_list
    

def create_answer_starts(question,reference,predictions,context):
    correct_predictions = []
    no_prediction = []
    incorrect_prediction = []
    prediction_idx = []
    reference_idx = []
    for index,row in enumerate(question):
        pattern = reference[index]  
        text = context[index]  
        if predictions[index] == reference[index]:  #if the prediction is the same as the reference(gt), 
            correct_predictions.extend(pattern_search(pattern,text)) #find where the answer starts in the context
        if predictions[index] ==0:
            no_prediction.extend(pattern_search(pattern,text))    #find where the answer starts in the context
        if predictions[index] != reference[index] and predictions[index] != 0: #if the prediction is neither 0 nor the right one
            incorrect_prediction.extend(pattern_search(pattern,text))
        if predictions[index] != reference[index]:
            pred_pattern = predictions[index]
            if type(pred_pattern) == str:
                reference_idx.extend(pattern_search(pattern,text))  #create an index of references
                prediction_idx.extend(pattern_search(pred_pattern,text)) #create an index of predictions
            else:
                reference_idx.append('Null')  #create an index of references
                prediction_idx.append('Null') #create an index of predictions
        else: 
                reference_idx.append('correct')  #create an index of references
                prediction_idx.append('correct') #create an index of predictions
    return correct_predictions, no_prediction, incorrect_prediction, prediction_idx, reference_idx

def average_answer_start(alist):
    num_list = [num for num in alist if type(num) == int]
    return sum(num_list) /len(num_list)

                

correct_prediction, no_prediction, incorrect_prediction,prediction_idx, reference_idx = create_answer_starts(data['question'],data['references'],data['predictions'],data['context'])


print("Average Answer Start for Correct Predictions: ", average_answer_start(correct_prediction))
print()
print("Average Answer Start for NO Predictions: ", average_answer_start(no_prediction))
print()
print("Average Answer Start for Incorrect Predictions: ", average_answer_start(incorrect_prediction))

ct = 0
for i in prediction_idx:
    if type(i) == int:
        ct+=1

zipped = list(zip(reference_idx, prediction_idx))
ct =0
for values in zipped:
    if values[1] == 0: ct+=1
print(ct, " Incorrect predictions with answer starting at index 0")

dic = {}
dic1 = {}
for index, value in enumerate(zipped):
    if type(value[0]) == int and type(value[1]) == int:
        dic1[index] = value
        if value[0] > value[1]:
            dic[index] = value

print(len(dic1), "incorrect predictions") #73 incorrect predictions
print(len(dic),"/",len(dic1), "incorrect predictions had a reference with a higher “answer_start” index than the prediction provided.")

dic3 = {}
for key in dic1:
    if len(data['references'][key]) > len(data['predictions'][key]):
        dic3[key]= dic1[key]
print(len(dic3),"Instances in which the prediction was incorrect and the reference was longer than the prediction")


Average Answer Start for Correct Predictions:  244.25

Average Answer Start for NO Predictions:  1132.55

Average Answer Start for Incorrect Predictions:  666.9736842105264
23  Incorrect predictions with answer starting at index 0
73 incorrect predictions
47 / 73 incorrect predictions had a reference with a higher “answer_start” index than the prediction provided.
35 Instances in which the prediction was incorrect and the reference was longer than the prediction


In [153]:
#rough estimate of what question types we are working with
what_lst = []
is_lst = []
are_lst = []
when_lst = []
where_lst = []
how_lst = []
if_lst = []


for question in data['question']:
    if 'What' in question:
        what_lst.append(question)
    if 'Is' in question:
        is_lst.append(question)
    if 'Are' in question:
        are_lst.append(question)
    if 'When' in question:
        when_lst.append(question)
    if 'Where' in question:
        where_lst.append(question)
    if 'How' in question:
        how_lst.append(question)
    if 'If' in question:
        if_lst.append(question)
        
print(len(what_lst), "WHAT - questions")
print(len(how_lst), "HOW - questions")
print(len(are_lst), "ARE - questions")
print(len(is_lst), "IS - questions")
print(len(where_lst), "WHERE - questions")
print(len(when_lst), "WHEN - questions")
print(len(if_lst), "IF - questions")


76 WHAT - questions
11 HOW - questions
10 ARE - questions
8 IS - questions
4 WHERE - questions
1 WHEN - questions
0 IF - questions


In [154]:
context_length = []
question_length = []
references_length = []
predictions_length = []


def average_tokens(series, alist):
    for string in series:
        if type(string) == str:
            tkns = word_tokenize(string)
            alist.append(len(tkns))
    average = (sum(alist) / len(alist))

    return round(average)

print("Average Context Token Ct: ", average_tokens(data.context, context_length))
print("Average Question Token Ct: ", average_tokens(data.question, question_length))
print("Average Reference Token Ct: ", average_tokens(data.references, references_length))
print("Average Prediction Token Ct: ", average_tokens(data.predictions, predictions_length))



Average Context Token Ct:  462
Average Question Token Ct:  10
Average Reference Token Ct:  66
Average Prediction Token Ct:  69


## 119 incorrect predictions
 
Number of Incorrect Predictions 119 <br>
Average Token Ct of Context 476 <br>
Average Token Ct of References 73 <br>
Average Token Ct of Questions 10 <br>
Number of Incorrect Predictions with context > 1024 = 5


In [163]:
import nltk 
from nltk import word_tokenize

def average_tokens(string, alist):
    tkns = word_tokenize(string)
    alist.append(len(tkns))
    average = (sum(alist) / len(alist))
    return len(tkns), average


ct = 0  # of incorrect predictions
avg_context = []
avg_reference = []
avg_question = []
data_length = len(data)
for row in range(data_length):
    if data['predictions'][row] != data['references'][row]:
        ct+=1
#         print(data['question'][row])
        context_tokens, avg_cntx = average_tokens(data.loc[row,'context'], avg_context)
        reference_tokens, avg_ref = average_tokens(data.loc[row,'references'], avg_reference)
        question_tokens, avg_quest = average_tokens(data.loc[row,'question'], avg_question)
        
#         print("Length of Context:", (context_tokens))
#         print("Length of Reference (correct answer):",(reference_tokens))
#         if len(context_tokens) > 1024:
#             print("Context Longer than 1024")
        
#     print(" ##############")
    
print("Number of Incorrect Predictions",ct)
print("Average Token Ct of Context", round(avg_cntx))
print("Average Token Ct of References",round(avg_ref))
print("Average Token Ct of Questions",round(avg_quest))

                                    
    


Number of Incorrect Predictions 119
Average Token Ct of Context 476
Average Token Ct of References 73
Average Token Ct of Questions 10


## 40 Null Predictions

Average Token Ct of Context: 460  (only 25 contexts here were above average token ct) <br>
Average Token Ct of References: 73 <br>
Average Token Ct of Questions: 8


In [156]:

ct = 0  # of NULL Answers

avg_context = []
avg_references = []
avg_question = []

for row in range(data_length):
    if data['predictions'][row] == 0:
        ct+=1
        context_tokens, avg_cntx = average_tokens(data.loc[row,'context'], avg_context)
        reference_tokens, avg_ref = average_tokens(data.loc[row,'references'], avg_reference)
        question_tokens, avg_quest = average_tokens(data.loc[row,'question'], avg_question)

#         print(data['question'][row])
#         print("Length of Context:",(context_tokens))
#         print("Length of Reference (correct answer):",(reference_tokens))
# #         if len(context_tokens) > 1024:
# #             print("Context Longer than 1024")
#         print(" ##############")
    
    
print("Number of Null Answers", ct)
print("Average Token Ct of Context", round(avg_cntx))
print("Average Token Ct of References",round(avg_ref))
print("Average Token Ct of Questions",round(avg_quest))


ct=0
for tk_count in avg_context:
    if tk_count >= avg_cntx:
        ct+=1

print("Only ", ct, "contexts were above average length")


Number of Null Answers 40
Average Token Ct of Context 460
Average Token Ct of References 62
Average Token Ct of Questions 8
Only  25 contexts were above average length



## 20 Correct Responses

Average Token Ct of Context: 377 <br>
Average Token Ct of References: 24 <br>
Average Token Ct of Questions: 11 <br>

In [157]:
ct = 0

avg_context = []
avg_references = []
avg_question = []

for row in range(data_length):
    if data['predictions'][row] == data['references'][row]:
        ct+=1
#         print(data['question'][row])
#         print("prediction: ", data['predictions'][row], "reference: ",data['references'][row])
        context_tokens = word_tokenize(data['context'][row])
#         print(len(context_tokens))
        if len(context_tokens) > 1024:
            print("Context Longer than 1024")
        context_tokens, avg_cntx = average_tokens(data.loc[row,'context'], avg_context)
        reference_tokens, avg_ref = average_tokens(data.loc[row,'references'], avg_reference)
        question_tokens, avg_quest = average_tokens(data.loc[row,'question'], avg_question)

#         print(" ##############")


    
print("Number of Correct Answers", ct)
print("Average Token Ct of Context", round(avg_cntx))
print("Average Token Ct of References",round(avg_ref))
print("Average Token Ct of Questions",round(avg_quest))


Context Longer than 1024
Number of Correct Answers 20
Average Token Ct of Context 377
Average Token Ct of References 58
Average Token Ct of Questions 11


## 4 Yes/No questions in the data

In [158]:
ct = 0
for reference in data.references:
    if reference == "Yes" or reference == "No":
        ct+=1
print(ct)

4


## 4 Yes/No predictions in the data

In [159]:
ct = 0
for prediction in data.predictions:
    if prediction  == "Yes" or prediction == "No":
        ct+=1
print(ct)

4


## 58 Duplicate Questions (with different answers)

In [160]:
duplicateRowsDF = data['question'][data['question'].duplicated()]
drl = len(duplicateRowsDF)

print("There are:", drl ," Instances of Repeating Questions")

There are: 58  Instances of Repeating Questions


## Fuzzy Matching

In [161]:
from fuzzywuzzy import fuzz 

ratio_list = []
for row in range(len(data)):
    if data['predictions'][row] != 0:
        str1 = data['predictions'][row]
        str2 = data['references'][row]
        Ratio = fuzz.ratio(str1.lower(),str2.lower())
        ratio_list.append(Ratio)
    else:
        ratio_list.append('NaN')
        
ct = 0
for index, item in enumerate(ratio_list):
    if item != 'NaN' and item >= 90:       #take all the predictions that are 90 or higher
#         print(data['predictions'][index])
#         print()
#         print(data['references'][index])
#         print('@@@@@@@@@@@@@@@@@@@@@@@@')
        ct+=1

In [162]:
print("With Fuzzy Matching there are", ct, "correct predictions")

With Fuzzy Matching there are 24 correct predictions
