In [41]:
from transformers import AutoTokenizer, RobertaTokenizer
import json
import numpy as np
import string
import eval_functions as evals
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import nltk
import pandas as pd

In [42]:
scores_data = []
with open('all_models_pred_small_dev.json') as f:
    scores_data = json.load(f)
    
multi_dataset = []
with open('multi_dataset.json') as f:
    multi_dataset = json.load(f)

In [43]:
def get_answers(sorted_i_j, tokens, k, space_token):
    """
    params: sorted_i_j (dict of (i, j) index pairs and their summed score (from get_score_dict))
            tokens (list of tokens from question and passage)
            k (number of answers to be returned (num of true_answers))
    returns: predicted answers from the model
    """
    counter = 0
    answers_indices = [] # i, j pairs of answers
    answers = [] # string answers 
    while len(answers) < k and counter < len(sorted_i_j):
        pair = sorted_i_j[counter]
        if pair[1] >= pair[0] and pair[0] != 0: #end token after start token
            if counter == 0: # first i,j
                answer = tokens[pair[0]] #first token (i)
                for i in range(pair[0] + 1, pair[1] + 1):
                    if space_token == '##': #for bert
                        if tokens[i][0:2] == space_token:
                            answer += tokens[i]
                        else: answer += ' ' + tokens[i]
                    elif tokens[i][0:1] != space_token:
                        answer += tokens[i]
                    else:
                        answer += ' ' + tokens[i]
                answers_indices.append(sorted_i_j[counter])
                answer = answer.replace(space_token, '')
                answers.append(answer)
            elif (pair[0] > sorted_i_j[counter-1][1]) or (sorted_i_j[counter-1][0] > pair[1]):
                for old_pair in sorted_i_j[:counter]:
                    if pair[0] not in range(old_pair[0], old_pair[1]):
                        add_answer = True
                    else:
                        add_answer = False
                        break
                # ^start token of current is after end token of prev; end token of current is before start of prev
                if add_answer == True:
                    answer = tokens[pair[0]]
                    for i in range(pair[0] + 1, pair[1] + 1):
                        if space_token == '##':
                            if tokens[i][0:2] == space_token:
                                answer += tokens[i]
                            else: answer += ' ' + tokens[i]
                        elif tokens[i][0:1] != space_token:
                            answer += tokens[i]
                        else:
                            answer += ' ' + tokens[i]
                    answers_indices.append(sorted_i_j[counter])
                    answer = answer.replace(space_token, '')
                    answers.append(answer)
        counter += 1
    if len(answers_indices) < k:
        while len(answers_indices) < k:
            answers_indices.append((-1, -1))
    for i in answers_indices:
        if i == (-1, -1):
            answers.append('')
    return answers

def get_score_dict(start_scores, end_scores):
    '''    
    params: start and end score arrays
    returns: dictionary of (i, j) pairs sorted by descending order of sum of scores
    i = start score index
    j = end score index
    score = sum of start_scores[i] and end_scores[j]
    '''
    start_scores_copy = start_scores.copy()
    end_scores_copy = end_scores.copy()
    i_j_scores = []
    for x in range(len(start_scores)):
        i = np.argmax(start_scores_copy)
        start_scores_copy[i] = 0
        j = np.argmax(end_scores_copy)
        end_scores_copy[j] = 0
        i_j_scores.append((i, j))
    return i_j_scores

In [44]:
def exact_match(true_answers, pred_answers):
    count = 0
    true_answers_lower = [answer.lower() for answer in true_answers]
    pred_answers_lower = [answer.lower() for answer in pred_answers]
    for answer in pred_answers_lower:
        if answer in true_answers_lower:
            count += 1
        if answer == '' and 'noAnswer' in true_answers:
            count += 1
    em = count / len(true_answers)
    return em

In [45]:
def get_true_pred_arrays(true_answers, pred_answers, passage, model_name):
    """
    params: true_answers: list of true answers
            pred_answers: list of predicted answers
            passage: original passage string
            model_name: 'bert-base', 'roberta-large', etc.
    returns: true_array: array of 1s and 0s, same length as len(passage)
                         1 = character is in a true answer
                         0 = character is not in a true answer
             pred_array: same as true_array, but for pred_answers
    """
    true_array = np.zeros(len(passage))
    for answer in true_answers:
        true_array = set_ones(answer, passage, true_array, model_name)

    pred_array = np.zeros(len(passage))
    for answer in pred_answers:
        if answer != '':
            pred_array = set_ones(answer, passage, pred_array, model_name)
    return true_array, pred_array

In [46]:
def set_ones(answer, passage, array, model_name):
    """
    sets the ones (answer chars) in the array of zeros (non-answer chars)
    """
    if ('( ' in answer or ' )' in answer) or (' ,' in answer or ' .' in answer):
        answer = answer.replace('( ', '(')
        answer = answer.replace(' )', ')')
        answer = answer.replace(' ,', ',')
        answer = answer.replace(' .', '.')
        
    if model_name == 'roberta-base' or model_name == 'roberta-large':
        start_id = passage.find(answer)
    else:
        start_id = passage.lower().find(answer.lower())
    
    if start_id + len(answer) >= len(passage):
        for index in range(start_id, start_id + len(answer) - 1):
            array[index] = 1
    else:
        for index in range(start_id, start_id + len(answer)):
            array[index] = 1
        
    return array

In [47]:
def avg_bleu_score(true_answers, pred_answers, model_name):
    """
    returns the blue score for each question set
    """
    total = 0
    references = []
    if model_name == 'roberta-base' or model_name == 'roberta-large':
        for true_answer in true_answers:
            references.append(true_answer.split())
    else:
        for true_answer in true_answers:
            references.append([word.lower() for word in true_answer.split()])
    for pred_answer in pred_answers: # calc bleu for each pred_answer
        hypothesis = pred_answer.split()
        curr_score = nltk.bleu_score.sentence_bleu(references, hypothesis, weights=[1.])
        total += curr_score

    return total/len(pred_answers)


In [54]:
model_names = [('twmkn9/bert-base-uncased-squad2', 'bert-base')]
""", ('deepset/bert-large-uncased-whole-word-masking-squad2', 'bert-large'), 
               ('deepset/roberta-base-squad2', 'roberta-base'), ("ahotrod/roberta_large_squad2", 'roberta-large'), 
               ('twmkn9/albert-base-v2-squad2', 'albert-base'), ('ktrapeznikov/albert-xlarge-v2-squad-v2', 'albert-xlarge')]
"""
               
for model_name in model_names:
    print(model_name[1])
    #get tokenizers
    if model_name[0] == 'deepset/roberta-base-squad2':
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        space_token = 'Ġ'
    elif model_name[0] == "ahotrod/roberta_large_squad2":
        tokenizer = RobertaTokenizer.from_pretrained('roberta-large')
        space_token = 'Ġ'
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name[0])
        if model_name[1] == 'bert-base' or model_name[1] == 'bert-large':
            space_token = '##'
        else:
            space_token = '▁'
            
    # reset averages
    avg_exact_match, avg_f1_micro, avg_f1_c1, avg_prec_micro, avg_prec_c1, avg_recall_micro, avg_recall_c1, avg_bleu = 0, 0, 0, 0, 0, 0, 0, 0
    tn_sum, fp_sum, fn_sum, tp_sum = 0, 0, 0, 0
    
    # get answers
    for score_set in scores_data[model_name[1]]: # each model
        # replace w multi
        for triplet in multi_dataset['multi_dataset']: # match to original dataset
            if triplet['question_id'] == score_set['question_id']:
                question = triplet['question']
                passage = triplet['passage']
        tokens = tokenizer.convert_ids_to_tokens(tokenizer(question, passage)['input_ids'])
        sorted_i_j = get_score_dict(score_set['start_scores'], score_set['end_scores'])
        true_answers = score_set['true_answers']
        pred_answers = get_answers(sorted_i_j, tokens, len(score_set['true_answers']), space_token)

        # get metrics
        true_array, pred_array = get_true_pred_arrays(true_answers, pred_answers, passage, model_name[1])
        
        tn, fp, fn, tp = confusion_matrix(true_array, pred_array).ravel()
        
        e_m = exact_match(true_answers, pred_answers)
        avg_exact_match += e_m
        
        tn_sum += tn
        fp_sum += fp
        fn_sum += fn
        tp_sum += tp

        prec_c1 = precision_score(true_array, pred_array, average='binary', pos_label=1.)
        avg_prec_c1 += prec_c1
        
        recall_c1 = recall_score(true_array, pred_array, average='binary', pos_label=1.)
        avg_recall_c1 += recall_c1
        
        f1_c1 = f1_score(true_array, pred_array, average='binary', pos_label=1.)
        avg_f1_c1 += f1_c1

        bleu = avg_bleu_score(true_answers, pred_answers, model_name[1])
        avg_bleu += bleu
        
    avg_prec_micro = tp_sum / (tp_sum + fp_sum)
    avg_recall_micro = tp_sum / (tp_sum + fn_sum)
    avg_f1_micro = 2 * ((avg_prec_micro*avg_recall_micro) / (avg_prec_micro + avg_recall_micro))
            
    print('avg_em: {}'.format(avg_exact_match/len(scores_data[model_name[1]])))
    print('avg_f1_micro: {}'.format(avg_f1_micro))
    print('avg_f1_c1: {}'.format(avg_f1_c1/len(scores_data[model_name[1]])))
    print('avg_prec_micro: {}'.format(avg_prec_micro))
    print('avg_prec_c1: {}'.format(avg_prec_c1/len(scores_data[model_name[1]])))
    print('avg_recall_micro: {}'.format(avg_recall_micro))
    print('avg_recall_c1: {}'.format(avg_recall_c1/len(scores_data[model_name[1]])))
    print('avg_bleu: {}'.format(avg_bleu/len(scores_data[model_name[1]])))
    print(' ')


# metrics_df = pd.DataFrame.from_dict(metrics_data, orient='index')
# display(metrics_df)

    
        

bert-base
avg_em: 0.25
avg_f1_micro: 0.4752475247524752
avg_f1_c1: 0.47342342342342336
avg_prec_micro: 0.7741935483870968
avg_prec_c1: 0.8205128205128206
avg_recall_micro: 0.34285714285714286
avg_recall_c1: 0.39439534231200896
avg_bleu: 0.3333333333333333
 


In [51]:
data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
df = pd.DataFrame.from_dict(data, orient='index', columns=[1, 3, 4, 5])
display(df)

Unnamed: 0,1,3,4,5
row_1,3,2,1,0
row_2,a,b,c,d


In [338]:
# model --> metric --> answer

\begin{tabular}{lllll}
\toprule
{} &  0 &  1 &  2 &  3 \\
\midrule
row\_1 &  3 &  2 &  1 &  0 \\
row\_2 &  a &  b &  c &  d \\
\bottomrule
\end{tabular}

