In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
import numpy as np
import json


In [None]:
def get_scores(question, answer_text):
    """Score every input token as a possible answer start / answer end.

    Reads the module-level `tokenizer` and `model`, which must be bound
    before this function is called.

    Args:
        question: Question string, encoded as the first segment.
        answer_text: Passage string, encoded as the second segment.

    Returns:
        A `(tokens, start_scores, end_scores)` tuple: the token strings of the
        encoded pair and two 1-D numpy arrays holding one score per token.
        Position 0 of both score arrays is overwritten with 0.
    """
    input_ids = tokenizer.encode(question, answer_text)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Everything up to and including the first separator token is segment A;
    # the remainder is segment B.
    sep_index = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_index + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # Models configured with a single token type (RoBERTa-style checkpoints)
    # only have an embedding for id 0, so a segment id of 1 would index past
    # the end of the token-type embedding table.
    model_config = getattr(model, "config", None)
    if getattr(model_config, "type_vocab_size", 2) < 2:
        segment_ids = [0] * len(input_ids)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids]))

    # Index by position instead of unpacking: this works both for a plain
    # tuple and for a ModelOutput (iterating a ModelOutput yields its field
    # names, not the tensors).
    start_scores = outputs[0].detach().numpy()[0]
    end_scores = outputs[1].detach().numpy()[0]

    # Neutralise the first position (presumably the [CLS]/"no answer" slot --
    # confirm against the checkpoints in use).
    start_scores[0] = 0
    end_scores[0] = 0
    return tokens, start_scores, end_scores

In [None]:
def get_score_dict(start_scores, end_scores):
    '''
    Pair start and end positions rank-by-rank and score each pair.

    The highest-scoring start index is paired with the highest-scoring end
    index, the second highest with the second highest, and so on.

    i = start score index
    j = end score index
    score = sum of start_scores[i] and end_scores[j]

    Args:
        start_scores: 1-D sequence of per-token start scores.
        end_scores: 1-D sequence of per-token end scores.

    Returns:
        Dict mapping (i, j) -> score, inserted in descending rank order.
        The inputs are not modified.
    '''
    start_arr = np.asarray(start_scores)
    end_arr = np.asarray(end_scores)

    # Rank with a stable descending sort rather than repeatedly taking argmax
    # and overwriting the winner with 0: scores can be negative, in which case
    # a zeroed entry would keep winning argmax and the ranking would
    # degenerate into repeated picks of already-consumed positions.
    # A stable sort keeps argmax's tie-break (lowest index first).
    start_order = np.argsort(-start_arr, kind="stable")
    end_order = np.argsort(-end_arr, kind="stable")

    i_j_scores = {}
    for i, j in zip(start_order, end_order):
        i_j_scores[(int(i), int(j))] = start_arr[i] + end_arr[j]
    return i_j_scores

In [None]:
def _join_span_tokens(tokens, start, end):
    """Concatenate tokens[start] .. tokens[end] (inclusive) into one string."""
    text = tokens[start]  # first token (i) is taken as-is
    for idx in range(start + 1, end + 1):
        piece = tokens[idx]
        # Tokens beginning with the '▁' marker are preceded by a space (the
        # marker itself is kept); all other tokens are glued on directly.
        if piece[0:1] != '▁':
            text += piece
        else:
            text += ' ' + piece
    return text


def get_answers(sorted_i_j, tokens, top_k=None):
    """Turn ranked (start, end) index pairs into a fixed-length list of answers.

    Candidates are visited in the given order. A candidate is kept when its
    end index is not before its start index and, unless it is the very first
    entry, its span does not overlap the span of the entry immediately before
    it in `sorted_i_j` (only that neighbour is checked, not every answer
    accepted so far).

    Args:
        sorted_i_j: Sequence of (start_index, end_index) pairs, best first.
        tokens: Token strings that the indices refer to.
        top_k: Number of answers to return. When omitted, the module-level
            `k` is used, which must then be defined.

    Returns:
        List of `top_k` strings; trailing entries are '' when fewer than
        `top_k` candidates were accepted.
    """
    limit = k if top_k is None else top_k

    answers = []  # string answers
    for position, pair in enumerate(sorted_i_j):
        if len(answers) >= limit:
            break
        start, end = pair[0], pair[1]
        if end < start:
            # end token before start token: not a valid span
            continue
        if position > 0:
            previous = sorted_i_j[position - 1]
            # Keep the span only if it starts at/after the previous span's end
            # or ends at/before the previous span's start.
            if not (start >= previous[1] or previous[0] >= end):
                continue
        answers.append(_join_span_tokens(tokens, start, end))

    # Pad with empty answers so the caller always receives `limit` entries.
    answers.extend([''] * (limit - len(answers)))
    return answers


In [None]:
# Read the evaluation dataset from disk; later cells iterate over
# data['multi_dataset'].
data = []
import json
with open('multi_dataset.json') as f:
    data = json.loads(f.read())

# Per-model predictions are collected in this dict.
all_models_results = dict()

# Checkpoints to evaluate (short label: Hugging Face model id):
#   1. bert-base:     twmkn9/bert-base-uncased-squad2
#   2. bert-large:    deepset/bert-large-uncased-whole-word-masking-squad2
#   3. roberta-base:  deepset/roberta-base-squad2
#   4. roberta-large: ahotrod/roberta_large_squad2
#   5. albert-base:   twmkn9/albert-base-v2-squad2
#   6. albert-xlarge: ktrapeznikov/albert-xlarge-v2-squad-v2

model_names = [
    'twmkn9/bert-base-uncased-squad2',
    'deepset/bert-large-uncased-whole-word-masking-squad2',
    'deepset/roberta-base-squad2',
    'ahotrod/roberta_large_squad2',
    'twmkn9/albert-base-v2-squad2',
    'ktrapeznikov/albert-xlarge-v2-squad-v2',
]


In [None]:
# Short label under which each checkpoint's predictions are stored.
model_labels = {
    'twmkn9/bert-base-uncased-squad2': 'bert-base',
    'deepset/bert-large-uncased-whole-word-masking-squad2': 'bert-large',
    'deepset/roberta-base-squad2': 'roberta-base',
    'ahotrod/roberta_large_squad2': 'roberta-large',
    'twmkn9/albert-base-v2-squad2': 'albert-base',
    'ktrapeznikov/albert-xlarge-v2-squad-v2': 'albert-xlarge',
}

for model_name in model_names:
    model_results = []

    # `tokenizer` and `model` are intentionally module-level names:
    # get_scores() reads them as globals.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    for triplet in data['multi_dataset']:
        question = triplet['question']
        answer_text = triplet['passage']
        tokens, start_scores, end_scores = get_scores(question, answer_text)

        # Only the (i, j) keys are needed, in the dict's insertion order.
        sorted_i_j = list(get_score_dict(start_scores, end_scores))
        answers = get_answers(sorted_i_j, tokens)

        id_true_pred = {
            'question_id': triplet['question_id'],
            'true_answers': triplet['answers'],
            'predicted_answers': answers,
        }
        model_results.append(id_true_pred)

    # Store under the dict that the notebook actually defines
    # (`all_models_results`); the previous spelling `all_model_results`
    # was never assigned and raised NameError. Checkpoints without a short
    # label keep their results under the full model name instead of being
    # silently dropped.
    all_models_results[model_labels.get(model_name, model_name)] = model_results

In [None]:
# Persist the per-model predictions (not the input dataset, which is what
# `data` holds) to the predictions file.
with open('all_models_predictions.json', 'w') as f:
    json.dump(all_models_results, f, indent=4)