In this notebook we perform ensembling on the prediction obtained from BERT, ALBERT, RoBERTa, DistilBERT, and BiDAF. 

After training, every model generates a file called. 'predictions'. Contents of this file include a predicted answer for every question. 
We predictions from all of the above models to create an ensemble based on majority voting.  

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json 
  
f = open('/content/drive/MyDrive/all_predictions/predictions_albert.json',) 
data_albert = json.load(f) 
f.close() 

g = open('/content/drive/MyDrive/all_predictions/predictions_bert.json',) 
data_bert = json.load(g) 
g.close() 

h = open('/content/drive/MyDrive/all_predictions/predictions_distilbert.json',) 
data_distilbert = json.load(h) 
h.close() 

i = open('/content/drive/MyDrive/all_predictions/predictions_roberta.json',) 
data_roberta = json.load(i) 
i.close() 

In [3]:
import pandas as pd

colnames=['ids', 'bidaf answers'] 
bidaf_df = pd.read_csv('/content/dev_submission.csv', names=colnames, header=None)
bidaf_df.shape

(11731, 2)

In [4]:
bidaf_df.head(12)

Unnamed: 0,ids,bidaf answers
0,Id,Predicted
1,56ddde6b9a695914005b9628,France
2,56ddde6b9a695914005b9629,10th and 11th centuries
3,56ddde6b9a695914005b962a,"Denmark, Iceland and Norway"
4,56ddde6b9a695914005b962b,Rollo
5,56ddde6b9a695914005b962c,10th
6,56dddf4066d3e219004dad5f,Norman
7,56dddf4066d3e219004dad60,Richard I
8,56dddf4066d3e219004dad61,Christian
9,56dde0379a695914005b9636,"Norseman, Viking"


In [5]:
import numpy as np
bidaf_df_fin = bidaf_df.replace(np.nan, '', regex=True)

In [6]:
data_bidaf = {}

for index, row in bidaf_df_fin.iterrows():
    data_bidaf[row['ids']] =  row['bidaf answers']

del data_bidaf['Id']    

In [7]:
ids = []
for key in data_albert:
    ids.append(key)

In [8]:
answers_albert = []
answers_bert = []
answers_distilbert = []
answers_roberta = []

for id in ids:
    answers_albert.append(data_albert[id])

for id in ids:
    answers_bert.append(data_bert[id])

for id in ids:
    answers_distilbert.append(data_distilbert[id])

for id in ids:
    answers_roberta.append(data_roberta[id])
    

In [9]:
bidaf_ids = []
for key in data_bidaf:
    bidaf_ids.append(key)

In [10]:
answers_bidaf = []
for id in bidaf_ids:
    answers_bidaf.append(data_bidaf[id])

In [11]:
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value not in lst2] 
    return lst3 

In [12]:
extra_ids = intersection(ids, bidaf_ids)

In [13]:
for id in extra_ids:
    answers_bidaf.append(data_roberta[id])

In [14]:
all_preds = [] 
for i in range(len(ids)):
    all_preds.append([ids[i], answers_albert[i], answers_bert[i], answers_distilbert[i], answers_roberta[i], answers_bidaf[i]])

In [15]:
all_preds[0]

['56ddde6b9a695914005b9628', 'France', 'France', 'France', 'France.', 'France']

In [16]:
import pandas as pd
df = pd.DataFrame(all_preds, columns = ['ids', 'albert answers', 'bert answers', 'distilbert answers', 'roberta answers', 'bidaf answers'])
df.head()

Unnamed: 0,ids,albert answers,bert answers,distilbert answers,roberta answers,bidaf answers
0,56ddde6b9a695914005b9628,France,France,France,France.,France
1,56ddde6b9a695914005b9629,10th and 11th centuries,10th and 11th centuries,10th and 11th centuries,10th and 11th centuries,10th and 11th centuries
2,56ddde6b9a695914005b962a,"Denmark, Iceland and Norway","Denmark, Iceland and Norway","Denmark, Iceland and Norway","Denmark, Iceland and Norway","Denmark, Iceland and Norway"
3,56ddde6b9a695914005b962b,,Rollo,Rollo,,Rollo
4,56ddde6b9a695914005b962c,10th,10th,10th,10th,10th


In [17]:
df.shape

(11873, 6)

In [18]:
from collections import Counter
import operator

fin_preds = []
for i in range(len(ids)):
    d = Counter(all_preds[i][1:])
    fin_preds.append([ids[i], max(d.items(), key=operator.itemgetter(1))[0]])

In [19]:
df_fin = pd.DataFrame(fin_preds, columns = ['ids', 'ensembled answers'])
df_fin.head()

Unnamed: 0,ids,ensembled answers
0,56ddde6b9a695914005b9628,France
1,56ddde6b9a695914005b9629,10th and 11th centuries
2,56ddde6b9a695914005b962a,"Denmark, Iceland and Norway"
3,56ddde6b9a695914005b962b,Rollo
4,56ddde6b9a695914005b962c,10th


Replace no answers with roberta answers


In [20]:
fin_preds_dict = {}

for l in fin_preds:
    fin_preds_dict[l[0]] = l[1]
    

In [21]:
for i in data_roberta:
    if data_roberta[i]=='':
        fin_preds_dict[i] = data_roberta[i]

In [22]:
processed_fin_preds = []

for key in fin_preds_dict.keys():
    processed_fin_preds.append([key, fin_preds_dict[key]])

In [23]:
processed_fin_preds[0]

['56ddde6b9a695914005b9628', 'France']

In [24]:
# !git clone https://github.com/huggingface/transformers \
# && cd transformers \
# && git checkout a3085020ed0d81d4903c50967687192e3101e770 

In [25]:
# !pip install ./transformers
# !pip install tensorboardX

In [26]:
!pip install transformers
!pip install tensorboardX

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 5.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 21.9MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 36.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=d0b6d99700b9e

In [27]:
import pickle
with open('/content/drive/MyDrive/all_predictions/examples.pickle', 'rb') as handle:
    examples = pickle.load(handle)

with open('/content/drive/MyDrive/all_predictions/predictions.pickle', 'rb') as handle:
    predictions = pickle.load(handle)

In [28]:
import collections
import json
import math
import re
import string

from transformers import BasicTokenizer

# from ...utils import logging

In [29]:
def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [30]:
def get_raw_scores(examples, preds):
    """
    Computes the exact and f1 scores from the examples and the model predictions
    """
    exact_scores = {}
    f1_scores = {}

    for example in examples:
        qas_id = example.qas_id
        gold_answers = [answer["text"] for answer in example.answers if normalize_answer(answer["text"])]

        if not gold_answers:
            # For unanswerable questions, only correct answer is empty string
            gold_answers = [""]

        if qas_id not in preds:
            print("Missing prediction for %s" % qas_id)
            continue

        prediction = preds[qas_id]
        exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers)
        f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers)

    return exact_scores, f1_scores


In [31]:
def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
    new_scores = {}
    for qid, s in scores.items():
        pred_na = na_probs[qid] > na_prob_thresh
        if pred_na:
            new_scores[qid] = float(not qid_to_has_ans[qid])
        else:
            new_scores[qid] = s
    return new_scores

In [32]:
def make_eval_dict(exact_scores, f1_scores, qid_list=None):
    if not qid_list:
        total = len(exact_scores)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores.values()) / total),
                ("f1", 100.0 * sum(f1_scores.values()) / total),
                ("total", total),
            ]
        )
    else:
        total = len(qid_list)
        return collections.OrderedDict(
            [
                ("exact", 100.0 * sum(exact_scores[k] for k in qid_list) / total),
                ("f1", 100.0 * sum(f1_scores[k] for k in qid_list) / total),
                ("total", total),
            ]
        )

In [33]:
def merge_eval(main_eval, new_eval, prefix):
    for k in new_eval:
        main_eval["%s_%s" % (prefix, k)] = new_eval[k]

In [34]:
def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for i, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]

    has_ans_score, has_ans_cnt = 0, 0
    for qid in qid_list:
        if not qid_to_has_ans[qid]:
            continue
        has_ans_cnt += 1

        if qid not in scores:
            continue
        has_ans_score += scores[qid]

    return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt


In [35]:
def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
    main_eval["best_exact"] = best_exact
    main_eval["best_exact_thresh"] = exact_thresh
    main_eval["best_f1"] = best_f1
    main_eval["best_f1_thresh"] = f1_thresh
    main_eval["has_ans_exact"] = has_ans_exact
    main_eval["has_ans_f1"] = has_ans_f1

In [36]:
def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
    num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
    cur_score = num_no_ans
    best_score = cur_score
    best_thresh = 0.0
    qid_list = sorted(na_probs, key=lambda k: na_probs[k])
    for _, qid in enumerate(qid_list):
        if qid not in scores:
            continue
        if qid_to_has_ans[qid]:
            diff = scores[qid]
        else:
            if preds[qid]:
                diff = -1
            else:
                diff = 0
        cur_score += diff
        if cur_score > best_score:
            best_score = cur_score
            best_thresh = na_probs[qid]
    return 100.0 * best_score / len(scores), best_thresh

In [37]:
def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
    best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
    best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)

    main_eval["best_exact"] = best_exact
    main_eval["best_exact_thresh"] = exact_thresh
    main_eval["best_f1"] = best_f1
    main_eval["best_f1_thresh"] = f1_thresh

In [38]:
def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0):
    qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples}
    has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer]
    no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer]

    if no_answer_probs is None:
        no_answer_probs = {k: 0.0 for k in preds}

    exact, f1 = get_raw_scores(examples, preds)

    exact_threshold = apply_no_ans_threshold(
        exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold
    )
    f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold)

    evaluation = make_eval_dict(exact_threshold, f1_threshold)

    if has_answer_qids:
        has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids)
        merge_eval(evaluation, has_ans_eval, "HasAns")

    if no_answer_qids:
        no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids)
        merge_eval(evaluation, no_ans_eval, "NoAns")

    if no_answer_probs:
        find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer)

    return evaluation

In [39]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

In [42]:
tuples_fin_preds = []
for l in processed_fin_preds:
    id, ans = l
    tuples_fin_preds.append((id, ans))


In [None]:
from collections import OrderedDict
dict_l = OrderedDict(tuples_fin_preds)
print(dict_l)

In [44]:
fin_res = squad_evaluate(examples, dict_l)

In [45]:
fin_res

OrderedDict([('exact', 80.3503747999663),
             ('f1', 82.93309783393421),
             ('total', 11873),
             ('HasAns_exact', 71.87921727395411),
             ('HasAns_f1', 77.0520699362856),
             ('HasAns_total', 5928),
             ('NoAns_exact', 88.7973086627418),
             ('NoAns_f1', 88.7973086627418),
             ('NoAns_total', 5945),
             ('best_exact', 80.3503747999663),
             ('best_exact_thresh', 0.0),
             ('best_f1', 82.93309783393411),
             ('best_f1_thresh', 0.0)])