In [None]:
# Copyright 2024, Nikolaos Panagiotou, npan1990@gmail.com
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#     http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import json

import transformers
import datasets
import pandas as pd
import numpy as np
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Evaluate SQUAD

This is a notebook that helps the evaluation of the SQUAD dev set using custom fine tuned approaches.
This is a simplistic implementation of F1 and EM scores and can not be directly comparred to the original scores.

In [2]:
DEV_FILE = '../data/squad/dev-v2.0.json'

In [3]:
devset_d = json.load(open(DEV_FILE))

In [4]:
devset_df = pd.DataFrame(devset_d['data'])
devset_df['paragraphs_number'] = devset_df['paragraphs'].str.len()
devset_df['doc_id'] = range(1,devset_df.shape[0]+1)
paragraphs_devset_df = devset_df.explode(['paragraphs'])
paragraphs_devset_df['paragraph_id'] = range(1,paragraphs_devset_df.shape[0]+1)
paragraphs_devset_df['qa'] = paragraphs_devset_df.apply(lambda x: x['paragraphs']['qas'], axis=1)
paragraphs_devset_df['context'] = paragraphs_devset_df.apply(lambda x: x['paragraphs']['context'], axis=1)
paragraphs_devset_df['number_qa'] = paragraphs_devset_df['qa'].str.len()

q_devset_df = paragraphs_devset_df.explode(['qa'])
q_devset_df['question'] = q_devset_df.apply(lambda x: x['qa']['question'], axis=1)
q_devset_df['answers'] = q_devset_df.apply(lambda x: x['qa']['answers'], axis=1)
q_devset_df['answers_number'] = q_devset_df['answers'].str.len()

qa_devset_df = q_devset_df.explode(['answers'])
positive_qa_devset_df = qa_devset_df[~qa_devset_df['answers'].isnull()].copy()
positive_qa_devset_df['answer'] = positive_qa_devset_df.apply(lambda x: x['answers']['text'], axis=1)

In [37]:
# For custom model
checkpoint = 'npan1990/squad-distil-bert'
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to('cuda:0')
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [6]:
# for pretrained model
checkpoint = 'distilbert/distilbert-base-cased-distilled-squad'
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to('cuda:0')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [13]:
# for distilbert model
checkpoint = 'distilbert-base-uncased'
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to('cuda:0')
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer, device='cuda:0')

In [15]:
questions = []
for index,row in positive_qa_devset_df.iterrows():
    questions.append(
        {'context': row['context'],
         'question': row['question']})

In [16]:
answers = question_answerer(questions)

In [17]:
positive_qa_devset_df['predicted_answer'] = [answer['answer'] for answer in answers]

In [18]:
def compute_all_metrics(predicted_answers, correct_answers):
    f1s = []
    ems = []
    for i in range(len(predicted_answers)):
        predicted_answer = predicted_answers[i]
        correct_answer = correct_answers[i]
        tokenized_predicted_answer = predicted_answer.split(r'\s+')
        tokenized_answer = correct_answer.split(r'\s+')

        predicted_answer_s = set(predicted_answer)
        correct_answer_s = set(correct_answer)

        tp = float(len(predicted_answer_s & correct_answer_s))
        fp = float(len(predicted_answer_s - correct_answer_s))
        fn = float(len(correct_answer_s - predicted_answer_s))

        try:
            pr = tp / (tp+fp)
            rc = tp / len(correct_answer_s)
            f1 = 2*pr*rc/(pr+rc)
        except ZeroDivisionError:
            f1 = 0.0

        f1s.append(f1)

        if predicted_answer == correct_answer:
            ems.append(1)
        else:
            ems.append(0)

    return np.average(f1s), np.average(ems)

In [19]:
compute_all_metrics(positive_qa_devset_df['predicted_answer'].tolist(),positive_qa_devset_df['answer'].tolist())

(0.38472900782734726, 0.00014776869273963155)