In [1]:
import numpy as np
import pandas as pd
import nltk
import os
import json
import transformers
import torch
import random
from torch import cuda
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, AlbertTokenizer, DebertaV2Tokenizer, DistilBertTokenizer, ElectraTokenizer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
CUDA_LAUNCH_BLOCKING=1

In [3]:
random_seed = 42
random.seed(random_seed)

# Set the random seed for NumPy (if used)
np.random.seed(random_seed)

# Set the random seed for PyTorch
torch.manual_seed(random_seed)

# Additional steps if using CUDA (GPU)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [4]:
def index_converter(context, context_tokenized):
    """
    Maps start str index to a tokenized index
    """
    mapper = {}
    curr = ''
    token_idx = 0
    for i, char in enumerate(context):
        if char != ' ' and char != '\n' and char != '\t' and char != '\r': # making sure current char is not whitespace
            curr += char
            if curr == context_tokenized[token_idx]:
                start = i - len(curr) + 1
                for j in range(start, i+1):
                    mapper[j] = (curr, token_idx)                
                curr = ''
                token_idx += 1
    if token_idx != len(context_tokenized): # skipping the data in case of issue with spanning
        return None
    return mapper

def preprocess_data(dataset, is_training=True, tokenized=True):
    """
    Parse the json_data object into a pandas readable data representation (list of dicts)
    """
    
    def _tokenize(seq):
        """
        Minimizes errors between tokenizers and encodings.
        Recommended in the paper BiDAF (Seo et al., 2016)
        """
        return [t.replace("``", '"').replace("''", '"') for t in seq.split()]
    
    examples = [] # store rows of data here for qa
    
    tokenization_errors = 0
    misaligned_ans_errors = 0
    num_impossibles = 0
    num_questions = 0
    
    for article_id in tqdm(range(len(dataset['data']))): # for each context
        paragraphs = dataset['data'][article_id]['paragraphs']
        for paragraph_id in range(len(paragraphs)):
            questions = dataset['data'][article_id]['paragraphs'][paragraph_id]['qas']
            
            context = paragraphs[paragraph_id]['context']
            context_tokenized = _tokenize(context)
                    
            for qid in range(len(questions)): # loop through questions
                num_questions += 1
                
                question = questions[qid]['question']
                question_tokenized = _tokenize(question)
                qas_id = questions[qid]['id']
                
                is_impossible = questions[qid]['is_impossible']
                
                if is_impossible: # check if question is impossible to answer
                    num_impossibles += 1
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':'', 
                                     'is_impossible': is_impossible,
                                     'start_pos': -1, 
                                     'end_pos':-1,
                                    'santiy_check': context_tokenized[-1:0]})
                    continue
                    
                # question is not impossible, continue parsing
                answers = questions[qid]['answers']
                
                for ans_id in range(len(answers)): # for each answer
                    answer = answers[ans_id]['text']
                    start_pos = answers[ans_id]['answer_start'] # inclusive start index in raw context
                    end_pos = start_pos + len(answer) #exclusive end index in raw context
                          
                    if context[start_pos:end_pos] != answer:
                        misaligned_ans_errors += 1
                        continue
                        
                    if tokenized:
                        mapper = index_converter(context, context_tokenized)
                        if mapper is None:
                            tokenization_errors += 1
                            continue
                        
                        start_pos = mapper[start_pos][1]
                        end_pos = mapper[end_pos-1][1] # inclusive
                    
                    examples.append({'qas_id': qas_id, 
                                     'question':question_tokenized if tokenized else question, 
                                     'context': context_tokenized if tokenized else context, 
                                     'answer':answer, 
                                     'is_impossible': is_impossible,
                                     'start_pos': start_pos, 
                                     'end_pos':end_pos,
                                    'santiy_check': context_tokenized[start_pos:end_pos+1] if tokenized else context[start_pos:end_pos+1]})
            
                    
    print('No. of questions:{}'.format(num_questions))
    return examples

Loading the dev file

In [5]:
# Loading validation data
val_file = open('/kaggle/input/squad-2/dev-v2.0.json')
val_data = json.load(val_file)

In [6]:
val_processed = preprocess_data(val_data)
random.shuffle(val_processed)
val_df = pd.DataFrame(val_processed)

100%|██████████| 35/35 [00:10<00:00,  3.28it/s]

No. of questions:11873





Displaying few extracted samples

In [7]:
val_df.head()

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check
0,5737958b1c456719005744c4,"[What, actually, causes, rigidity, in, matter?]","[It, is, a, common, misconception, to, ascribe...",the Pauli exclusion principle,False,33,36,"[the, Pauli, exclusion, principle.[citation]"
1,572757bef1498d1400e8f694,"[School, desegregation, in, the, United, State...","[In, many, parts, of, the, United, States,, af...",African-American,False,65,65,[African-American]
2,5733fc6ed058e614000b6711,"[How, much, gun, powder, was, destroyed, in, a...","[Governor, Vaudreuil,, who, harboured, ambitio...","45,000 pounds",False,72,73,"[45,000, pounds]"
3,5a67b94bf038b7001ab0c444,"[How, many, Swedish, students, were, enrolled,...","[In, Sweden,, pupils, are, free, to, choose, a...",,True,-1,-1,[]
4,57308ddc396df919000961a4,"[Imperialism, is, most, often, associated, wit...","[The, principles, of, imperialism, are, often,...",the British Empire,False,13,15,"[the, British, Empire]"


In [15]:
# Functions for Calculation of Evaluation Metrics

def normalize_text(s):
    """Removing articles, punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re
    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_f1(prediction, truth):
    ''' Calculates the F1 score'''
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    common_tokens = set(pred_tokens) & set(truth_tokens)

    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0

    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)

def compute_exact_match(prediction, truth):
    ''' Computes the exact match score '''
    return int(normalize_text(prediction) == normalize_text(truth))

In [16]:
def encode_data(processed_data, tokenizer, max_len, max_query_len):
    """
    Converts examples of data into input format tensors based on the tokenizer
    """
    context_length_errors = 0
    encoded_data = []
    for sample in tqdm(processed_data):
        question_raw = ' '.join(sample['question'])
        context_raw = ' '.join(sample['context'])
        if len(question_raw) > max_query_len:
            question_raw = question_raw[:max_query_len]
        
        # encode the data using the tokenizer
        encoded = tokenizer.encode_plus(question_raw, context_raw,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation='only_second',
                                        return_token_type_ids=True)
        if sample['is_impossible']:
            start = -1
            end = -1
        else: # Adjust the start_pos and end_pos 
            input_ids = encoded['input_ids']
            answer_ids = tokenizer.encode(sample['answer']) # get token ids for answer to compare
            start, end = 0, 0 # defaults to this, if encode_plus performed truncation which included answer
            for i in range(len(input_ids)):
                if input_ids[i: i+len(answer_ids[1:-1])] == answer_ids[1:-1]:
                    start = i
                    end = i + len(answer_ids[1:-1]) - 1
                    break
            
        ids = encoded['input_ids']
        token_type_ids = encoded['token_type_ids']
        mask = encoded['attention_mask']
        
        assert len(ids) == max_len
        assert len(token_type_ids) == max_len
        assert len(mask) == max_len
        
        encoded_data.append({'ids': ids,
                      'token_type_ids': token_type_ids,
                      'mask': mask,
                      'start_pos': start,
                      'end_pos': end})        
    return encoded_data

In [17]:
def predict(model, tokenizer, testing_loader):
    ''' Predicts the answers for the model and tokenizer on the loader provided'''
    model.eval()
    preds = []
    targs = []
    pred_answers = []
    target_answers = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            data = tuple(d.to(device) for d in data)
            inputs = {'input_ids': data[0],
                'attention_mask':  data[1], 
                'token_type_ids':  data[2],
                'start_positions': data[3], 
                'end_positions':   data[4]}
            output = model(**inputs)
            starts = output[1]
            ends = output[2]
            start_preds = []
            end_preds = []
            
            target_starts = data[3]
            target_ends = data[4]

            for s,e in zip(starts, ends):
                start_pred = torch.argmax(s)
                start_preds.append(start_pred)
                end_pred = torch.argmax(e)
                end_preds.append(end_pred)

            for i, (s,e) in enumerate(zip(start_preds, target_ends)):
                preds.append((s,e))
                predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                pred_answers.append(predicted_answer)
            
            for i, (s,e) in enumerate(zip(target_starts, target_ends)):
                targs.append((s,e))
                actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                target_answers.append(actual_answer)

        pred_answers = [item if item != '[CLS]' else '' for item in pred_answers]
    return pred_answers, target_answers

In [None]:
def create_dataloader(encodings):
    ''' Creation of dataloader for the encodings'''
    input_ids = torch.tensor([sample['ids'] for sample in encodings], dtype=torch.long)
    input_mask = torch.tensor([sample['mask'] for sample in encodings], dtype=torch.long)
    segment_ids = torch.tensor([sample['token_type_ids'] for sample in encodings], dtype=torch.long)
    start_positions = torch.tensor([sample['start_pos'] for sample in encodings], dtype=torch.long)
    end_positions = torch.tensor([sample['end_pos'] for sample in encodings], dtype=torch.long)
    dataset = TensorDataset(input_ids, input_mask, segment_ids, start_positions, end_positions)
    params = {'batch_size': BATCH_SIZE,
                    'shuffle': False,
                    'num_workers': 0
                    }    
    data_loader = DataLoader(dataset, **params)
    return data_loader

In [19]:
# Model parameters
MAX_SEQ_LEN = 512
MAX_QN_LEN = 128
BATCH_SIZE = 8

In [20]:
device = 'cuda' if cuda.is_available() else 'cpu'

Loading finetuned BERT model and pre-trained tokenizer

In [21]:
bert_model = torch.load('/kaggle/input/squad2-models/fine_tuned_bert2-2.model')
bert_model.to(device)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [22]:
# Creating encodings for BERT
bert_encodings = encode_data(val_processed, bert_tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)
bert_dataloader = create_dataloader(bert_encodings)

100%|██████████| 26232/26232 [02:56<00:00, 148.67it/s]


In [25]:
# Performing predictions for BERT
bert_predictions, actual_answer = predict(bert_model, bert_tokenizer, bert_dataloader)

100%|██████████| 3279/3279 [07:38<00:00,  7.16it/s]


Loading finetuned ALBERT model and pre-trained tokenizer

In [26]:
albert_model = torch.load('/kaggle/input/squad2-models/fine_tuned_albert2.model')
albert_model.to(device)
albert_tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [27]:
# Creating encodings for ALBERT
albert_encodings = encode_data(val_processed, albert_tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)
albert_dataloader = create_dataloader(albert_encodings)

100%|██████████| 26232/26232 [01:17<00:00, 337.35it/s]


In [28]:
# Performing predictions for ALBERT
albert_predictions, actual_answer = predict(albert_model, albert_tokenizer, albert_dataloader)

100%|██████████| 3279/3279 [08:48<00:00,  6.21it/s]


Loading finetuned DEBERTA model and pre-trained tokenizer

In [29]:
deberta_model = torch.load('/kaggle/input/squad2-models/fine_tuned_deberta1.model')
deberta_model.to(device)
deberta_tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [30]:
# Creating encodings for DEBERTA
deberta_encodings = encode_data(val_processed, deberta_tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)
deberta_dataloader = create_dataloader(deberta_encodings)

100%|██████████| 26232/26232 [00:44<00:00, 591.74it/s]


In [31]:
# Performing predictions for DEBERTA
deberta_predictions, actual_answer = predict(deberta_model, deberta_tokenizer, deberta_dataloader)

100%|██████████| 3279/3279 [11:50<00:00,  4.61it/s]


Loading finetuned DISTILBERT model and pre-trained tokenizer

In [32]:
distilbert_model = torch.load('/kaggle/input/squad2-models/fine_tuned_distilbert2.model')
distilbert_model.to(device)
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [33]:
def predict_distilbert(model, tokenizer, testing_loader):
    model.eval()
    preds = []
    targs = []
    pred_answers = []
    target_answers = []
    with torch.no_grad():
        for data in tqdm(testing_loader):
            data = tuple(d.to(device) for d in data)
            inputs = {'input_ids': data[0],
                'attention_mask':  data[1], 
                'start_positions': data[3], 
                'end_positions':   data[4]}
            output = model(**inputs)
            starts = output[1]
            ends = output[2]
            start_preds = []
            end_preds = []
            
            target_starts = data[3]
            target_ends = data[4]

            for s,e in zip(starts, ends):
                start_pred = torch.argmax(s)
                start_preds.append(start_pred)
                end_pred = torch.argmax(e)
                end_preds.append(end_pred)

            for i, (s,e) in enumerate(zip(start_preds, target_ends)):
                preds.append((s,e))
                predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                pred_answers.append(predicted_answer)
            
            for i, (s,e) in enumerate(zip(target_starts, target_ends)):
                targs.append((s,e))
                actual_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][i][s : e+1]))
                target_answers.append(actual_answer)

        pred_answers = [item if item != '[CLS]' else '' for item in pred_answers]
    return pred_answers, target_answers

In [34]:
# Creating encodings for DISTILBERT
distilbert_encodings = encode_data(val_processed, distilbert_tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)
distilbert_dataloader = create_dataloader(distilbert_encodings)

100%|██████████| 26232/26232 [02:58<00:00, 147.14it/s]


In [35]:
# Performing predictions for DISTILBERT
distilbert_predictions, actual_answer = predict_distilbert(distilbert_model,distilbert_tokenizer, distilbert_dataloader)

100%|██████████| 3279/3279 [03:56<00:00, 13.87it/s]


Loading finetuned ELECTRA model and pre-trained tokenizer

In [36]:
electra_model = torch.load('/kaggle/input/squad2-models/fine_tuned_electra2.model')
electra_model.to(device)
electra_tokenizer = ElectraTokenizer.from_pretrained('google/electra-base-discriminator')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

In [37]:
# Creating encodings for ELECTRA
electra_encodings = encode_data(val_processed, electra_tokenizer, MAX_SEQ_LEN, MAX_QN_LEN)
electra_dataloader = create_dataloader(electra_encodings)

100%|██████████| 26232/26232 [03:04<00:00, 142.42it/s]


In [38]:
# Performing predictions for ELECTRA
electra_predictions, actual_answer = predict(electra_model,electra_tokenizer, electra_dataloader)

100%|██████████| 3279/3279 [07:32<00:00,  7.24it/s]


In [39]:
# Adding predictions in the dataframe
val_df['bert_predictions'] = bert_predictions
val_df['albert_predictions'] = albert_predictions
val_df['deberta_predictions'] = deberta_predictions
val_df['distilbert_predictions'] = distilbert_predictions
val_df['electra_predictions'] = electra_predictions

In [40]:
val_has_ans = val_df[~val_df['is_impossible']]

In [41]:
val_no_ans = val_df[val_df['is_impossible']]

In [42]:
def categorize_question_type(question):
    question = list(map(str.lower, question))
    if 'what' in question:
        return 'What'
    elif 'who' in question:
        return 'Who'
    elif 'when' in question:
        return 'When'
    elif 'where' in question:
        return 'Where'
    elif 'why' in question:
        return 'Why'
    elif 'how' in question:
        return 'How'
    else:
        return 'Other'

In [43]:
val_df['question_type'] = val_df['question'].apply(categorize_question_type)

In [44]:
val_df

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check,bert_predictions,albert_predictions,deberta_predictions,distilbert_predictions,electra_predictions,question_type
0,5737958b1c456719005744c4,"[What, actually, causes, rigidity, in, matter?]","[It, is, a, common, misconception, to, ascribe...",the Pauli exclusion principle,False,33,36,"[the, Pauli, exclusion, principle.[citation]",pauli exclusion principle,[CLS] what actually causes rigidity in matter?...,Pauli exclusion principle,pauli exclusion principle,pauli exclusion principle,What
1,572757bef1498d1400e8f694,"[School, desegregation, in, the, United, State...","[In, many, parts, of, the, United, States,, af...",African-American,False,65,65,[African-American],african - american,african-american,African-American,african - american,african - american,What
2,5733fc6ed058e614000b6711,"[How, much, gun, powder, was, destroyed, in, a...","[Governor, Vaudreuil,, who, harboured, ambitio...","45,000 pounds",False,72,73,"[45,000, pounds]","45 , 000 pounds","45,000 pounds","45,000 pounds","45 , 000 pounds","45 , 000 pounds",How
3,5a67b94bf038b7001ab0c444,"[How, many, Swedish, students, were, enrolled,...","[In, Sweden,, pupils, are, free, to, choose, a...",,True,-1,-1,[],,,,,,How
4,57308ddc396df919000961a4,"[Imperialism, is, most, often, associated, wit...","[The, principles, of, imperialism, are, often,...",the British Empire,False,13,15,"[the, British, Empire]",the british empire,british empire,British Empire,british empire,british empire,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26227,5725d662ec44d21400f3d68b,"[Which, park, hosts, the, largest, Civil, War,...","[Fresno, has, three, large, public, parks,, tw...",Kearney Park,False,65,66,"[Kearney, Park]",kearney park,kearney park,Kearney Park,kearney park,kearney park,Other
26228,5733d7cbd058e614000b63ae,"[What, rule, did, some, native, live, under?]","[In, between, the, French, and, the, British,,...",Iroquois,False,90,90,[Iroquois],iroquois,,Iroquois,,,What
26229,56e1c2eee3433e1400423138,"[Decision, problems, capable, of, being, solve...","[But, bounding, the, computation, time, above,...",complexity class P,False,93,95,"[complexity, class, P,]",[CLS] decision problems capable of being solve...,[CLS] decision problems capable of being solve...,polynomial time belon[SEP]But bounding the com...,complexity class p,complexity class p,What
26230,57111b95a58dae1900cd6c51,"[What, German, poet, was, descended, from, Hug...","[Frederick, William,, Elector, of, Brandenburg...",Theodor Fontane,False,38,39,"[Theodor, Fontane,]",,theodor fontane,Theodor Fontane,theodor fontane,theodor fontane,What


In [45]:
models = ['Bert','Albert', 'DeBerta', 'DistilBert', 'Electra']
predictions = ['bert_predictions', 'albert_predictions', 'deberta_predictions', 'distilbert_predictions', 'electra_predictions']

In [46]:
def compare_models(df):
    scores = []
    for model, model_prediction in zip(models, predictions):
        em_score = 0
        f1_score = 0
        for predicted_ans, actual_ans in zip(df[model_prediction], df['answer']):
            em_score += compute_exact_match(predicted_ans, actual_ans)
            f1_score += compute_f1(predicted_ans, actual_ans)
        em_score /= len(df)
        f1_score /= len(df)
        scores.append({'Model': model, 'EM Score': em_score, 'F1 Score':f1_score})
    return scores

In [47]:
# Computing overall score
dev_scores = compare_models(val_df)

In [48]:
dev_score_df = pd.DataFrame(dev_scores)

Evaluation Scores obtained for each models in the Dev Data

In [49]:
dev_score_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.641278,0.731684
1,Albert,0.616956,0.704614
2,DeBerta,0.727089,0.797077
3,DistilBert,0.62195,0.713455
4,Electra,0.703378,0.796313


## Analysis for Has Answer and No Answer Questions

In [79]:
val_has_ans.head()

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check,bert_predictions,albert_predictions,deberta_predictions,distilbert_predictions,electra_predictions
0,5737958b1c456719005744c4,"[What, actually, causes, rigidity, in, matter?]","[It, is, a, common, misconception, to, ascribe...",the Pauli exclusion principle,False,33,36,"[the, Pauli, exclusion, principle.[citation]",pauli exclusion principle,[CLS] what actually causes rigidity in matter?...,Pauli exclusion principle,pauli exclusion principle,pauli exclusion principle
1,572757bef1498d1400e8f694,"[School, desegregation, in, the, United, State...","[In, many, parts, of, the, United, States,, af...",African-American,False,65,65,[African-American],african - american,african-american,African-American,african - american,african - american
2,5733fc6ed058e614000b6711,"[How, much, gun, powder, was, destroyed, in, a...","[Governor, Vaudreuil,, who, harboured, ambitio...","45,000 pounds",False,72,73,"[45,000, pounds]","45 , 000 pounds","45,000 pounds","45,000 pounds","45 , 000 pounds","45 , 000 pounds"
4,57308ddc396df919000961a4,"[Imperialism, is, most, often, associated, wit...","[The, principles, of, imperialism, are, often,...",the British Empire,False,13,15,"[the, British, Empire]",the british empire,british empire,British Empire,british empire,british empire
5,5730088e947a6a140053cfb0,"[What, long, term, agenda, was, the, acts, of,...","[The, views, of, Ali, Shariati,, ideologue, of...",conspiracy,False,94,94,[conspiracy],conspiracy,conspiracy,conspiracy,[CLS] what long term agenda was the acts of pl...,conspiracy


In [80]:
val_no_ans.head()

Unnamed: 0,qas_id,question,context,answer,is_impossible,start_pos,end_pos,santiy_check,bert_predictions,albert_predictions,deberta_predictions,distilbert_predictions,electra_predictions
3,5a67b94bf038b7001ab0c444,"[How, many, Swedish, students, were, enrolled,...","[In, Sweden,, pupils, are, free, to, choose, a...",,True,-1,-1,[],,,,,
14,5ad266f6d7d075001a429200,"[If, an, internal, force, acts, on, the, syste...","[This, means, that, in, a, closed, system, of,...",,True,-1,-1,[],,,,,
16,5a2c41c8bfd06b001a5aeaa8,"[When, did, Microsoft, decide, to, appeal, the...","[In, July, 2013,, the, English, High, Court, o...",,True,-1,-1,[],,,,,
17,5ad3f6f5604f3c001a3ffa09,"[Where, did, the, Normans, invade, in, the, 11...","[The, Normans, had, a, profound, effect, on, I...",,True,-1,-1,[],,,,,
21,5ad40ac5604f3c001a3fffc8,"[What, type, of, medicine, did, otachi, reject?]","[The, physicians, of, the, Yuan, court, came, ...",,True,-1,-1,[],,,,,


In [52]:
has_ans_scores = compare_models(val_has_ans)

In [53]:
has_ans_score_df = pd.DataFrame(has_ans_scores)

Evaluation Scores for Samples that have golden truth answer

In [54]:
has_ans_score_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.536156,0.653055
1,Albert,0.504707,0.618053
2,DeBerta,0.647114,0.737612
3,DistilBert,0.511165,0.629484
4,Electra,0.616454,0.736624


In [55]:
no_ans_scores = compare_models(val_no_ans)

In [56]:
no_ans_score_df = pd.DataFrame(no_ans_scores)

Evaluation Scores for Samples that have no golden truth answer

In [57]:
no_ans_score_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,1.0,1.0
1,Albert,1.0,1.0
2,DeBerta,1.0,1.0
3,DistilBert,1.0,1.0
4,Electra,1.0,1.0


## Question Type Analysis

In [58]:
question_types = ['What', 'Who', 'When', 'Where', 'Why', 'How', 'Other']
qn_type_scores = []
val_what_df = val_df[val_df['question_type']=='What']
val_who_df = val_df[val_df['question_type']=='Who']
val_when_df = val_df[val_df['question_type']=='When']
val_where_df = val_df[val_df['question_type']=='Where']
val_why_df = val_df[val_df['question_type']=='Why']
val_how_df = val_df[val_df['question_type']=='How']
val_other_df = val_df[val_df['question_type']=='Other']

### What Type

In [59]:
what_scores = compare_models(val_what_df)
what_scores_df = pd.DataFrame(what_scores)
what_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.634664,0.722074
1,Albert,0.605488,0.689604
2,DeBerta,0.717242,0.780953
3,DistilBert,0.615449,0.702584
4,Electra,0.704068,0.790446


### Who Type

In [60]:
who_scores = compare_models(val_who_df)
who_scores_df = pd.DataFrame(who_scores)
who_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.739531,0.812106
1,Albert,0.706857,0.781374
2,DeBerta,0.793373,0.845801
3,DistilBert,0.739991,0.813007
4,Electra,0.795214,0.865493


### When Type

In [61]:
when_scores = compare_models(val_when_df)
when_scores_df = pd.DataFrame(when_scores)
when_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.714617,0.806692
1,Albert,0.693735,0.777122
2,DeBerta,0.757541,0.830342
3,DistilBert,0.717517,0.806435
4,Electra,0.756381,0.852898


### Where Type

In [62]:
where_scores = compare_models(val_where_df)
where_scores_df = pd.DataFrame(where_scores)
where_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.617481,0.736341
1,Albert,0.556391,0.687157
2,DeBerta,0.667293,0.778904
3,DistilBert,0.546053,0.671162
4,Electra,0.661654,0.794295


### Why Type

In [63]:
why_scores = compare_models(val_why_df)
why_scores_df = pd.DataFrame(why_scores)
why_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.473214,0.663275
1,Albert,0.415179,0.625578
2,DeBerta,0.495536,0.713588
3,DistilBert,0.404018,0.647365
4,Electra,0.497768,0.725139


### How Type

In [64]:
how_scores = compare_models(val_how_df)
how_scores_df = pd.DataFrame(how_scores)
how_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.587697,0.690993
1,Albert,0.592457,0.695071
2,DeBerta,0.757232,0.846101
3,DistilBert,0.569755,0.67516
4,Electra,0.628707,0.749112


### Other Type

In [65]:
other_scores = compare_models(val_other_df)
other_scores_df = pd.DataFrame(other_scores)
other_scores_df

Unnamed: 0,Model,EM Score,F1 Score
0,Bert,0.6452,0.724686
1,Albert,0.645595,0.71323
2,DeBerta,0.74358,0.80124
3,DistilBert,0.622284,0.702282
4,Electra,0.718688,0.798826


## False No Answer Predictions

In [66]:
false_empty_counts = []
for model, model_prediction in zip(models, predictions):
    count = val_df[((val_df[model_prediction] == '')) & (val_df['answer'] != '')].shape[0]
    ratio = count/len(val_df)
    false_empty_counts.append({'Model': model, 'count': count, 'incorrect_ratio': ratio})

In [67]:
false_empty_preds_df = pd.DataFrame(false_empty_counts)
false_empty_preds_df

Unnamed: 0,Model,count,incorrect_ratio
0,Bert,1127,0.042963
1,Albert,1959,0.07468
2,DeBerta,1592,0.060689
3,DistilBert,1486,0.056648
4,Electra,1578,0.060156


## Qn Length Analysis

In [69]:
val_df["Answer Length"] = val_df["answer"].apply(len)
val_df["Answer Length"].describe()

count    26232.000000
mean        15.461383
std         19.323377
min          0.000000
25%          3.000000
50%         10.000000
75%         20.000000
max        160.000000
Name: Answer Length, dtype: float64

In [70]:
val_df["bert_predictions"].apply(len).describe()

count    26232.000000
mean        86.125877
std        204.713788
min          0.000000
25%          0.000000
50%         12.000000
75%         37.250000
max       2612.000000
Name: bert_predictions, dtype: float64

In [71]:
val_df["albert_predictions"].apply(len).describe()

count    26232.000000
mean        99.688548
std        214.965429
min          0.000000
25%          0.000000
50%         12.000000
75%         51.000000
max       2160.000000
Name: albert_predictions, dtype: float64

In [72]:
val_df["deberta_predictions"].apply(len).describe()

count    26232.000000
mean        67.871150
std        174.099937
min          0.000000
25%          0.000000
50%         11.000000
75%         28.000000
max       1832.000000
Name: deberta_predictions, dtype: float64

In [73]:
val_df["distilbert_predictions"].apply(len).describe()

count    26232.000000
mean        90.005985
std        211.544306
min          0.000000
25%          0.000000
50%         12.000000
75%         39.000000
max       2612.000000
Name: distilbert_predictions, dtype: float64

In [74]:
val_df["electra_predictions"].apply(len).describe()

count    26232.000000
mean        40.486276
std        118.138139
min          0.000000
25%          0.000000
50%         11.000000
75%         24.000000
max       1739.000000
Name: electra_predictions, dtype: float64

In [75]:
answer_length_comp = []
for model, model_prediction in zip(models, predictions):
    val_df["Answer Length Difference"] = val_df[model_prediction].apply(lambda x: len(x.split())) - val_df["answer"].apply(lambda x: len(x.split()))

    # Identify if answers are too short, too long, or within a certain range
    short_threshold = -5  # Define the threshold for too short
    long_threshold = 5    # Define the threshold for too long

    val_df["Answer Length Category"] = val_df["Answer Length Difference"].apply(
        lambda diff: "Too Short" if diff < short_threshold else (
            "Too Long" if diff > long_threshold else "Within Range"
        )
    )

    # Count the occurrences of each category
    category_counts = val_df["Answer Length Category"].value_counts()
    ans_len_result = {'Model': model}
    ans_len_result.update(category_counts.to_dict())
    answer_length_comp.append(ans_len_result)

ans_len_df = pd.DataFrame(answer_length_comp)

In [76]:
ans_len_df

Unnamed: 0,Model,Within Range,Too Long,Too Short
0,Bert,20975,5077,180
1,Albert,20104,5876,252
2,DeBerta,22301,3677,254
3,DistilBert,20731,5288,213
4,Electra,23489,2451,292
