# Define tool and model of the tool

In [1]:
import sys

TOOLS_NAME_NER = "ner"
MODEL_TOOLS_NAME_NER = "ageng-anugrah/indobert-large-p2-finetuned-ner"

TOOLS_NAME_POS = "token-classification"
MODEL_TOOLS_NAME_POS = "ageng-anugrah/indobert-large-p2-finetuned-chunking"

MODEL_SIMILARITY_NAME = "paraphrase-multilingual-mpnet-base-v2"

SAMPLE = sys.maxsize
#SAMPLE = 50

# Import anything

In [2]:
import transformers
import evaluate
import torch
import operator
import re
import sys
import collections
import string
import contextlib
import gc
import random

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from datetime import datetime
from huggingface_hub import notebook_login
from tqdm import tqdm
from huggingface_hub import HfApi
from sentence_transformers import SentenceTransformer, util

from datasets import (
    load_dataset, 
    Dataset,
    DatasetDict
)
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback, 
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    pipeline
)

# Retrieve QA dataset

In [3]:
conhelps = NusantaraConfigHelper()
data_qas = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas['train'])
df_validation = pd.DataFrame(data_qas['validation'])
df_test = pd.DataFrame(data_qas['test'])

cols = ['context', 'question', 'answer']
new_df_train = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_train['context']))):
    for j in df_train["qas"][i]:
        if len(j['answers']) != 0:
            new_df_train = new_df_train.append({'context': df_train["context"][i], 
                                                'question': j['question'], 
                                                'answer': {"text": j['answers'][0]['text'], 
                                                           "answer_start": j['answers'][0]['answer_start'], 
                                                           "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                           ignore_index=True)
        else:
            new_df_train = new_df_train.append({'context': df_train["context"][i], 
                                                'question': j['question'], 
                                                'answer': {"text": str(), 
                                                           "answer_start": 0, 
                                                           "answer_end": 0}}, 
                                                           ignore_index=True)

cols = ['context', 'question', 'answer']
new_df_val = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_validation['context']))):
    for j in df_validation["qas"][i]:
        if len(j['answers']) != 0:
            new_df_val = new_df_val.append({'context': df_validation["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": j['answers'][0]['text'], 
                                                       "answer_start": j['answers'][0]['answer_start'], 
                                                       "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                       ignore_index=True)
        else:
            new_df_val = new_df_val.append({'context': df_validation["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": str(), 
                                                       "answer_start": 0, 
                                                       "answer_end": 0}}, 
                                                       ignore_index=True)        

cols = ['context', 'question', 'answer']
new_df_test = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_test['context']))):
    for j in df_test["qas"][i]:
        if len(j['answers']) != 0:
            new_df_test = new_df_test.append({'context': df_test["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": j['answers'][0]['text'], 
                                                       "answer_start": j['answers'][0]['answer_start'], 
                                                       "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                       ignore_index=True)
        else:
            new_df_test = new_df_test.append({'context': df_test["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": str(), 
                                                       "answer_start": 0, 
                                                       "answer_end": 0}}, 
                                                       ignore_index=True)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})
data_qas



  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 3659/3659 [00:16<00:00, 217.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 358/358 [00:01<00:00, 263.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 378/378 [00:01<00:00, 261.21it/s]


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9332
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 764
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 844
    })
})

# Convert to NLI, with hypothesis being just do concat question & answer

## Convert Dataset to DataFrame format

In [4]:
data_qas_train_df = pd.DataFrame(data_qas["train"][:SAMPLE])
data_qas_val_df = pd.DataFrame(data_qas["validation"][:SAMPLE])
data_qas_test_df = pd.DataFrame(data_qas["test"][:SAMPLE])

## Retrieve answer text only

In [5]:
def retrieve_answer_text(data):
    for i in range(len(data)):
        data['answer'][i] = data['answer'][i]['text']
    return data

In [6]:
data_qas_train_df = retrieve_answer_text(data_qas_train_df)
data_qas_val_df = retrieve_answer_text(data_qas_val_df)
data_qas_test_df = retrieve_answer_text(data_qas_test_df)

## Delete all unanswerable row

In [7]:
data_qas_train_df = data_qas_train_df[data_qas_train_df['answer'] != '']
data_qas_val_df = data_qas_val_df[data_qas_val_df['answer'] != '']
data_qas_test_df = data_qas_test_df[data_qas_test_df['answer'] != '']

### Reset index number

In [8]:
data_qas_train_df = data_qas_train_df.reset_index(drop=True)
data_qas_val_df = data_qas_val_df.reset_index(drop=True)
data_qas_test_df = data_qas_test_df.reset_index(drop=True)

## Create NLI dataset from copy of QA dataset above

In [9]:
data_nli_train_df = data_qas_train_df.copy()
data_nli_val_df = data_qas_val_df.copy()
data_nli_test_df = data_qas_test_df.copy()

In [10]:
#data_nli_wrong_train_df = data_qas_train_df.copy()
#data_nli_wrong_val_df = data_qas_val_df.copy()
#data_nli_wrong_test_df = data_qas_test_df.copy()

## Convert context pair to premise (only renaming column)

In [11]:
data_nli_train_df = data_nli_train_df.rename(columns={"context": "premise"})
data_nli_val_df = data_nli_val_df.rename(columns={"context": "premise"})
data_nli_test_df = data_nli_test_df.rename(columns={"context": "premise"})

# Add contradiction label cases

## Import pipeline to create contradiction cases

In [12]:
seed_value = 42
random.seed(seed_value)

In [13]:
nlp_tools_ner = pipeline(task = TOOLS_NAME_NER, 
                     model = MODEL_TOOLS_NAME_NER, 
                     tokenizer = AutoTokenizer.from_pretrained(MODEL_TOOLS_NAME_NER, 
                                                               model_max_length=512, 
                                                               truncation=True),
                     aggregation_strategy = 'simple')

In [14]:
nlp_tools_pos = pipeline(task = TOOLS_NAME_POS, 
                     model = MODEL_TOOLS_NAME_POS, 
                     tokenizer = AutoTokenizer.from_pretrained(MODEL_TOOLS_NAME_POS, 
                                                               model_max_length=512, 
                                                               truncation=True),
                     aggregation_strategy = 'simple')

## Add NER and chunking tag column in DataFrame

In [15]:
def add_ner_and_chunking_tag(data, ner=nlp_tools_ner, pos=nlp_tools_pos):
    
    data['ner_tag_answer'] = ""
    data['chunking_tag_answer'] = ""
    
    data['ner_tag_premise'] = ""
    data['chunking_tag_premise'] = ""
    
    for i in tqdm(range(len(data))):
        
        answer = data['answer'][i]
        premise = data['premise'][i]
        
        ner_premise_array = []
        chunking_premise_array = []
        
        try:
            ner_tag_answer = (ner(answer)[0]['entity_group'], answer)
        except:
            ner_tag_answer = ("NULL", answer)
        
        try:
            chunking_tag_answer = (pos(answer)[0]['entity_group'], answer)
        except:
            chunking_tag_answer = ("NULL", answer)
            
        data['ner_tag_answer'][i] = ner_tag_answer
        data['chunking_tag_answer'][i] = chunking_tag_answer
        
        if len(ner(data['premise'][i])) == 0:
            ner_tag_premise = "NO TOKEN DETECTED"
            ner_premise_array.append(ner_tag_premise)
        
        else:
            for j in tqdm(ner(data['premise'][i])):
                ner_tag_premise = (j['entity_group'], j['word'])
                ner_premise_array.append(ner_tag_premise)
        
        if len(pos(data['premise'][i])) == 0:
            chunking_tag_premise = "NO TOKEN DETECTED"
            chunking_premise_array.append(chunking_tag_premise)
        
        else:
            for j in tqdm(pos(data['premise'][i])):
                chunking_tag_premise = (j['entity_group'], j['word'])
                chunking_premise_array.append(chunking_tag_premise)
                                                
        data['ner_tag_premise'][i] = ner_premise_array
        data['chunking_tag_premise'][i] = chunking_premise_array  
    
    return data

In [16]:
data_nli_train_df = add_ner_and_chunking_tag(data_nli_train_df)
data_nli_val_df = add_ner_and_chunking_tag(data_nli_val_df)
data_nli_test_df = add_ner_and_chunking_tag(data_nli_test_df)

  0%|                                                                                          | 0/5042 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 28/28 [00:00<00:00, 176337.11it/s][A

100%|███████████████████████████████████████████████████████████████████████████████| 81/81 [00:00<00:00, 592912.08it/s][A
  0%|                                                                                | 1/5042 [00:05<7:24:35,  5.29s/it]
100%|███████████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 170879.05it/s][A

100%|███████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 311806.77it/s][A
  0%|                                                                                | 2/5042 [00:11<7:55:29,  5.66s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 80018.52it/s][A

100%|█████████

# Create wrong answer

This is the flow to create wrong answer:

1. Check the NER and POS/Chunking labels of the right_answer and context/premise.

2. Search and group NER and POS/Chunking labels that match the right_answer throughout the context/premise.

3. Perform NER classification. There will be two branches here, namely:

   3a. If the NER of the right_answer can be detected, then calculate the distance using semantic similarity or word vectors between the right_answer and various possible wrong_answers with the same NER as the right_answer. Once done, proceed to the final wrong_answer.
   
   3b. If the NER of the right_answer cannot be detected (NULL) or context/premise does not contain any of NER of right_answer, then the POS/Chunking of the right_answer will be identified.
   
4. Perform POS/Chunking classification. Continuation from point 3b. There will be two more branches:

   4a. If the POS/Chunking of the right_answer can be detected, then calculate the distance using semantic similarity or word vectors between the right_answer and various possible wrong_answers with the same POS/Chunking as the right_answer. Once done, proceed to the final wrong_answer.
   
   4b. If the POS/Chunking of the right_answer cannot be detected (NULL) or context/premise does not contain any of NER of right_answer, then the final wrong_answer will be chosen based on a random word (random_word) from the context/premise.

In [17]:
model_similarity = SentenceTransformer(MODEL_SIMILARITY_NAME)

def return_similarity_sorted_array(right_answer, sentence_array, rank=0, model=model_similarity):
    
    embedding_right_answer = model.encode([right_answer], convert_to_tensor=True)
    embedding_sentence_array = model.encode(sentence_array, convert_to_tensor=True)
    
    cosine_scores = util.pytorch_cos_sim(embedding_right_answer, embedding_sentence_array)
    
    sorted_indices = cosine_scores.argsort(descending=True)[0]
    sorted_array = [sentence_array[i] for i in sorted_indices]
    
    return sorted_array

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: paraphrase-multilingual-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


In [18]:
def remove_values_with_hash(arr):
    return [item for item in arr if "#" not in item]

In [19]:
def select_random_word(text):
    words = re.findall(r'\w+', text)
    random_word = random.choice(words)
    return random_word

In [20]:
def create_wrong_answer(data):
    
    data['same_ner_tag_answer'] = ""
    data['same_chunking_tag_answer'] = ""
    data['wrong_answer'] = ""
    data['plausible_answer_based_on_method'] = ""
    
    for i in tqdm(range(len(data))):
        
        right_answer = data['answer'][i]
        premise = data['premise'][i]
        
        # Grouped with the same NER group, between answer and word of premise
        same_ner_tag_answer_array = []
        ner_tag_answer = data['ner_tag_answer'][i][0]
        ner_tag_premise = data['ner_tag_premise'][i]
        
        for ner_tag in ner_tag_premise:
            
            if isinstance(ner_tag, tuple):
                ner_tag_word = ner_tag[0]
            else:
                ner_tag_word = None
            
            if ner_tag_answer == ner_tag_word:
                same_ner_tag_answer_array.append(ner_tag[1])
        
        same_ner_tag_answer_array = remove_values_with_hash(same_ner_tag_answer_array)
        data['same_ner_tag_answer'][i] = same_ner_tag_answer_array
        
        # Grouped with the same Chunking group, between answer and word of premise
        same_chunking_tag_answer_array = []
        chunking_tag_answer = data['chunking_tag_answer'][i][0]
        chunking_tag_premise = data['chunking_tag_premise'][i]
        
        for chunking_tag in chunking_tag_premise:
            
            if isinstance(chunking_tag, tuple):
                chunking_tag_word = chunking_tag[0]
            else:
                chunking_tag_word = None
            
            if chunking_tag_answer == chunking_tag_word:
                same_chunking_tag_answer_array.append(chunking_tag[1])
        
        same_chunking_tag_answer_array = remove_values_with_hash(same_chunking_tag_answer_array)
        data['same_chunking_tag_answer'][i] = same_chunking_tag_answer_array
        
        # Start to create wrong answer
        plausible_answer_array = ""
        
        # Perform NER classification
        
        # If the NER of the right_answer can be detected, then calculate the distance using semantic 
        # similarity or word vectors between the right_answer and various possible wrong_answers with 
        # the same NER as the right_answer. Once done, proceed to the final wrong_answer.
        if same_ner_tag_answer_array != []:
            wrong_answer_array = return_similarity_sorted_array(right_answer, same_ner_tag_answer_array)
            plausible_answer_array = wrong_answer_array
            wrong_answer = wrong_answer_array[0]
            
        # If the NER of the right_answer cannot be detected (NULL) or context/premise does not contain 
        # any of NER of right_answer, then the POS/Chunking of the right_answer will be identified.
        
        # Perform POS/Chunking classification
        else:
            
            # If the POS/Chunking of the right_answer can be detected, then calculate the distance 
            # using semantic similarity or word vectors between the right_answer and various possible 
            # wrong_answers with the same POS/Chunking as the right_answer. Once done, proceed to the 
            # final wrong_answer.
            
            if same_chunking_tag_answer_array != []:
                wrong_answer_array = return_similarity_sorted_array(right_answer, same_chunking_tag_answer_array)
                plausible_answer_array = wrong_answer_array
                wrong_answer = wrong_answer_array[0]
            
            # If the POS/Chunking of the right_answer cannot be detected (NULL) or context/premise 
            # does not contain any of NER of right_answer, then the final wrong_answer will be chosen 
            # based on a random word (random_word) from the context/premise.
            
            else:
                plausible_answer_array = premise.split()
                wrong_answer = select_random_word(premise)
        
        data['wrong_answer'][i] = wrong_answer
        data['plausible_answer_based_on_method'][i] = plausible_answer_array
    
    return data       

In [21]:
data_nli_train_df = create_wrong_answer(data_nli_train_df)
data_nli_val_df = create_wrong_answer(data_nli_val_df)
data_nli_test_df = create_wrong_answer(data_nli_test_df)

  0%|                                                                                          | 0/5042 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|                                                                                | 1/5042 [00:01<2:15:23,  1.61s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|                                                                                  | 3/5042 [00:01<38:56,  2.16it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|                                                                                  | 5/5042 [00:01<22:11,  3.78it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|                                                                                  | 7/5042 [00:02<15:14,  5.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▏                                                                                | 10/5042 [00:02<09:56,  8.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▏                                                                                | 13/5042 [00:02<07:34, 11.06it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▏                                                                                | 15/5042 [00:02<06:46, 12.36it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▎                                                                                | 18/5042 [00:02<05:46, 14.49it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▎                                                                                | 21/5042 [00:02<05:09, 16.23it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|▍                                                                                | 24/5042 [00:02<04:45, 17.56it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▍                                                                                | 27/5042 [00:03<04:33, 18.30it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▍                                                                                | 29/5042 [00:03<04:30, 18.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▌                                                                                | 32/5042 [00:03<04:21, 19.18it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▌                                                                                | 35/5042 [00:03<04:14, 19.69it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▌                                                                                | 38/5042 [00:03<04:10, 19.95it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▋                                                                                | 41/5042 [00:03<04:04, 20.45it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▋                                                                                | 44/5042 [00:03<04:03, 20.53it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▊                                                                                | 47/5042 [00:03<04:02, 20.60it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▊                                                                                | 50/5042 [00:04<04:07, 20.17it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  1%|▊                                                                                | 53/5042 [00:04<04:06, 20.26it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# Prevent the wrong answer from being the same as the right answer, via overlapping span

In [22]:
def find_substring_span(long_string, substring):
    long_string = long_string.lower()
    substring = substring.lower()
    
    start_index = long_string.find(substring)
    
    if start_index != -1:
        end_index = start_index + len(substring) - 1
        return start_index, end_index
    else:
        return None

In [23]:
def check_overlap(span1, span2):
    return span1[0] <= span2[1] and span2[0] <= span1[1]

In [24]:
def prevent_same_answer(data):
    
    data['properties'] = ""
    
    for i in tqdm(range(len(data))):
        
        premise = data['premise'][i]
        right_answer = data['answer'][i]
        wrong_answer = data['wrong_answer'][i]
        plausible_answer_array = data['plausible_answer_based_on_method'][i]
        
        right_answer_span = find_substring_span(premise, right_answer)
        wrong_answer_span = find_substring_span(premise, wrong_answer)
        
        try:
            is_span = check_overlap(right_answer_span, wrong_answer_span)
        
        except:
            is_span = True # Exit plan, if right/wrong answer cannot be detected in the premise. Edge cases.

        if is_span:
            plausible_answer_array = [item for item in plausible_answer_array \
                                      if item not in [right_answer, wrong_answer]]

            if len(plausible_answer_array) <= 1:
                wrong_answer = select_random_word(premise)

            else:
                wrong_answer = plausible_answer_array[0] # Take the highest value in the sorted array
            
            data['properties'] = "Detected span that is the SAME as the right answer, search randomly from plausible answers"
        
        else:
            data['properties'] = "Detected span that is DIFFERENT from the right answer, the wrong answer is not changed"
        
        data['wrong_answer'][i] = wrong_answer
        
    return data

In [25]:
data_nli_train_df = prevent_same_answer(data_nli_train_df)
data_nli_val_df = prevent_same_answer(data_nli_val_df)
data_nli_test_df = prevent_same_answer(data_nli_test_df)

# Split to two dataset: right dataset & wrong dataset

In [26]:
def move_to_column_number(data, column_name="hypothesis", column_num=3):

    cols = list(data.columns)
    cols.remove(column_name)
    cols.insert(column_num, column_name)

    data = data[cols]
    
    return data

In [27]:
columns_to_exclude = ['wrong_answer']

data_nli_right_train_df = data_nli_train_df.drop(columns=columns_to_exclude).copy()
data_nli_right_val_df = data_nli_val_df.drop(columns=columns_to_exclude).copy()
data_nli_right_test_df = data_nli_test_df.drop(columns=columns_to_exclude).copy()

In [28]:
columns_to_exclude = ['answer']

data_nli_wrong_train_df = data_nli_train_df.drop(columns=columns_to_exclude).copy()
data_nli_wrong_val_df = data_nli_val_df.drop(columns=columns_to_exclude).copy()
data_nli_wrong_test_df = data_nli_test_df.drop(columns=columns_to_exclude).copy()

data_nli_wrong_train_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)
data_nli_wrong_val_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)
data_nli_wrong_test_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "answer", 2)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "answer", 2)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "answer", 2)

# Convert question-answer pair to hypothesis

In [29]:
def convert_question_and_answer_to_hypothesis(data):
    for i in range(len(data)):
        data['hypothesis'] = data['question'] + ' ' + data['answer']
    return data

In [30]:
data_nli_right_train_df = convert_question_and_answer_to_hypothesis(data_nli_right_train_df)
data_nli_right_val_df = convert_question_and_answer_to_hypothesis(data_nli_right_val_df)
data_nli_right_test_df = convert_question_and_answer_to_hypothesis(data_nli_right_test_df)

data_nli_right_train_df = move_to_column_number(data_nli_right_train_df, "hypothesis", 3)
data_nli_right_val_df = move_to_column_number(data_nli_right_val_df, "hypothesis", 3)
data_nli_right_test_df = move_to_column_number(data_nli_right_test_df, "hypothesis", 3)

In [31]:
data_nli_wrong_train_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_train_df)
data_nli_wrong_val_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_val_df)
data_nli_wrong_test_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_test_df)

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "hypothesis", 3)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "hypothesis", 3)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "hypothesis", 3)

# Add label: entailment & contradiction

In [32]:
data_nli_right_train_df['label'] = 'entailment'
data_nli_right_val_df['label'] = 'entailment'
data_nli_right_test_df['label'] = 'entailment'

data_nli_right_train_df = move_to_column_number(data_nli_right_train_df, "label", 4)
data_nli_right_train_df = move_to_column_number(data_nli_right_val_df, "label", 4)
data_nli_right_train_df = move_to_column_number(data_nli_right_test_df, "label", 4)

In [33]:
data_nli_wrong_train_df['label'] = 'contradiction'
data_nli_wrong_val_df['label'] = 'contradiction'
data_nli_wrong_test_df['label'] = 'contradiction'

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "label", 4)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "label", 4)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "label", 4)

# Concat the right and wrong NLI to one NLI dataset

In [34]:
data_nli_train_df_final = pd.concat([data_nli_right_train_df, data_nli_wrong_train_df], axis=0, ignore_index=True)
data_nli_val_df_final = pd.concat([data_nli_right_val_df, data_nli_wrong_val_df], axis=0, ignore_index=True)
data_nli_test_df_final = pd.concat([data_nli_right_test_df, data_nli_wrong_test_df], axis=0, ignore_index=True)

# Convert to DataFrame format to CSV

In [35]:
data_nli_train_df_final.to_csv("data_nli_train_df.csv", index=False)
data_nli_val_df_final.to_csv("data_nli_val_df.csv", index=False)
data_nli_test_df_final.to_csv("data_nli_test_df.csv", index=False)