# Define the tool and its model, used to create contradiction cases

In [1]:
import sys

# Task name passed as `task` when the tagging pipeline is built.
TOOLS_NAME = "ner"
# Hugging Face model id used for both the pipeline model and its tokenizer.
MODEL_TOOLS_NAME = "ageng-anugrah/indobert-large-p2-finetuned-ner"
# Upper bound used when slicing each split (`[:SAMPLE]`); sys.maxsize keeps every row.
SAMPLE = sys.maxsize

# Import dependencies

In [2]:
import transformers
import evaluate
import torch
import operator
import re
import sys
import collections
import string
import contextlib
import gc
import random

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from datetime import datetime
from huggingface_hub import notebook_login
from tqdm import tqdm
from huggingface_hub import HfApi

from datasets import (
    load_dataset, 
    Dataset,
    DatasetDict
)
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback, 
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    pipeline
)

# Retrieve QA dataset

In [3]:
# Load the IDK-MRC question-answering dataset through the NusaCrowd helper.
conhelps = NusantaraConfigHelper()
data_qas = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas['train'])
df_validation = pd.DataFrame(data_qas['validation'])
df_test = pd.DataFrame(data_qas['test'])

cols = ['context', 'question', 'answer']


def _flatten_qas(df_split):
    """Flatten one split into one row per (context, question) pair.

    Args:
        df_split: DataFrame with a ``context`` column and a ``qas`` column
            whose cells are iterables of dicts carrying ``'question'`` and
            ``'answers'`` (a possibly empty sequence of dicts with
            ``'text'`` and ``'answer_start'``).

    Returns:
        DataFrame with the columns listed in ``cols``. ``answer`` is a dict
        with ``text``, ``answer_start`` and ``answer_end``; a question
        without answers gets an empty text and a 0/0 span.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for context, qas in tqdm(zip(df_split['context'], df_split['qas']),
                             total=len(df_split)):
        for qa in qas:
            if len(qa['answers']) != 0:
                # Only the first listed answer is used.
                first_answer = qa['answers'][0]
                answer = {"text": first_answer['text'],
                          "answer_start": first_answer['answer_start'],
                          "answer_end": first_answer['answer_start'] + len(first_answer['text'])}
            else:
                answer = {"text": str(),
                          "answer_start": 0,
                          "answer_end": 0}
            rows.append({'context': context,
                         'question': qa['question'],
                         'answer': answer})
    return pd.DataFrame(rows, columns=cols)


new_df_train = _flatten_qas(df_train)
new_df_val = _flatten_qas(df_validation)
new_df_test = _flatten_qas(df_test)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})
data_qas



  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 3659/3659 [00:17<00:00, 215.09it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 358/358 [00:01<00:00, 286.10it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 378/378 [00:01<00:00, 260.24it/s]


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9332
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 764
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 844
    })
})

# Convert to NLI, with the hypothesis being a simple concatenation of question and answer

## Convert Dataset to DataFrame format

In [4]:
# Materialise up to SAMPLE rows of every split as a pandas DataFrame.
data_qas_train_df, data_qas_val_df, data_qas_test_df = (
    pd.DataFrame(data_qas[split_name][:SAMPLE])
    for split_name in ("train", "validation", "test")
)

## Retrieve answer text only

In [5]:
def retrieve_answer_text(data):
    """Replace every answer record with its plain answer text.

    Args:
        data: DataFrame whose ``answer`` column holds mappings with a
            ``'text'`` key.

    Returns:
        The same DataFrame object, with ``answer`` now holding the
        ``'text'`` values.
    """
    # One whole-column assignment instead of per-cell chained indexing
    # (data['answer'][i] = ...): pandas does not guarantee that a chained
    # write reaches the frame, and the label lookup by i only works for a
    # 0..n-1 index.
    data['answer'] = [answer['text'] for answer in data['answer']]
    return data

In [6]:
# Reduce the answer column of every split to its plain text.
data_qas_train_df, data_qas_val_df, data_qas_test_df = map(
    retrieve_answer_text,
    (data_qas_train_df, data_qas_val_df, data_qas_test_df),
)

## Create NLI dataset from copy of QA dataset above

In [7]:
# Start the NLI frames from independent copies of the QA frames.
data_nli_right_train_df, data_nli_right_val_df, data_nli_right_test_df = (
    qa_frame.copy(deep=True)
    for qa_frame in (data_qas_train_df, data_qas_val_df, data_qas_test_df)
)

## Convert context pair to premise (only renaming column)

In [8]:
# The QA context becomes the NLI premise; only the column name changes.
data_nli_right_train_df, data_nli_right_val_df, data_nli_right_test_df = (
    right_frame.rename({"context": "premise"}, axis="columns")
    for right_frame in (
        data_nli_right_train_df,
        data_nli_right_val_df,
        data_nli_right_test_df,
    )
)

## Add entailment label for all rows

In [9]:
# Every row built from a correct answer is labelled as entailment.
for right_frame in (
    data_nli_right_train_df,
    data_nli_right_val_df,
    data_nli_right_test_df,
):
    right_frame['label'] = 'entailment'

# Add contradiction label cases

## Import pipeline to create contradiction cases

In [10]:
# Keyword arguments for the tokenizer; not referenced by the cells shown here.
tokenizer_kwargs = dict(max_length=512)

# Seed Python's `random` module so the random choices made below repeat.
seed_value = 42
random.seed(seed_value)

In [11]:
# Tokenizer for the tagging model, capped at 512 tokens.
# NOTE(review): `truncation=True` is given to from_pretrained rather than to a
# tokenization call — confirm it has the intended effect.
ner_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_TOOLS_NAME,
    model_max_length=512,
    truncation=True,
)

# Pipeline used to tag answers and premises when building contradiction cases.
nlp_tools = pipeline(
    task=TOOLS_NAME,
    model=MODEL_TOOLS_NAME,
    tokenizer=ner_tokenizer,
)

## Retrieve entity labels from the answer column, as the first step of creating the wrong-answer NLI dataset

In [12]:
def retrieve_only_entity(data, nlp_tools=nlp_tools):
    """Tag every answer and record the set of entity types found in it.

    Args:
        data: DataFrame with an ``answer`` column of strings.
        nlp_tools: Callable applied to one answer string. It must return an
            iterable of dicts whose ``'entity'`` value starts with a
            two-character prefix before the type name (presumably BIO tags
            such as ``B-PER`` — confirm against the model's label set).

    Returns:
        The same DataFrame object, with a new ``ner_label`` column holding
        one ``set`` of entity-type strings per row (empty when nothing was
        tagged).
    """
    ner_labels = []
    for answer in tqdm(data['answer'], total=len(data)):
        # Drop the two-character prefix so only the entity type remains.
        ner_labels.append({token['entity'][2:] for token in nlp_tools(answer)})

    # Assign the whole column at once; writing cell by cell through
    # data['ner_label'][i] is chained indexing, which pandas does not
    # guarantee to write through.
    data['ner_label'] = ner_labels
    return data

In [None]:
# Work on copies: retrieve_only_entity adds its column to the frame it is
# given and returns that same object, so without the copy the "wrong" and
# "right" names would refer to one shared DataFrame.
data_nli_wrong_train_df = retrieve_only_entity(data_nli_right_train_df.copy())
data_nli_wrong_val_df = retrieve_only_entity(data_nli_right_val_df.copy())
data_nli_wrong_test_df = retrieve_only_entity(data_nli_right_test_df.copy())

 10%|███████▌                                                                      | 901/9332 [22:51<5:22:07,  2.29s/it]

## Find the same entity in the premise

In [None]:
def _random_premise_word(premise):
    """Return one whitespace-separated word of *premise*, chosen at random."""
    return random.choice(premise.split())


def _same_type_candidates(answer, answer_labels, premise_entities):
    """Collect premise tokens that share an entity type with the answer.

    Tokens containing '#' (word-piece continuations) and tokens that occur
    inside the answer text are left out. Candidates keep premise order, so
    the result does not depend on set iteration order.
    """
    answer_lower = answer.lower()
    return [
        entity['word']
        for entity in premise_entities
        if entity['entity'][2:] in answer_labels
        and '#' not in entity['word']
        and entity['word'].lower() not in answer_lower
    ]


def find_wrong_answer_in_premise(data, nlp_tools=nlp_tools):
    """Replace each answer with a wrong answer taken from the row's premise.

    For a row whose ``ner_label`` set is non-empty, the premise is tagged and
    a random token with one of those entity types (and not part of the
    correct answer) is chosen. When there is no label or no such token, a
    random word of that row's own premise is used instead.

    Args:
        data: DataFrame with ``premise`` and ``answer`` string columns and a
            ``ner_label`` column of sets of entity-type strings.
        nlp_tools: Callable applied to one premise string. It must return an
            iterable of dicts with ``'entity'`` (type name after a
            two-character prefix) and ``'word'``.

    Returns:
        A new DataFrame in which the original ``answer`` column is replaced
        by the chosen wrong answers (placed as the last column). The input
        frame is not modified.

    Raises:
        IndexError: If a fallback is needed for a premise without any words.
    """
    wrong_answers = []
    rows = zip(data['premise'], data['answer'], data['ner_label'])

    for premise, answer, answer_labels in tqdm(rows, total=len(data)):
        candidates = []
        if answer_labels:
            # Tag the premise only when there is an entity type to match;
            # the tagging call is the expensive step.
            candidates = _same_type_candidates(answer, answer_labels, nlp_tools(premise))

        if candidates:
            wrong_answers.append(random.choice(candidates))
        else:
            # Choose random answer, like: random word in this row's premise
            wrong_answers.append(_random_premise_word(premise))

    return data.drop('answer', axis=1).assign(answer=wrong_answers)

In [None]:
# Swap the correct answer of every split for a wrong one found in the premise.
data_nli_wrong_train_df, data_nli_wrong_val_df, data_nli_wrong_test_df = map(
    find_wrong_answer_in_premise,
    (data_nli_wrong_train_df, data_nli_wrong_val_df, data_nli_wrong_test_df),
)

## For that wrong answer, assign contradiction label

In [None]:
# Every row carrying a wrong answer is labelled as contradiction.
for wrong_frame in (
    data_nli_wrong_train_df,
    data_nli_wrong_val_df,
    data_nli_wrong_test_df,
):
    wrong_frame['label'] = 'contradiction'

# Concat the right and wrong NLI to one NLI dataset

In [None]:
# Stack the entailment rows on top of the contradiction rows, split by split,
# renumbering the index from zero.
data_nli_train_df, data_nli_val_df, data_nli_test_df = (
    pd.concat([right_frame, wrong_frame], axis="index", ignore_index=True)
    for right_frame, wrong_frame in (
        (data_nli_right_train_df, data_nli_wrong_train_df),
        (data_nli_right_val_df, data_nli_wrong_val_df),
        (data_nli_right_test_df, data_nli_wrong_test_df),
    )
)

# Convert question-answer pair to hypothesis

In [None]:
def convert_question_and_answer_to_hypothesis(data):
    """Add a ``hypothesis`` column joining question and answer with a space.

    Args:
        data: DataFrame with string columns ``question`` and ``answer``.

    Returns:
        The same DataFrame object, with ``hypothesis`` set to
        ``question + ' ' + answer`` for every row.
    """
    # The concatenation already covers the whole column, so it runs once
    # rather than once per row; the column is also created for an empty frame.
    data['hypothesis'] = data['question'] + ' ' + data['answer']
    return data

In [None]:
# Build the hypothesis column for every split.
data_nli_train_df, data_nli_val_df, data_nli_test_df = map(
    convert_question_and_answer_to_hypothesis,
    (data_nli_train_df, data_nli_val_df, data_nli_test_df),
)

# Drop every column other than premise, hypothesis, and label

In [None]:
# Columns the final NLI files keep; everything else in the train frame is
# listed for removal.
columns_to_keep = ['premise', 'hypothesis', 'label']
columns_to_drop = list(
    filter(
        lambda column_name: column_name not in columns_to_keep,
        data_nli_train_df.columns,
    )
)

In [None]:
# Remove the listed helper columns from every split.
data_nli_train_df, data_nli_val_df, data_nli_test_df = (
    nli_frame.drop(columns_to_drop, axis=1)
    for nli_frame in (data_nli_train_df, data_nli_val_df, data_nli_test_df)
)

# Export the DataFrames to CSV

In [None]:
# Write each split to its own CSV file, without the index column.
for csv_name, nli_frame in (
    ("data_nli_train_ner_df.csv", data_nli_train_df),
    ("data_nli_val_ner_df.csv", data_nli_val_df),
    ("data_nli_test_ner_df.csv", data_nli_test_df),
):
    nli_frame.to_csv(csv_name, index=False)