# Define tool and model of the tool

In [1]:
!nvidia-smi

Wed Sep 13 09:42:52 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.161.03   Driver Version: 470.161.03   CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  On   | 00000000:06:00.0 Off |                    0 |
| N/A   32C    P0    43W / 300W |      3MiB / 32510MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-SXM2...  On   | 00000000:07:00.0 Off |                    0 |
| N/A   34C    P0    42W / 300W |      3MiB / 32510MiB |      0%      Default |
|       

In [2]:
import os, torch
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
os.environ['CUDA_VISIBLE_DEVICES'] = '6'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import sys

TASK_NER_NAME = "ner"
MODEL_NER_NAME = "ageng-anugrah/indobert-large-p2-finetuned-ner"

TASK_CHUNKING_NAME = "token-classification"
MODEL_CHUNKING_NAME = "ageng-anugrah/indobert-large-p2-finetuned-chunking"

MODEL_SIMILARITY_NAME = "paraphrase-multilingual-mpnet-base-v2"
URL_STOPWORD = "https://raw.githubusercontent.com/6/stopwords-json/master/stopwords-all.json"

TASK_PARAPHRASER_NAME = "text2text-generation"
MODEL_PARAPHRASER_NAME = ""

# SAMPLE = sys.maxsize
SAMPLE = 100

# Import anything

In [4]:
import transformers
import evaluate
import torch
import operator
import re
import sys
import collections
import string
import contextlib
import gc
import random
import string
import requests

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from evaluate import load
from nusacrowd import NusantaraConfigHelper
from datetime import datetime
from huggingface_hub import notebook_login
from tqdm import tqdm
from huggingface_hub import HfApi
from sentence_transformers import SentenceTransformer, util

from datasets import (
    load_dataset, 
    Dataset,
    DatasetDict
)
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback, 
    AutoModelForQuestionAnswering,
    AutoModelForTokenClassification,
    pipeline
)

# Retrieve QA dataset

In [5]:
print("PROGRAM STARTED")

PROGRAM STARTED


In [6]:
conhelps = NusantaraConfigHelper()
data_qas = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas['train'])
df_validation = pd.DataFrame(data_qas['validation'])
df_test = pd.DataFrame(data_qas['test'])

cols = ['context', 'question', 'answer']
new_df_train = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_train['context']))):
    for j in df_train["qas"][i]:
        if len(j['answers']) != 0:
            new_df_train = new_df_train.append({'context': df_train["context"][i], 
                                                'question': j['question'], 
                                                'answer': {"text": j['answers'][0]['text'], 
                                                           "answer_start": j['answers'][0]['answer_start'], 
                                                           "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                           ignore_index=True)
        else:
            new_df_train = new_df_train.append({'context': df_train["context"][i], 
                                                'question': j['question'], 
                                                'answer': {"text": str(), 
                                                           "answer_start": 0, 
                                                           "answer_end": 0}}, 
                                                           ignore_index=True)

cols = ['context', 'question', 'answer']
new_df_val = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_validation['context']))):
    for j in df_validation["qas"][i]:
        if len(j['answers']) != 0:
            new_df_val = new_df_val.append({'context': df_validation["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": j['answers'][0]['text'], 
                                                       "answer_start": j['answers'][0]['answer_start'], 
                                                       "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                       ignore_index=True)
        else:
            new_df_val = new_df_val.append({'context': df_validation["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": str(), 
                                                       "answer_start": 0, 
                                                       "answer_end": 0}}, 
                                                       ignore_index=True)        

cols = ['context', 'question', 'answer']
new_df_test = pd.DataFrame(columns=cols)

for i in tqdm(range(len(df_test['context']))):
    for j in df_test["qas"][i]:
        if len(j['answers']) != 0:
            new_df_test = new_df_test.append({'context': df_test["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": j['answers'][0]['text'], 
                                                       "answer_start": j['answers'][0]['answer_start'], 
                                                       "answer_end": j['answers'][0]['answer_start'] + len(j['answers'][0]['text'])}}, 
                                                       ignore_index=True)
        else:
            new_df_test = new_df_test.append({'context': df_test["context"][i], 
                                            'question': j['question'], 
                                            'answer': {"text": str(), 
                                                       "answer_start": 0, 
                                                       "answer_end": 0}}, 
                                                       ignore_index=True)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})
data_qas



  0%|          | 0/3 [00:00<?, ?it/s]

100%|███████████████████████████████████████████████████████████████████████████████| 3659/3659 [01:46<00:00, 34.32it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 358/358 [00:08<00:00, 42.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 378/378 [00:08<00:00, 43.43it/s]


DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 9332
    })
    validation: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 764
    })
    test: Dataset({
        features: ['context', 'question', 'answer'],
        num_rows: 844
    })
})

# Convert to NLI, with hypothesis being just do concat question & answer

## Convert Dataset to DataFrame format

In [7]:
seed_value = 42
random.seed(seed_value)

In [8]:
if SAMPLE == sys.maxsize:
    data_qas_train_df = pd.DataFrame(data_qas["train"][:SAMPLE])
    data_qas_val_df = pd.DataFrame(data_qas["validation"][:SAMPLE])
    data_qas_test_df = pd.DataFrame(data_qas["test"][:SAMPLE])

else:
    data_qas_train_df = (pd.DataFrame(data_qas["train"])).sample(n=SAMPLE, random_state=42)
    data_qas_val_df = (pd.DataFrame(data_qas["validation"])).sample(n=SAMPLE, random_state=42)
    data_qas_test_df = (pd.DataFrame(data_qas["test"])).sample(n=SAMPLE, random_state=42)

    data_qas_train_df = data_qas_train_df.reset_index(drop=True)
    data_qas_val_df = data_qas_val_df.reset_index(drop=True)
    data_qas_test_df = data_qas_test_df.reset_index(drop=True)

## Retrieve answer text only

In [9]:
def retrieve_answer_text(data):
    for i in range(len(data)):
        data['answer'][i] = data['answer'][i]['text']
    return data

In [10]:
data_qas_train_df = retrieve_answer_text(data_qas_train_df)
data_qas_val_df = retrieve_answer_text(data_qas_val_df)
data_qas_test_df = retrieve_answer_text(data_qas_test_df)

## Delete all unanswerable row

In [11]:
data_qas_train_df = data_qas_train_df[data_qas_train_df['answer'] != '']
data_qas_val_df = data_qas_val_df[data_qas_val_df['answer'] != '']
data_qas_test_df = data_qas_test_df[data_qas_test_df['answer'] != '']

### Reset index number

In [12]:
data_qas_train_df = data_qas_train_df.reset_index(drop=True)
data_qas_val_df = data_qas_val_df.reset_index(drop=True)
data_qas_test_df = data_qas_test_df.reset_index(drop=True)

In [13]:
print(len(data_qas_train_df))
print(len(data_qas_val_df))
print(len(data_qas_test_df))

53
50
53


## Create NLI dataset from copy of QA dataset above

In [14]:
data_nli_train_df = data_qas_train_df.copy()
data_nli_val_df = data_qas_val_df.copy()
data_nli_test_df = data_qas_test_df.copy()

## Convert context pair to premise (only renaming column)

In [15]:
data_nli_train_df = data_nli_train_df.rename(columns={"context": "premise"})
data_nli_val_df = data_nli_val_df.rename(columns={"context": "premise"})
data_nli_test_df = data_nli_test_df.rename(columns={"context": "premise"})

# Add contradiction label cases

## Import pipeline to create contradiction cases

In [16]:
nlp_tools_ner = pipeline(task = TASK_NER_NAME, 
                     model = MODEL_NER_NAME, 
                     tokenizer = AutoTokenizer.from_pretrained(MODEL_NER_NAME, 
                                                               model_max_length=512, 
                                                               truncation=True),
                     aggregation_strategy = 'simple')

In [17]:
nlp_tools_chunking = pipeline(task = TASK_CHUNKING_NAME, 
                     model = MODEL_CHUNKING_NAME, 
                     tokenizer = AutoTokenizer.from_pretrained(MODEL_CHUNKING_NAME, 
                                                               model_max_length=512, 
                                                               truncation=True),
                     aggregation_strategy = 'simple')

## Add NER and chunking tag column in DataFrame

In [18]:
def remove_space_after_number_and_punctuation(text):
    pattern = r'(\d+)\s*([.,])\s*(?=\S|$)'
    cleaned_text = re.sub(pattern, r'\1\2', text)
    return cleaned_text

In [23]:
def add_premise_tag(data, tag, index, premise_array, ner=nlp_tools_ner, chunking=nlp_tools_chunking):

    if tag == "ner": tools=ner
    else: tools=chunking
    
    if len(tools(data['premise'][index])) == 0:
        premise_array.append("NO TOKEN DETECTED")
    
    else:
        for j in tools(data['premise'][index]):
            tag_premise = (j['entity_group'], remove_space_after_number_and_punctuation(j['word']))
            premise_array.append(tag_premise)

    return premise_array

In [33]:
def remove_punctuation(text):
    return text.strip(string.punctuation)

In [46]:
def add_row_tag(answer, tag, premise_array, ner=nlp_tools_ner, chunking=nlp_tools_chunking):

    if tag == "ner": tools=ner
    else: tools=chunking

    tag_answer_list = []
    
    if len(premise_array) != 0:
        for i in premise_array:
            
            label_from_premise_tag = i[0]
            word_from_premise_tag = remove_space_after_number_and_punctuation(i[1])
            
            splitted_word_from_premise_tag = set(remove_punctuation(text) for text in word_from_premise_tag.split())
            
            # With assumption, that I do not divide label when
            # there is more than one label in one word answer.
            # Instead, I give a NULL.
            
            if word_from_premise_tag.lower() == answer.lower():
                tag_answer = (label_from_premise_tag, word_from_premise_tag)
                break
            
            # Or, I could do this: to reducing NULL label with char not really with string.
            # But, the tradeoff is real-answer can be replace with word in premise_array
            #elif answer.lower() in word_from_premise_tag:
            #    tag_answer = (label_from_premise_tag, answer.lower())
            #    break
            
            else:
                tag_answer = ("NULL", answer)
        tag_answer_list.append(tag_answer)

    else:
        tag_answer = ("NULL", answer)
        tag_answer_list.append(tag_answer)
        
    return tag_answer_list

In [47]:
arr_premise = [('PLACE', 'kota ho chi minh'), ('PLACE', 'vietnam'), ('PLACE', 'thanh pho ho chi minh'), ('PLACE', 'vietnam'), ('PLACE', 'sungai mekong'), ('PLACE', 'prey nok'), ('PLACE', 'kamboja'), ('PLACE', 'vietnam'), ('PLACE', 'saigon'), ('PLACE', 'vietnam'), ('PLACE', 'koloni perancis cochinchina'), ('PLACE', 'vietnam selatan'), ('PLACE', 'saigon'), ('PLACE', 'provinsi gia'), ('PLACE', 'kota ho chi minh'), ('PLACE', 'saigon'), ('PLACE', 'sungai saigon'), ('PLACE', 'china selatan')]

x = add_row_tag("Ho Chi Minh", "ner", arr_premise)
x

[('PLACE', 'ho chi minh')]

In [35]:
def add_ner_and_chunking_all_tag(data):
    
    data['ner_tag_answer'] = ""
    data['chunking_tag_answer'] = ""
    
    data['ner_tag_premise'] = ""
    data['chunking_tag_premise'] = ""
    
    for i in tqdm(range(len(data))):
        
        answer = data['answer'][i]
        premise = data['premise'][i]
        
        ner_premise_array = []
        chunking_premise_array = []
                                                
        data['ner_tag_premise'][i] = add_premise_tag(data, "ner", i, ner_premise_array)
        data['chunking_tag_premise'][i] = add_premise_tag(data, "chunking", i, chunking_premise_array)
        
        data['ner_tag_answer'][i] = add_row_tag(answer, "ner", data['ner_tag_premise'][i])
        data['chunking_tag_answer'][i] = add_row_tag(answer, "chunking", data['chunking_tag_premise'][i])
        
        print("answer:", answer)
        print("ner_tag_premise:", data['ner_tag_premise'][i])
        print("ner_tag_answer:", data['ner_tag_answer'][i])
        print()
    
    return data

In [36]:
data_nli_train_df = add_ner_and_chunking_all_tag(data_nli_train_df)
data_nli_val_df = add_ner_and_chunking_all_tag(data_nli_val_df)
data_nli_test_df = add_ner_and_chunking_all_tag(data_nli_test_df)

  2%|█▌                                                                                  | 1/53 [00:05<04:45,  5.49s/it]

answer: hitam di atas, merah di tengah, dan kuning ("emas") di bawah
ner_tag_premise: [('PLACE', 'jerman'), ('PLACE', 'jerman barat'), ('PLACE', 'jerman')]
ner_tag_answer: [('NULL', 'hitam di atas, merah di tengah, dan kuning ("emas") di bawah')]



  4%|███▏                                                                                | 2/53 [00:17<08:10,  9.62s/it]

answer: Paus
ner_tag_premise: [('PERSON', 'paus'), ('PLACE', 'dutch'), ('PLACE', 'latin'), ('PLACE', 'roma'), ('PLACE', 'roma'), ('PERSON', 'santo petrus'), ('PERSON', 'yesus'), ('PERSON', 'paus fransiskus'), ('PERSON', 'paus benediktus xvi')]
ner_tag_answer: [('PERSON', 'paus')]



  6%|████▊                                                                               | 3/53 [00:26<07:39,  9.20s/it]

answer: 1 Januari 2002
ner_tag_premise: [('PLACE', 'uni'), ('ORGANISATION', 'eropa')]
ner_tag_answer: [('NULL', '1 Januari 2002')]



  8%|██████▎                                                                             | 4/53 [00:39<08:32, 10.46s/it]

answer: 36.000 km2
ner_tag_premise: [('PLACE', 'kabupaten kaimana'), ('PLACE', 'provinsi papua barat'), ('PLACE', 'indonesia'), ('PLACE', 'kabupaten kaimana'), ('PLACE', 'kabupaten sarmi'), ('PLACE', 'kabupaten keerom'), ('PLACE', 'kabupaten sorong selatan'), ('PLACE', 'kabupaten radja ampat'), ('PLACE', 'kabupaten pegunungan bintang'), ('PLACE', 'kabupaten yahukimo'), ('PLACE', 'kabupaten tolikara'), ('PLACE', 'kabupaten waropen'), ('PLACE', 'kabupaten kaimana'), ('PLACE', 'kabupaten boven digoel'), ('PLACE', 'kabupaten mappi'), ('PLACE', 'kabupaten asmat'), ('PLACE', 'kabupaten teluk bintuni'), ('PLACE', 'kabupaten wondama'), ('PLACE', 'papua'), ('PLACE', 'distrik kaimana'), ('PLACE', 'kabupaten kaimana')]
ner_tag_answer: [('NULL', '36.000 km2')]



  9%|███████▉                                                                            | 5/53 [00:48<08:00, 10.01s/it]

answer: Wielkopolska
ner_tag_premise: [('PERSON', 'plwacz'), ('PLACE', 'pl'), ('PERSON', '##wa'), ('PLACE', 'kalisz'), ('PLACE', 'poznan'), ('PLACE', 'uj'), ('PLACE', 'Nakło'), ('PLACE', 'wielkopolsk'), ('PLACE', 'sungai warta')]
ner_tag_answer: [('NULL', 'Wielkopolska')]



 11%|█████████▌                                                                          | 6/53 [00:59<08:17, 10.58s/it]

answer: melakukan operasi aritmetika dan logika terhadap data yang diambil dari memori atau dari informasi yang dimasukkan melalui beberapa perangkat keras, seperti papan tombol, pemindai, tuas kontrol, maupun tetikus
ner_tag_premise: ['NO TOKEN DETECTED']
ner_tag_answer: [('NULL', 'melakukan operasi aritmetika dan logika terhadap data yang diambil dari memori atau dari informasi yang dimasukkan melalui beberapa perangkat keras, seperti papan tombol, pemindai, tuas kontrol, maupun tetikus')]



 13%|███████████                                                                         | 7/53 [01:09<07:50, 10.23s/it]

answer: 28
ner_tag_premise: [('ORGANISATION', 'uni eropa'), ('ORGANISATION', 'ue'), ('PLACE', 'eropa'), ('ORGANISATION', 'uni eropa'), ('ORGANISATION', 'ma'), ('PLACE', '##astrich'), ('ORGANISATION', 'ue')]
ner_tag_answer: [('NULL', '28')]



 15%|████████████▋                                                                       | 8/53 [01:21<08:02, 10.73s/it]

answer: 358,55km²
ner_tag_premise: [('PLACE', 'kota palembang'), ('PLACE', 'provinsi sumatera selatan'), ('PLACE', 'palembang'), ('PLACE', 'sumatera'), ('PLACE', 'medan'), ('PLACE', 'kota palembang'), ('PLACE', 'jakabaring'), ('PLACE', 'tanjung api - api'), ('PLACE', 'kota palembang')]
ner_tag_answer: [('NULL', '358,55km²')]



 17%|██████████████▎                                                                     | 9/53 [01:31<07:46, 10.59s/it]

answer: 9.000.000km2
ner_tag_premise: [('PLACE', 'sahara'), ('PLACE', 'afrika'), ('PLACE', 'samudra atlantik'), ('PLACE', 'laut merah'), ('PLACE', 'laut tengah'), ('PLACE', 'sahel'), ('PLACE', 'mauritania'), ('PLACE', 'mesir'), ('PLACE', 'afrika'), ('PLACE', 'afrika utara'), ('PLACE', 'afrika')]
ner_tag_answer: [('NULL', '9.000.000km2')]



 19%|███████████████▋                                                                   | 10/53 [01:37<06:38,  9.27s/it]

answer: Armonk, New York, Amerika Serikat
ner_tag_premise: [('ORGANISATION', 'international business machines corporation'), ('ORGANISATION', 'ibm'), ('ORGANISATION', 'nyse'), ('PLACE', 'amerika serikat'), ('ORGANISATION', 'ibm'), ('PLACE', 'armonk'), ('PLACE', 'new york'), ('PLACE', 'amerika serikat')]
ner_tag_answer: [('NULL', 'Armonk, New York, Amerika Serikat')]



 21%|█████████████████▏                                                                 | 11/53 [01:44<05:53,  8.42s/it]

answer: Pontianak
ner_tag_premise: [('PLACE', 'belanda'), ('PLACE', 'gouvernement borneo'), ('PLACE', 'banjarmasin'), ('PLACE', 'residentie westerafdeeling van borneo'), ('PLACE', 'pontianak')]
ner_tag_answer: [('PLACE', 'pontianak')]



 23%|██████████████████▊                                                                | 12/53 [01:58<06:51, 10.03s/it]

answer: Ho Chi Minh
ner_tag_premise: [('PLACE', 'kota ho chi minh'), ('PLACE', 'vietnam'), ('PLACE', 'thanh pho ho chi minh'), ('PLACE', 'vietnam'), ('PLACE', 'sungai mekong'), ('PLACE', 'prey nok'), ('PLACE', 'kamboja'), ('PLACE', 'vietnam'), ('PLACE', 'saigon'), ('PLACE', 'vietnam'), ('PLACE', 'koloni perancis cochinchina'), ('PLACE', 'vietnam selatan'), ('PLACE', 'saigon'), ('PLACE', 'provinsi gia'), ('PLACE', 'kota ho chi minh'), ('PLACE', 'saigon'), ('PLACE', 'sungai saigon'), ('PLACE', 'china selatan')]
ner_tag_answer: [('NULL', 'Ho Chi Minh')]



 25%|████████████████████▎                                                              | 13/53 [02:15<08:10, 12.26s/it]

answer: stimulus yang tidak atau belum menghasilkan sebuah respon tertentu
ner_tag_premise: [('PERSON', 'pavlov'), ('PERSON', 'pavlov'), ('PERSON', 'pavlov')]
ner_tag_answer: [('NULL', 'stimulus yang tidak atau belum menghasilkan sebuah respon tertentu')]



 26%|█████████████████████▉                                                             | 14/53 [02:18<06:10,  9.49s/it]

answer: 1920an
ner_tag_premise: [('PLACE', 'indonesia'), ('PLACE', 'indonesia'), ('PLACE', 'belanda')]
ner_tag_answer: [('NULL', '1920an')]



 28%|███████████████████████▍                                                           | 15/53 [02:35<07:29, 11.82s/it]

answer: Nate dengan mentor dan figur ayah, Victor "Sully" Sullivan
ner_tag_premise: [('PERSON', 'nate'), ('PERSON', 'victor " sully " sullivan'), ('PLACE', 'arab'), ('PLACE', 'rub'), ('PLACE', 'al khali desert'), ('PLACE', '##s'), ('PERSON', 'elena fisher'), ('PERSON', 'chloe frazer'), ('PERSON', 'charlie cutter'), ('PERSON', 'salim'), ('PERSON', 'nate'), ('PERSON', 'katherine marlowe'), ('PERSON', 'talbot'), ('PERSON', 'ramses')]
ner_tag_answer: [('NULL', 'Nate dengan mentor dan figur ayah, Victor "Sully" Sullivan')]



 30%|█████████████████████████                                                          | 16/53 [02:46<07:03, 11.45s/it]

answer: diperuntukkan bagi wanita
ner_tag_premise: [('PLACE', 'yunani'), ('PLACE', 'yunani'), ('PLACE', 'india')]
ner_tag_answer: [('NULL', 'diperuntukkan bagi wanita')]



 32%|██████████████████████████▌                                                        | 17/53 [02:49<05:23,  9.00s/it]

answer: substansi organik yang dibutuhkan organisme untuk fungsi normal dari sistem tubuh, pertumbuhan, pemeliharaan kesehatan
ner_tag_premise: ['NO TOKEN DETECTED']
ner_tag_answer: [('NULL', 'substansi organik yang dibutuhkan organisme untuk fungsi normal dari sistem tubuh, pertumbuhan, pemeliharaan kesehatan')]



 34%|████████████████████████████▏                                                      | 18/53 [02:56<04:50,  8.31s/it]

answer: 1.812,8km2
ner_tag_premise: [('PLACE', 'kabupaten lamongan'), ('PLACE', 'kabupaten lamongan'), ('PLACE', 'provinsi jawa timur'), ('PLACE', 'kabupaten lamongan')]
ner_tag_answer: [('NULL', '1.812,8km2')]



 36%|█████████████████████████████▊                                                     | 19/53 [03:05<04:52,  8.61s/it]

answer: 21 Maret 1685
ner_tag_premise: [('PERSON', 'johann sebastian bach'), ('PLACE', 'kota eisenach'), ('PLACE', 'jerman'), ('PERSON', 'johann ambrosius bach'), ('PERSON', 'johann sebastian'), ('PERSON', 'bach'), ('PLACE', 'ohrdruf'), ('PERSON', 'johann christoph bach'), ('PLACE', 'ohrdruf'), ('PERSON', 'bach'), ('PLACE', 'lyceum'), ('PERSON', 'bach'), ('PERSON', 'bach'), ('PERSON', 'bach')]
ner_tag_answer: [('NULL', '21 Maret 1685')]



 38%|███████████████████████████████▎                                                   | 20/53 [03:15<04:57,  9.02s/it]

answer: 1 Juli 1971
ner_tag_premise: [('PERSON', 'nicolaas jouwe'), ('ORGANISATION', 'opm'), ('PERSON', 'seth jafeth roemkorem'), ('PERSON', 'jacob hendrik prai'), ('PLACE', 'papua'), ('PERSON', 'roemkorem'), ('PERSON', 'prai'), ('PLACE', 'papua barat')]
ner_tag_answer: [('NULL', '1 Juli 1971')]



 40%|████████████████████████████████▉                                                  | 21/53 [03:26<05:09,  9.68s/it]

answer: California, Amerika Serikat
ner_tag_premise: [('PLACE', 'amerika'), ('PLACE', 'california'), ('PLACE', 'amerika serikat')]
ner_tag_answer: [('NULL', 'California, Amerika Serikat')]



 42%|██████████████████████████████████▍                                                | 22/53 [03:39<05:24, 10.47s/it]

answer: 4 Februari 1937
ner_tag_premise: [('ORGANISATION', 'walt disney'), ('ORGANISATION', 'rko radio pictures'), ('PLACE', 'bioskop'), ('PLACE', 'carthay circle'), ('PLACE', 'amerika serikat'), ('ORGANISATION', '##s grimm'), ('PERSON', 'snow white'), ('PLACE', 'amerika serikat'), ('ORGANISATION', 'walt disney'), ('ORGANISATION', 'disney'), ('ORGANISATION', 'walt disney')]
ner_tag_answer: [('NULL', '4 Februari 1937')]



 43%|████████████████████████████████████                                               | 23/53 [03:48<05:07, 10.24s/it]

answer: Karl Marx
ner_tag_premise: [('PERSON', 'karl marx'), ('PERSON', 'marx'), ('PERSON', 'marx'), ('PERSON', 'friedrich engels')]
ner_tag_answer: [('PERSON', 'karl marx')]



 45%|█████████████████████████████████████▌                                             | 24/53 [03:57<04:42,  9.74s/it]

answer: Kastilia
ner_tag_premise: [('PLACE', 'republik dominika'), ('PLACE', 'karibia'), ('PLACE', 'eropa'), ('PLACE', 'afrika'), ('PLACE', 'kastilia'), ('PLACE', 'spanyol'), ('PLACE', 'inggris'), ('PLACE', 'perancis'), ('PLACE', 'jerman'), ('PLACE', 'haiti'), ('PLACE', 'italia'), ('PLACE', 'eropa'), ('PLACE', 'afrika'), ('PLACE', 'taino')]
ner_tag_answer: [('PLACE', 'kastilia')]



 47%|███████████████████████████████████████▏                                           | 25/53 [04:00<03:36,  7.72s/it]

answer: Pangeran Samudra
ner_tag_premise: [('PERSON', 'pangeran samudra'), ('PLACE', 'kerajaan banjar'), ('PERSON', 'sultan suriansyah'), ('PERSON', 'khat'), ('PERSON', '##ib dayan')]
ner_tag_answer: [('PERSON', 'pangeran samudra')]



 49%|████████████████████████████████████████▋                                          | 26/53 [04:15<04:25,  9.82s/it]

answer: 78,772km
ner_tag_premise: [('PLACE', 'skotlandia'), ('PLACE', 'pulau britania raya'), ('PLACE', 'benua eropa'), ('PLACE', 'skotlandia'), ('PLACE', 'jawa'), ('PLACE', 'inggris'), ('PLACE', 'skotlandia'), ('PLACE', 'basin sungai tweed'), ('PLACE', 'solway firth'), ('PLACE', 'skotlandia'), ('PLACE', 'atlantik'), ('PLACE', 'laut utara'), ('PLACE', 'pulau irlandia'), ('PLACE', 'kintyre'), ('PLACE', 'norwegia'), ('PLACE', 'kepulauan faroe')]
ner_tag_answer: [('NULL', '78,772km')]



 51%|██████████████████████████████████████████▎                                        | 27/53 [04:25<04:18,  9.93s/it]

answer: 1838
ner_tag_premise: [('PLACE', 'pulau pitcairn'), ('PLACE', 'inggris'), ('PLACE', 'pulau pitcairn'), ('PLACE', 'inggris'), ('PLACE', 'inggris'), ('PLACE', 'pulau norfolk'), ('PLACE', 'pulau pitcairn'), ('PLACE', 'pulau norfolk'), ('PLACE', 'morayshire'), ('PLACE', 'pulau norfork'), ('PLACE', 'pitcairn'), ('PLACE', 'norfolk'), ('PLACE', 'pulau pitcairn')]
ner_tag_answer: [('NULL', '1838')]



 53%|███████████████████████████████████████████▊                                       | 28/53 [04:37<04:25, 10.61s/it]

answer: 1939
ner_tag_premise: [('PERSON', 'albert einstein'), ('PLACE', 'as'), ('PERSON', 'franklin d. roosevelt'), ('PLACE', 'as'), ('PLACE', 'manhattan'), ('PLACE', 'jerman'), ('PLACE', 'jerman'), ('PLACE', 'jepang')]
ner_tag_answer: [('NULL', '1939')]



 55%|█████████████████████████████████████████████▍                                     | 29/53 [04:43<03:40,  9.17s/it]

answer: 1951
ner_tag_premise: [('PERSON', 'david mcclure brinkley'), ('ORGANISATION', 'nbc'), ('ORGANISATION', 'abc'), ('PLACE', 'amerika serikat'), ('PLACE', 'north carolina')]
ner_tag_answer: [('NULL', '1951')]



 57%|██████████████████████████████████████████████▉                                    | 30/53 [04:53<03:36,  9.42s/it]

answer: keinginan mereka untuk tidak terlibat dalam konfrontasi ideologi Barat-Timur
ner_tag_premise: [('PLACE', 'asia'), ('PLACE', 'afrika'), ('PLACE', 'bandung'), ('PLACE', 'indonesia'), ('PERSON', 'josip broz tito'), ('PLACE', 'yugoslavia'), ('PERSON', 'soekarno'), ('PLACE', 'indonesia'), ('PERSON', 'gamal abdul nasser'), ('PLACE', 'mesir'), ('PERSON', 'pandit jawaharlal nehru'), ('PLACE', 'india'), ('PERSON', 'kwame nkrumah'), ('PLACE', 'ghana')]
ner_tag_answer: [('NULL', 'keinginan mereka untuk tidak terlibat dalam konfrontasi ideologi Barat-Timur')]



 58%|████████████████████████████████████████████████▌                                  | 31/53 [04:57<02:53,  7.88s/it]

answer: 1679
ner_tag_premise: [('PLACE', 'kesultanan kanoman'), ('PLACE', 'kesultanan kasepuhan'), ('PERSON', 'sultan anom i pangeran muhammad badrudin kartawijaya')]
ner_tag_answer: [('NULL', '1679')]



 60%|██████████████████████████████████████████████████                                 | 32/53 [05:10<03:17,  9.42s/it]

answer: 25 April 1833
ner_tag_premise: [('PERSON', 'marko miljanov popovic'), ('PLACE', 'brda'), ('PLACE', 'montenegro'), ('PERSON', 'danilo i'), ('PLACE', 'montenegro'), ('PLACE', 'kuci'), ('PLACE', 'utsman'), ('PLACE', 'montenegro'), ('PERSON', 'miljanov'), ('PERSON', 'nikola i'), ('PLACE', 'montenegro'), ('PERSON', 'olgivanna lloyd wright'), ('PERSON', 'frank lloyd wright'), ('PLACE', 'amerika serikat')]
ner_tag_answer: [('NULL', '25 April 1833')]



 62%|███████████████████████████████████████████████████▋                               | 33/53 [05:15<02:41,  8.09s/it]

answer: kumpulan benda langit yang terdiri atas sebuah bintang yang disebut Matahari dan semua objek yang terikat oleh gaya gravitasinya
ner_tag_premise: ['NO TOKEN DETECTED']
ner_tag_answer: [('NULL', 'kumpulan benda langit yang terdiri atas sebuah bintang yang disebut Matahari dan semua objek yang terikat oleh gaya gravitasinya')]



 64%|█████████████████████████████████████████████████████▏                             | 34/53 [05:21<02:21,  7.46s/it]

answer: 1930
ner_tag_premise: [('ORGANISATION', 'majalah'), ('ORGANISATION', 'panji pustaka jakarta'), ('ORGANISATION', 'sinar deli medan'), ('PERSON', 'h. moh. said'), ('ORGANISATION', 'harian waspada'), ('PERSON', 'h. moh. said')]
ner_tag_answer: [('NULL', '1930')]



 66%|██████████████████████████████████████████████████████▊                            | 35/53 [05:35<02:50,  9.49s/it]

answer: B&W Seaplane
ner_tag_premise: [('PERSON', 'william boeing'), ('PERSON', 'george conrad westervelt'), ('PLACE', 'as'), ('ORGANISATION', 'boeing'), ('PERSON', 'westervelt'), ('PERSON', 'curtiss'), ('ORGANISATION', 'boeing'), ('PERSON', 'glenn martin'), ('PERSON', 'glenn martin'), ('PERSON', 'martin'), ('ORGANISATION', 'boeing'), ('ORGANISATION', 'boeing'), ('PERSON', 'cdr. g. c. westervelt'), ('ORGANISATION', 'boeing'), ('PLACE', 'seattle lake union'), ('ORGANISATION', 'boeing')]
ner_tag_answer: [('NULL', 'B&W Seaplane')]



 68%|████████████████████████████████████████████████████████▍                          | 36/53 [05:40<02:18,  8.17s/it]

answer: 2004-2007
ner_tag_premise: [('PERSON', 'christoph blocher'), ('PLACE', 'schaffhausen'), ('PLACE', 'swiss'), ('PLACE', 'swiss'), ('ORGANISATION', 'dewan federal swiss'), ('ORGANISATION', 'partai rakyat swiss'), ('ORGANISATION', 'ems'), ('ORGANISATION', 'chemie')]
ner_tag_answer: [('NULL', '2004-2007')]



 70%|█████████████████████████████████████████████████████████▉                         | 37/53 [05:48<02:07,  8.00s/it]

answer: Masohi
ner_tag_premise: [('PLACE', 'kabupaten maluku tengah'), ('PLACE', 'maluku'), ('PLACE', 'indonesia'), ('PLACE', 'masohi'), ('PLACE', 'pulau seram'), ('PLACE', 'kecamatan amahai'), ('PLACE', 'tehoru'), ('PLACE', 'kota masohi'), ('PLACE', 'pulau ambon'), ('PLACE', 'kecamatan leihitu'), ('PLACE', 'salahutu')]
ner_tag_answer: [('PLACE', 'masohi')]



 72%|███████████████████████████████████████████████████████████▌                       | 38/53 [05:57<02:05,  8.39s/it]

answer: pita frekuensi radio tertinggi
ner_tag_premise: [('PLACE', 'english'), ('ORGANISATION', 'eh'), ('ORGANISATION', 'eh')]
ner_tag_answer: [('NULL', 'pita frekuensi radio tertinggi')]



 74%|█████████████████████████████████████████████████████████████                      | 39/53 [06:10<02:15,  9.71s/it]

answer: Triticum spp
ner_tag_premise: [('PLACE', 'nepal'), ('ORGANISATION', 'triticum')]
ner_tag_answer: [('NULL', 'Triticum spp')]



 75%|██████████████████████████████████████████████████████████████▋                    | 40/53 [06:21<02:10, 10.07s/it]

answer: 1809
ner_tag_premise: [('PERSON', 'samuel thomas von sommering'), ('PERSON', 'baron schilling'), ('PERSON', 'carl friedrich gauss'), ('PERSON', 'wilhelm weber'), ('PLACE', 'gottingen'), ('PERSON', 'william fothergill cooke'), ('ORGANISATION', 'great western railway'), ('PLACE', 'inggris'), ('PLACE', 'inggris'), ('PLACE', 'stasiun'), ('PLACE', 'paddington'), ('PLACE', 'west drayton')]
ner_tag_answer: [('NULL', '1809')]



 77%|████████████████████████████████████████████████████████████████▏                  | 41/53 [06:25<01:40,  8.37s/it]

answer: mempersiapkan anggotanya dalam berbagai bidang pelayanan (pendidikan, sosial, politik, kemasyarakatan, dll) di Indonesia
ner_tag_premise: [('ORGANISATION', 'gerakan angkatan muda kristen indonesia'), ('ORGANISATION', 'gamki'), ('PLACE', 'indonesia')]
ner_tag_answer: [('NULL', 'mempersiapkan anggotanya dalam berbagai bidang pelayanan (pendidikan, sosial, politik, kemasyarakatan, dll) di Indonesia')]



 79%|█████████████████████████████████████████████████████████████████▊                 | 42/53 [06:34<01:31,  8.35s/it]

answer: Gudeg
ner_tag_premise: [('PLACE', 'jawa'), ('PLACE', 'yogyakarta'), ('PLACE', 'jawa tengah')]
ner_tag_answer: [('NULL', 'Gudeg')]



 81%|███████████████████████████████████████████████████████████████████▎               | 43/53 [06:39<01:15,  7.52s/it]

answer: 14
ner_tag_premise: [('ORGANISATION', 'ui')]
ner_tag_answer: [('NULL', '14')]



 83%|████████████████████████████████████████████████████████████████████▉              | 44/53 [06:49<01:12,  8.06s/it]

answer: 24
ner_tag_premise: [('ORGANISATION', 'tv tokyo jepang'), ('ORGANISATION', 'akb48'), ('ORGANISATION', 'ske48'), ('ORGANISATION', 'sdn48')]
ner_tag_answer: [('NULL', '24')]



 85%|██████████████████████████████████████████████████████████████████████▍            | 45/53 [06:54<00:58,  7.29s/it]

answer: Peso
ner_tag_premise: [('ORGANISATION', 'mx'), ('PLACE', 'meksiko')]
ner_tag_answer: [('NULL', 'Peso')]



 87%|████████████████████████████████████████████████████████████████████████           | 46/53 [07:05<00:59,  8.43s/it]

answer: Hindia Timur
ner_tag_premise: [('PLACE', 'asia'), ('PLACE', 'asia'), ('PLACE', 'asia'), ('PLACE', 'hindia timur'), ('PLACE', 'sumatra'), ('PLACE', '##dang')]
ner_tag_answer: [('PLACE', 'hindia timur')]



 89%|█████████████████████████████████████████████████████████████████████████▌         | 47/53 [07:10<00:44,  7.34s/it]

answer: pemain sepak bola berkewarganegaraan Inggris
ner_tag_premise: [('PERSON', 'jordan spence'), ('PLACE', 'inggris'), ('ORGANISATION', 'milton keynes dons')]
ner_tag_answer: [('NULL', 'pemain sepak bola berkewarganegaraan Inggris')]



 91%|███████████████████████████████████████████████████████████████████████████▏       | 48/53 [07:21<00:41,  8.32s/it]

answer: 1293
ner_tag_premise: [('PLACE', 'kerajaan majapahit'), ('PLACE', 'javanes'), ('PLACE', 'nagari kara'), ('PLACE', 'wilwati'), ('PLACE', 'jawa timur'), ('PLACE', 'indonesia'), ('PLACE', 'nusantara'), ('PERSON', 'hayam wuruk')]
ner_tag_answer: [('NULL', '1293')]



 92%|████████████████████████████████████████████████████████████████████████████▋      | 49/53 [07:29<00:33,  8.40s/it]

answer: Agra, India
ner_tag_premise: [('PLACE', 'taj mahal'), ('PLACE', 'urdu'), ('PLACE', 'agra'), ('PLACE', 'india'), ('PLACE', 'mughal'), ('PERSON', 'shah jahan'), ('PERSON', 'jahangir'), ('PLACE', 'persia'), ('PERSON', 'arjumand banu begum'), ('PLACE', 'mumtaz - ul -'), ('PLACE', 'mumtaz mahal'), ('PLACE', 'taj mahal'), ('PLACE', 'mughal')]
ner_tag_answer: [('NULL', 'Agra, India')]



 94%|██████████████████████████████████████████████████████████████████████████████▎    | 50/53 [07:34<00:21,  7.29s/it]

answer: suatu istilah yang mulai digunakan pada tahun 1920-an untuk mengistilahkan jenis media yang secara khusus didesain untuk mencapai masyarakat yang sangat luas
ner_tag_premise: ['NO TOKEN DETECTED']
ner_tag_answer: [('NULL', 'suatu istilah yang mulai digunakan pada tahun 1920-an untuk mengistilahkan jenis media yang secara khusus didesain untuk mencapai masyarakat yang sangat luas')]



 96%|███████████████████████████████████████████████████████████████████████████████▊   | 51/53 [07:42<00:14,  7.48s/it]

answer: 03 April 1973
ner_tag_premise: [('PERSON', 'martin cooper'), ('ORGANISATION', 'motorola'), ('ORGANISATION', 'motorola'), ('PERSON', 'cooper'), ('PERSON', 'cooper')]
ner_tag_answer: [('NULL', '03 April 1973')]



 96%|███████████████████████████████████████████████████████████████████████████████▊   | 51/53 [08:03<00:18,  9.47s/it]


KeyboardInterrupt: 

# Create wrong answer

This is the flow to create wrong answer:

1. Check the NER and POS/Chunking labels of the right_answer and context/premise.

2. Search and group NER and POS/Chunking labels that match the right_answer throughout the context/premise.

3. Perform NER classification. There will be two branches here, namely:

   3a. If the NER of the right_answer can be detected, then calculate the distance using semantic similarity or word vectors between the right_answer and various possible wrong_answers with the same NER as the right_answer. Once done, proceed to the final wrong_answer.
   
   3b. If the NER of the right_answer cannot be detected (NULL) or context/premise does not contain any of NER of right_answer, then the POS/Chunking of the right_answer will be identified.
   
4. Perform POS/Chunking classification. Continuation from point 3b. There will be two more branches:

   4a. If the POS/Chunking of the right_answer can be detected, then calculate the distance using semantic similarity or word vectors between the right_answer and various possible wrong_answers with the same POS/Chunking as the right_answer. Once done, proceed to the final wrong_answer.
   
   4b. If the POS/Chunking of the right_answer cannot be detected (NULL) or context/premise does not contain any of NER of right_answer, then the final wrong_answer will be chosen based on a random word (random_word) from the context/premise.

In [None]:
model_similarity = SentenceTransformer(MODEL_SIMILARITY_NAME)

def return_similarity_sorted_array(right_answer, sentence_array, model=model_similarity):
    
    right_answer = right_answer.lower()
    
    embedding_right_answer = model.encode([right_answer], convert_to_tensor=True, device=device)
    embedding_sentence_array = model.encode(sentence_array, convert_to_tensor=True, device=device)
    
    cosine_scores = util.pytorch_cos_sim(embedding_right_answer, embedding_sentence_array)
    
    sorted_indices = cosine_scores.argsort(descending=True)[0]
    sorted_array = [sentence_array[i] for i in sorted_indices]
    
    return sorted_array

In [None]:
def remove_values_with_hash(arr):
    return [item for item in arr if "#" not in item]

In [None]:
response = requests.get(URL_STOPWORD)

if response.status_code == 200:
    stopword_data = response.json()
else:
    print("Failed to download JSON.")

stopword_data = set([item for sublist in list(stopword_data.values()) for item in sublist])
stopword_data

In [None]:
def select_random_word(text, stopword_data=stopword_data):
    words = re.findall(r'\w+', text.lower())
    filtered_words = [word for word in words if word not in stopword_data and word not in string.punctuation]
    random_word = random.choice(filtered_words)
    return random_word

In [None]:
def grouping_same_tag(tag_answers, tag_premises, same_tag_array):

    for tag_premise in tag_premises:

        label_tag_premise = tag_premise[0]
        word_premise = tag_premise[1]

        for tag_answer in tag_answers:
            
            label_tag_answer = tag_answer[0]
            
            if label_tag_answer == label_tag_premise:
                same_tag_array.append(word_premise)

    return remove_values_with_hash(same_tag_array)

In [None]:
def remove_punctuation(text):
    return text.strip(string.punctuation)

In [None]:
def contains_only_punctuation(text):
    return all(char in string.punctuation for char in text)

In [None]:
def filtering_plausible_answer(answer, plausible_answer_array):
    
    answer = answer.lower()
    
    plausible_answer_array = [item.lower().strip() for item in plausible_answer_array]
    plausible_answer_array = [string for string in plausible_answer_array if not contains_only_punctuation(string)]
    plausible_answer_array = [remove_punctuation(text) for text in plausible_answer_array]
    
    final_plausible_answer_array = []
    answer_words = set(remove_punctuation(text) for text in answer.split())
    
    for plausible_answer in plausible_answer_array:
        plausible_answer_words = set(plausible_answer.split())
        if not plausible_answer_words.intersection(answer_words):
            final_plausible_answer_array.append(plausible_answer)
    
    return final_plausible_answer_array

In [None]:
def sorting_similarity(data, right_answer, index, tag, plausible_answer_array, premise):

    if tag == "ner": slice='same_ner_tag_answer'
    elif tag == "chunking": slice='same_chunking_tag_answer'
    else: slice=None

    # Find all the sorted (by similarity) plausible wrong answer, 
    # and remove hask & punctuation only answer
    if slice != None:
        wrong_answer_array = return_similarity_sorted_array(right_answer, data[slice][index])
    else:
        wrong_answer_array = return_similarity_sorted_array(right_answer, plausible_answer_array)
    
    plausible_answer_array = remove_values_with_hash(wrong_answer_array)
    plausible_answer_array = filtering_plausible_answer(right_answer, plausible_answer_array)
    
    try:
        # Only return the most similar to right_answer
        wrong_answer = plausible_answer_array[0].strip()
        
        if tag == "ner": 
            properties = "IDENTICAL NER labels were found, and the highest similarity score same NER array was selected"
        elif tag == "chunking":
            properties = "IDENTICAL Chunking labels were found, and the highest similarity score from same Chunking array was selected"
        else:
            properties = "NO CHUNKING labels were found, and the highest similarity score from plausible answer was selected"
    except:
        wrong_answer = select_random_word(premise)
        
        if tag == "ner": 
            properties = "Detected (NER) wrong answer that is the SAME as the right answer, search random word from premise"
        elif tag == "chunking":
            properties = "Detected (Chunking) wrong answer that is the SAME as the right answer, search random word from premise"
        else:
            properties = "Detected (Random) wrong answer that is the SAME as the right answer, search random word from premise"
    
    assert isinstance(wrong_answer, str)
    assert isinstance(plausible_answer_array, list)
    
    return wrong_answer, plausible_answer_array, properties

In [None]:
def create_wrong_answer(data):
    
    data['same_ner_tag_answer'] = ""
    data['same_chunking_tag_answer'] = ""
    data['wrong_answer'] = ""
    data['plausible_answer_based_on_method'] = ""
    data['properties'] = ""
    
    for i in tqdm(range(len(data))):
        
        right_answer = data['answer'][i]
        premise = data['premise'][i]

        same_ner_tag_answer_array = []
        same_chunking_tag_answer_array = []

        ner_tag_answer = data['ner_tag_answer'][i]
        ner_tag_premise = data['ner_tag_premise'][i]

        chunking_tag_answer = data['chunking_tag_answer'][i]
        chunking_tag_premise = data['chunking_tag_premise'][i]
        
        # Grouped with the same NER & Chunking group, between answer and word of premise
        data['same_ner_tag_answer'][i] = grouping_same_tag(ner_tag_answer,
                                                           ner_tag_premise,
                                                           same_ner_tag_answer_array)
        
        data['same_chunking_tag_answer'][i] = grouping_same_tag(chunking_tag_answer, 
                                                                chunking_tag_premise, 
                                                                same_chunking_tag_answer_array)
        
        # Start to create wrong answer
        plausible_answer_array = []

        # Perform NER classification
        # If the NER of the right_answer can be detected, then calculate the distance using semantic 
        # similarity or word vectors between the right_answer and various possible wrong_answers with 
        # the same NER as the right_answer. Once done, proceed to the final wrong_answer.
        if data['same_ner_tag_answer'][i] != []:
            wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                      i, "ner", plausible_answer_array, premise)
            
        # If the NER of the right_answer cannot be detected (NULL) or context/premise does not contain 
        # any of NER of right_answer, then the POS/Chunking of the right_answer will be identified.
        # Perform POS/Chunking classification
        else:
            # If the POS/Chunking of the right_answer can be detected, then calculate the distance 
            # using semantic similarity or word vectors between the right_answer and various possible 
            # wrong_answers with the same POS/Chunking as the right_answer. Once done, proceed to the 
            # final wrong_answer.
            if data['same_chunking_tag_answer'][i] != []:
                wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                          i, "chunking", plausible_answer_array, premise)
            
            # If the POS/Chunking of the right_answer cannot be detected (NULL) or context/premise 
            # does not contain any of NER of right_answer, then the final wrong_answer will be chosen 
            # based on a random word (random_word) from the context/premise.
            else:
                for chunking_tag in chunking_tag_premise:
                    plausible_answer_array.append(chunking_tag[1])

                wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                          i, "none", plausible_answer_array, premise)
        data['properties'][i] = properties
        data['wrong_answer'][i] = wrong_answer
        data['plausible_answer_based_on_method'][i] = plausible_answer_array
            
    return data       

In [None]:
def create_wrong_answer_with_removing_invalid_data(data):
    
    data['same_ner_tag_answer'] = ""
    data['same_chunking_tag_answer'] = ""
    data['wrong_answer'] = ""
    data['plausible_answer_based_on_method'] = ""
    data['properties'] = ""
    
    for i in tqdm(range(len(data))):
        
        right_answer = data['answer'][i]
        premise = data['premise'][i]

        same_ner_tag_answer_array = []
        same_chunking_tag_answer_array = []

        ner_tag_answer = data['ner_tag_answer'][i]
        ner_tag_premise = data['ner_tag_premise'][i]

        chunking_tag_answer = data['chunking_tag_answer'][i]
        chunking_tag_premise = data['chunking_tag_premise'][i]
        
        # Grouped with the same NER & Chunking group, between answer and word of premise
        data['same_ner_tag_answer'][i] = grouping_same_tag(ner_tag_answer,
                                                           ner_tag_premise,
                                                           same_ner_tag_answer_array)
        
        data['same_chunking_tag_answer'][i] = grouping_same_tag(chunking_tag_answer, 
                                                                chunking_tag_premise, 
                                                                same_chunking_tag_answer_array)
        
        # Start to create wrong answer
        plausible_answer_array = []
        
        # Golden rules: If same_NER isn't there, just drop it. If NER is NULL, check chunking
        
        if ner_tag_answer[0][0] == "NULL":
            if data['same_chunking_tag_answer'][i] != []:
                wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                          i, "chunking", plausible_answer_array, premise)
            else:
                for chunking_tag in chunking_tag_premise:
                    plausible_answer_array.append(chunking_tag[1])

                wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                          i, "none", plausible_answer_array, premise)
            data['properties'][i] = properties
            data['wrong_answer'][i] = wrong_answer
            data['plausible_answer_based_on_method'][i] = plausible_answer_array
            continue

        # Perform NER classification
        # If the NER of the right_answer can be detected, then calculate the distance using semantic 
        # similarity or word vectors between the right_answer and various possible wrong_answers with 
        # the same NER as the right_answer. Once done, proceed to the final wrong_answer.
        if data['same_ner_tag_answer'][i] != [] and ner_tag_answer[0][0] != "NULL":
            wrong_answer, plausible_answer_array, properties = sorting_similarity(data, right_answer, \
                                                                      i, "ner", plausible_answer_array, premise)
            
        # If the NER of the right_answer cannot be detected (NULL) or context/premise does not contain 
        # any of NER of right_answer, then drop that particular row data.
        else:
            data.drop(i, inplace=True)
            data.reset_index(drop=True)
            
        data['properties'][i] = properties
        data['wrong_answer'][i] = wrong_answer
        data['plausible_answer_based_on_method'][i] = plausible_answer_array
            
    return data       

In [None]:
data_nli_train_df = create_wrong_answer(data_nli_train_df)
data_nli_val_df = create_wrong_answer(data_nli_val_df)
data_nli_test_df = create_wrong_answer(data_nli_test_df)

In [None]:
for i in range(len(data_nli_train_df)):
    print("Iterasi:", i)
    #print("Premise:", data_nli_train_df['premise'][i])
    print("Right answer:", data_nli_train_df['answer'][i])
    print("Wrong answer:", data_nli_train_df['wrong_answer'][i])
    #print("Same NER tag answer:", data_nli_train_df['same_ner_tag_answer'][i])
    #print("Same Chunking tag answer:", data_nli_train_df['same_chunking_tag_answer'][i])
    #print("Chunking tag premise", data_nli_train_df['chunking_tag_premise'][i])
    #print("Plausible answer:", data_nli_train_df['plausible_answer_based_on_method'][i])
    #print("Properties:", data_nli_train_df['properties'][i])
    #print("Overlap:", check_string_overlap(data_nli_train_df['answer'][i].lower(), data_nli_train_df['wrong_answer'][i].lower()))
    print()

In [None]:
print(len(data_nli_train_df))
print(len(data_nli_val_df))
print(len(data_nli_test_df))

In [None]:
def test_create_wrong_answer(data):
    assert all(data['properties'] != '')
    assert all(data['wrong_answer'] != '')

In [None]:
test_create_wrong_answer(data_nli_train_df)
test_create_wrong_answer(data_nli_val_df)
test_create_wrong_answer(data_nli_test_df)

# Split to two dataset: right dataset & wrong dataset

In [None]:
def move_to_column_number(data, column_name="hypothesis", column_num=3):

    cols = list(data.columns)
    cols.remove(column_name)
    cols.insert(column_num, column_name)

    data = data[cols]
    
    return data

In [None]:
columns_to_exclude = ['wrong_answer']

data_nli_right_train_df = data_nli_train_df.drop(columns=columns_to_exclude).copy()
data_nli_right_val_df = data_nli_val_df.drop(columns=columns_to_exclude).copy()
data_nli_right_test_df = data_nli_test_df.drop(columns=columns_to_exclude).copy()

In [None]:
columns_to_exclude = ['answer']

data_nli_wrong_train_df = data_nli_train_df.drop(columns=columns_to_exclude).copy()
data_nli_wrong_val_df = data_nli_val_df.drop(columns=columns_to_exclude).copy()
data_nli_wrong_test_df = data_nli_test_df.drop(columns=columns_to_exclude).copy()

data_nli_wrong_train_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)
data_nli_wrong_val_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)
data_nli_wrong_test_df.rename(columns={'wrong_answer': 'answer'}, inplace=True)

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "answer", 2)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "answer", 2)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "answer", 2)

# Convert question-answer pair to hypothesis

In [None]:
nlp_tools_paraphraser = pipeline(task = TASK_PARAPHRASER_NAME, 
                     model = MODEL_PARAPHRASER_NAME, 
                     tokenizer = AutoTokenizer.from_pretrained(MODEL_PARAPHRASER_NAME, 
                                                               model_max_length=512, 
                                                               truncation=True))

In [None]:
def convert_question_and_answer_to_hypothesis(data):
    for i in range(len(data)):
        #data['hypothesis'] = data['question'] + ' ' + data['answer']
        data['hypothesis'] = str(nlp_tools_paraphraser(data['question'] + ' ' + data['answer'])[0]['generated_text'])
    return data

In [None]:
data_nli_right_train_df = convert_question_and_answer_to_hypothesis(data_nli_right_train_df)
data_nli_right_val_df = convert_question_and_answer_to_hypothesis(data_nli_right_val_df)
data_nli_right_test_df = convert_question_and_answer_to_hypothesis(data_nli_right_test_df)

data_nli_right_train_df = move_to_column_number(data_nli_right_train_df, "hypothesis", 3)
data_nli_right_val_df = move_to_column_number(data_nli_right_val_df, "hypothesis", 3)
data_nli_right_test_df = move_to_column_number(data_nli_right_test_df, "hypothesis", 3)

In [None]:
data_nli_wrong_train_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_train_df)
data_nli_wrong_val_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_val_df)
data_nli_wrong_test_df = convert_question_and_answer_to_hypothesis(data_nli_wrong_test_df)

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "hypothesis", 3)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "hypothesis", 3)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "hypothesis", 3)

# Add label: entailment & contradiction

In [None]:
data_nli_right_train_df['label'] = 'entailment'
data_nli_right_val_df['label'] = 'entailment'
data_nli_right_test_df['label'] = 'entailment'

data_nli_right_train_df = move_to_column_number(data_nli_right_train_df, "label", 4)
data_nli_right_val_df = move_to_column_number(data_nli_right_val_df, "label", 4)
data_nli_right_test_df = move_to_column_number(data_nli_right_test_df, "label", 4)

In [None]:
data_nli_wrong_train_df['label'] = 'contradiction'
data_nli_wrong_val_df['label'] = 'contradiction'
data_nli_wrong_test_df['label'] = 'contradiction'

data_nli_wrong_train_df = move_to_column_number(data_nli_wrong_train_df, "label", 4)
data_nli_wrong_val_df = move_to_column_number(data_nli_wrong_val_df, "label", 4)
data_nli_wrong_test_df = move_to_column_number(data_nli_wrong_test_df, "label", 4)

# Concat the right and wrong NLI to one NLI dataset

In [None]:
data_nli_train_df_final = pd.concat([data_nli_right_train_df, data_nli_wrong_train_df], axis=0, ignore_index=True)
data_nli_val_df_final = pd.concat([data_nli_right_val_df, data_nli_wrong_val_df], axis=0, ignore_index=True)
data_nli_test_df_final = pd.concat([data_nli_right_test_df, data_nli_wrong_test_df], axis=0, ignore_index=True)

# Convert to DataFrame format to CSV

In [None]:
1/0

In [None]:
data_nli_train_df_final.to_csv("data_nli_train_df_paraphrase.csv", index=False)
data_nli_val_df_final.to_csv("data_nli_val_df_paraphrase.csv", index=False)
data_nli_test_df_final.to_csv("data_nli_test_df_paraphrase.csv", index=False)

# Push to Hugging Face

In [None]:
"""
HUB_TOKEN = "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"
USER = "muhammadravi251001"
REPO = "idk-mrc-nli"

api = HfApi()

api.upload_file(
    path_or_fileobj="data_nli_train_df.csv",
    path_in_repo="data_nli_train_df.csv",
    repo_id=f"{USER}/{REPO}",
    token=HUB_TOKEN,
    repo_type="dataset",
)

api.upload_file(
    path_or_fileobj="data_nli_val_df.csv",
    path_in_repo="data_nli_val_df.csv",
    repo_id=f"{USER}/{REPO}",
    token=HUB_TOKEN,
    repo_type="dataset",
)

api.upload_file(
    path_or_fileobj="data_nli_test_df.csv",
    path_in_repo="data_nli_test_df.csv",
    repo_id=f"{USER}/{REPO}",
    token=HUB_TOKEN,
    repo_type="dataset",
)
"""

In [None]:
print("PROGRAM FINISHED")