# Import libraries

In [1]:
import transformers
import evaluate
import torch
import operator
import re
import sys
import collections
import string
import contextlib

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from nusacrowd import NusantaraConfigHelper
from datetime import datetime
from tqdm import tqdm
from IPython.display import display
from deep_translator import GoogleTranslator
from huggingface_hub import HfApi

from datasets import (
    load_dataset, 
    Dataset,
    DatasetDict
)
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BertForQuestionAnswering,
    AutoTokenizer,
    EarlyStoppingCallback, 
    pipeline
)

# Import QAS datasets

In [2]:
# Number of trailing rows of the flattened train split that are held out as
# the validation split (the remainder stays in train).
SQUAD_ID_VALIDATION_SIZE = 11874


def _flatten_squad_id(raw_df):
    """Flatten one raw squad_id split into context / question / answer rows.

    Each entry of the raw ``answer`` column is a sequence whose first element
    is a string holding a dict with ``text``, ``answer_start`` and
    ``answer_end`` keys; only that first element is used.

    Rows are collected in a list and turned into a DataFrame once, because
    ``DataFrame.append`` copies the whole frame on every call and no longer
    exists in pandas >= 2.0.
    """
    records = []
    rows = zip(raw_df["context"], raw_df["question"], raw_df["answer"])
    for context, question, raw_answer in tqdm(rows, total=len(raw_df)):
        # NOTE(review): eval() executes whatever expression the dataset string
        # contains. It is kept to preserve parsing behaviour; consider
        # ast.literal_eval once the payload is confirmed to be a plain literal.
        parsed = eval(raw_answer[0])
        records.append({
            'context': context,
            'question': question,
            'answer': {"text": parsed['text'],
                       "answer_start": parsed['answer_start'],
                       "answer_end": parsed['answer_end']},
        })
    return pd.DataFrame(records, columns=['context', 'question', 'answer'])


conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'squad_id' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
# The loader's 'validation' split is used as this notebook's test split.
df_test = pd.DataFrame(data_qas_id['validation'])

cols = ['context', 'question', 'answer']
new_df_test = _flatten_squad_id(df_test)
new_df_train = _flatten_squad_id(df_train)

# Validation is carved out of the tail of the flattened training rows.
train_final_df = new_df_train[:-SQUAD_ID_VALIDATION_SIZE]
validation_final_df = new_df_train[-SQUAD_ID_VALIDATION_SIZE:]

train_dataset = Dataset.from_dict(train_final_df)
validation_dataset = Dataset.from_dict(validation_final_df)
# Fixed: this used the raw `df_test`, which left `new_df_test` unused and gave
# the test split a different schema from train/validation.
test_dataset = Dataset.from_dict(new_df_test)

data_qas_id_squad_id = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})



  0%|          | 0/2 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████| 11873/11873 [00:24<00:00, 487.81it/s]
100%|██████████████████████████████████████████████████████████████████████████| 130319/130319 [10:38<00:00, 204.22it/s]


In [3]:
def _flatten_idk_mrc(raw_df):
    """Flatten one raw idk_mrc split into context / question / answer rows.

    Every entry of the ``qas`` column is iterated, producing one output row
    per question. When a question has answers, the first one is used and
    ``answer_end`` is derived as ``answer_start + len(text)``. When it has
    none, the answer is an empty string with start and end both 0.

    Rows are collected in a list and turned into a DataFrame once, because
    ``DataFrame.append`` copies the whole frame on every call and no longer
    exists in pandas >= 2.0.
    """
    records = []
    rows = zip(raw_df["context"], raw_df["qas"])
    for context, qas in tqdm(rows, total=len(raw_df)):
        for qa in qas:
            answers = qa['answers']
            if len(answers) != 0:
                first = answers[0]
                answer = {"text": first['text'],
                          "answer_start": first['answer_start'],
                          "answer_end": first['answer_start'] + len(first['text'])}
            else:
                answer = {"text": str(),
                          "answer_start": 0,
                          "answer_end": 0}
            records.append({'context': context,
                            'question': qa['question'],
                            'answer': answer})
    return pd.DataFrame(records, columns=['context', 'question', 'answer'])


conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'idk_mrc' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
df_validation = pd.DataFrame(data_qas_id['validation'])
df_test = pd.DataFrame(data_qas_id['test'])

cols = ['context', 'question', 'answer']
new_df_train = _flatten_idk_mrc(df_train)
new_df_val = _flatten_idk_mrc(df_validation)
new_df_test = _flatten_idk_mrc(df_test)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas_id_idk_mrc = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})



  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████████████████████████████████████████████████████████████████████████| 3659/3659 [00:18<00:00, 199.79it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 358/358 [00:01<00:00, 240.75it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 378/378 [00:01<00:00, 251.67it/s]


In [4]:
def _flatten_tydiqa_id(raw_df):
    """Flatten one raw tydiqa_id split into context / question / answer rows.

    The answer span is located by searching the ``label`` string inside the
    ``context`` string: ``answer_start`` is the first match and ``answer_end``
    is ``answer_start + len(label)``.

    Rows are collected in a list and turned into a DataFrame once, because
    ``DataFrame.append`` copies the whole frame on every call and no longer
    exists in pandas >= 2.0.

    Raises:
        ValueError: if a label does not occur in its context (``str.index``).
    """
    records = []
    for context, question, label in zip(raw_df['context'], raw_df['question'], raw_df['label']):
        answer_start = context.index(label)
        answer_end = answer_start + len(label)
        records.append({'context': context,
                        'question': question,
                        'answer': {"text": label,
                                   "answer_start": answer_start,
                                   "answer_end": answer_end}})
    return pd.DataFrame(records, columns=['context', 'question', 'answer'])


conhelps = NusantaraConfigHelper()
data_qas_id = conhelps.filtered(lambda x: 'tydiqa_id' in x.dataset_name)[0].load_dataset()

df_train = pd.DataFrame(data_qas_id['train'])
df_validation = pd.DataFrame(data_qas_id['validation'])
df_test = pd.DataFrame(data_qas_id['test'])

cols = ['context', 'question', 'answer']
new_df_train = _flatten_tydiqa_id(df_train)
new_df_val = _flatten_tydiqa_id(df_validation)
new_df_test = _flatten_tydiqa_id(df_test)

train_dataset = Dataset.from_dict(new_df_train)
validation_dataset = Dataset.from_dict(new_df_val)
test_dataset = Dataset.from_dict(new_df_test)

data_qas_id_tydiqa_id = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})



  0%|          | 0/3 [00:00<?, ?it/s]

# Mulai topik smoothing

In [84]:
# Indonesian interrogative words, grouped by meaning.
# The order matters: `smoothing` walks this list from front to back.
question_word = [
    # who
    'siapa', 'siapakah',
    # what / is there
    'apa', 'apakah', 'adakah',
    # where / from where
    'dimana', 'dimanakah', 'darimanakah',
    # when
    'kapan', 'kapankah',
    # how
    'bagaimana', 'bagaimanakah',
    # why
    'kenapa', 'mengapa',
    # how many / how much
    'berapa', 'berapakah', 'seberapa',
]

In [6]:
# Hugging Face model ids: Indonesian paraphraser, English paraphraser, and
# Indonesian named-entity recogniser.
MODEL_TG_IND_NAME = "Wikidepia/IndoT5-base-paraphrase"
MODEL_TG_ENG_NAME = "humarin/chatgpt_paraphraser_on_T5_base"
MODEL_NER_NAME = "cahya/xlm-roberta-base-indonesian-NER"

# Forwarded to both text2text-generation pipelines.
tokenizer_kwargs = {'truncation': True, 'max_length': 512}

# torch.cuda.current_device() fails on a machine without CUDA, so fall back to
# -1, which `pipeline` interprets as "run on CPU".
PIPELINE_DEVICE = torch.cuda.current_device() if torch.cuda.is_available() else -1

# No device is passed here, so the NER pipeline uses the library default.
nlp_ner = pipeline(task="ner", model=MODEL_NER_NAME, tokenizer=MODEL_NER_NAME)

nlp_tg_ind = pipeline(task="text2text-generation", model=MODEL_TG_IND_NAME, tokenizer=MODEL_TG_IND_NAME,
              device=PIPELINE_DEVICE, **tokenizer_kwargs)

nlp_tg_eng = pipeline(task="text2text-generation", model=MODEL_TG_ENG_NAME, tokenizer=MODEL_TG_ENG_NAME,
              device=PIPELINE_DEVICE, **tokenizer_kwargs)

In [238]:
# Every value accepted by the `type` argument of `smoothing`.
_SMOOTHING_TYPES = (
    'replace first',
    'replace question word',
    'add adalah',
    'just concat answer and question',
    'rule based',
    'machine generation with rule based',
    'pure machine generation',
    'machine generation with translation',
)

# 'rule based' smoothing: connector inserted between the remaining question
# text and the answer, keyed by the question word that was removed.
_RULE_BASED_CONNECTORS = {
    'apa': 'adalah', 'apakah': 'adalah', 'adakah': 'adalah',
    'dimana': 'di', 'dimanakah': 'di',
    'darimanakah': 'dari',
    'kapan': 'pada', 'kapankah': 'pada',
    'bagaimana': 'adalah', 'bagaimanakah': 'adalah',
}


def _swap_token(text, token, replacement):
    """Swap the first whitespace-delimited `token` in `text` for `replacement`.

    Only a whole token is touched, so a question word is never rewritten
    inside a longer word. An empty replacement removes the token. Runs of
    whitespace are collapsed to single spaces in the result.
    """
    words = text.split()
    if token in words:
        words[words.index(token)] = replacement
    return ' '.join(word for word in words if word)


def _rule_based_hypothesis(stem, answer, question_word):
    """Build the 'rule based' hypothesis for one answer.

    `stem` is the lower-cased question without '?'. The entries of
    `question_word` are tried in order; the first one that is a token of
    `stem` and has a rule decides the sentence template. If none applies the
    result is "<stem> adalah <answer>".
    """
    tokens = stem.split()
    for word in question_word:
        if word not in tokens:
            continue
        if word in ('siapa', 'siapakah'):
            return f"{answer} merupakan {_swap_token(stem, word, '')}"
        if word in _RULE_BASED_CONNECTORS:
            return f"{_swap_token(stem, word, '')} {_RULE_BASED_CONNECTORS[word]} {answer}"
        if word in ('kenapa', 'mengapa'):
            return f"{_swap_token(stem, word, 'alasan')} adalah karena {answer}"
        if word in ('berapa', 'berapakah', 'seberapa'):
            rest = _swap_token(stem, word, '')
            rest_tokens = rest.split()
            if 'luas' in rest_tokens:
                return f"{_swap_token(rest, 'luas', '')} memiliki luas {answer}"
            if 'jumlah' in rest_tokens:
                return f"{_swap_token(rest, 'jumlah', '')} berjumlah {answer}"
            return f"{rest} adalah {answer}"
        # A token from a caller-supplied list with no rule: keep looking.
    return f"{stem.lstrip()} adalah {answer}"


def _build_hypothesis(question, answer, type, question_word):
    """Build the hypothesis sentence for one (already lower-cased) answer."""
    stem = question.replace('?', '')
    words = stem.split()

    if type == 'replace first':
        # The answer takes the place of the leading word only.
        return ' '.join([answer] + words[1:])

    if type == 'replace question word':
        for word in question_word:
            if word in words:
                return _swap_token(stem, word, answer)
        return f"{stem.lstrip()} adalah {answer}"

    if type == 'add adalah':
        return f"{' '.join(words[1:])} adalah {answer}"

    if type == 'just concat answer and question':
        return f"{question} {answer}"

    if type == 'rule based':
        return _rule_based_hypothesis(stem, answer, question_word)

    if type == 'machine generation with rule based':
        draft = _rule_based_hypothesis(stem, answer, question_word).strip()
        return nlp_tg_ind(draft)[0]['generated_text']

    if type == 'pure machine generation':
        return nlp_tg_ind(f"{question} {answer}")[0]['generated_text']

    # Remaining type: 'machine generation with translation'.
    # Rule-based draft -> English -> English paraphraser -> back to Indonesian.
    draft = _rule_based_hypothesis(stem, answer, question_word).strip()
    english = GoogleTranslator(source='id', target='en').translate(draft)
    paraphrased = nlp_tg_eng(english)[0]['generated_text']
    return GoogleTranslator(source='en', target='id').translate(paraphrased)


def smoothing(question, pred_answer, gold_answer, type, question_word=question_word):
    """Turn a question plus an answer into a declarative hypothesis sentence.

    The same transformation is applied once with `pred_answer` and once with
    `gold_answer`. All three text inputs are lower-cased first.

    Args:
        question: question text.
        pred_answer: predicted answer text.
        gold_answer: reference answer text.
        type: smoothing strategy, one of
            'replace first' - the first word of the question becomes the answer;
            'replace question word' - the first matching entry of
                `question_word` becomes the answer, otherwise
                "<question> adalah <answer>";
            'add adalah' - drop the first word, append "adalah <answer>";
            'just concat answer and question' - "<question> <answer>"
                (the '?' is kept);
            'rule based' - template chosen by the question word, e.g.
                "Siapa presiden Indonesia?" with answer "Jokowi" gives
                "jokowi merupakan presiden indonesia";
            'machine generation with rule based' - 'rule based' output passed
                through `nlp_tg_ind`;
            'pure machine generation' - "<question> <answer>" passed through
                `nlp_tg_ind`;
            'machine generation with translation' - 'rule based' output
                translated to English, passed through `nlp_tg_eng`, and
                translated back.
            (The name shadows the builtin `type`; kept for caller compatibility.)
        question_word: question words to look for, tried in list order.

    Returns:
        Tuple ``(pred_hypothesis, gold_hypothesis)``, both stripped.

    Raises:
        ValueError: if `type` is not one of the supported strategies.

    Fixes over the previous implementation:
        * the per-word ``else: ... break`` belonged to the ``if``, so only the
          first entry of `question_word` was ever tested;
        * ``question.replace('kah', '')`` removed "kah" inside any word and made
          the 'adakah' / 'darimanakah' rules unreachable; question words are now
          matched as whole tokens, including their "-kah" forms;
        * question words were replaced as substrings (and everywhere); only the
          first whole-token occurrence is replaced now;
        * an unknown `type` or an empty question no longer ends in
          UnboundLocalError / IndexError;
        * a custom `question_word` is honoured by the machine-generation types.
    """
    if type not in _SMOOTHING_TYPES:
        raise ValueError(
            f"Unknown smoothing type {type!r}; expected one of {list(_SMOOTHING_TYPES)}"
        )

    question = question.lower()
    pred_answer = pred_answer.lower()
    gold_answer = gold_answer.lower()

    pred_hypothesis = _build_hypothesis(question, pred_answer, type, question_word)
    gold_hypothesis = _build_hypothesis(question, gold_answer, type, question_word)

    return pred_hypothesis.strip(), gold_hypothesis.strip()

# Pembuatan DataFrame dengan keseluruhan smoothing

In [239]:
# Names of the smoothing strategies, one per line.
type_smoothing = [
    'replace first',
    'replace question word',
    'add adalah',
    'just concat answer and question',
    'rule based',
    'machine generation with rule based',
    'pure machine generation',
    'machine generation with translation',
]

In [240]:
def create_df_with_smoothing(data):
    """Apply every smoothing strategy to the gold answers of `data`.

    Args:
        data: table-like object supporting ``len(data)``, ``data['question'][i]``
            and ``data['answer'][i]['text']`` for ``i`` in ``range(len(data))``.

    Returns:
        A DataFrame with one row per input row and the columns 'Question',
        'Answer', followed by one column per smoothing strategy holding the
        gold hypothesis produced by `smoothing`.

    Notes:
        `smoothing` is called with the placeholder prediction "-" and only the
        gold hypothesis is kept. The DataFrame is built once after the loop;
        it used to be rebuilt on every iteration and was undefined for empty
        input.
    """
    # Column order of the result follows this tuple.
    smoothing_types = (
        'replace first',
        'replace question word',
        'add adalah',
        'just concat answer and question',
        'rule based',
        'machine generation with rule based',
        'pure machine generation',
        'machine generation with translation',
    )

    # Fetch the two columns once instead of on every iteration.
    questions = data['question']
    answers = data['answer']

    columns = {'Question': [], 'Answer': []}
    columns.update({name: [] for name in smoothing_types})

    for i in tqdm(range(len(data))):
        question = str(questions[i])
        answer = str(answers[i]['text'])

        # Compute the full row first so a failure never leaves ragged columns.
        row = {}
        for name in smoothing_types:
            _, gold_hypothesis = smoothing(question=question,
                                           pred_answer="-",
                                           gold_answer=answer,
                                           type=name)
            row[name] = gold_hypothesis

        columns['Question'].append(question)
        columns['Answer'].append(answer)
        for name in smoothing_types:
            columns[name].append(row[name])

    smoothing_df = pd.DataFrame(columns)

    assert len(smoothing_df) == len(data)
    return smoothing_df

In [242]:
# Smooth the idk_mrc validation split and export the table to Excel.
smoothing_df = create_df_with_smoothing(data=data_qas_id_idk_mrc['validation'])
smoothing_df.to_excel(excel_writer="smoothing_idk_mrc.xlsx")

100%|█████████████████████████████████████████████████████████████████████████████████| 764/764 [52:22<00:00,  4.11s/it]


In [243]:
# Smooth the tydiqa_id validation split and export the table to Excel.
smoothing_df = create_df_with_smoothing(data=data_qas_id_tydiqa_id['validation'])
smoothing_df.to_excel(excel_writer="smoothing_tydiqa_id.xlsx")

100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [41:15<00:00,  4.38s/it]


In [None]:
# Smooth the squad_id validation split and export the table to Excel.
smoothing_df = create_df_with_smoothing(data=data_qas_id_squad_id['validation'])
smoothing_df.to_excel(excel_writer="smoothing_squad_id.xlsx")

  0%|▍                                                                            | 58/11874 [04:00<15:26:50,  4.71s/it]