# 01 Pre Augmentation Preparation

In [63]:
import re
import pandas as pd 
from datasets import load_dataset, concatenate_datasets
from bs4 import BeautifulSoup
from pythainlp.util import normalize
from copy import deepcopy
from tqdm import tqdm

In [48]:
iapp    = load_dataset("iapp_wiki_qa_squad")
thaiqa  = load_dataset("thaiqa_squad")
xquad   = load_dataset("xquad", "xquad.th")
tydiqa = load_dataset("khalidalt/tydiqa-goldp", "thai")

Found cached dataset iapp_wiki_qa_squad (/Users/parinzee/.cache/huggingface/datasets/iapp_wiki_qa_squad/iapp_wiki_qa_squad/1.0.0/c1455d806e5a66ca9ee5c03b4aeaeaef4410afca6263c0bfb440ff1db28e20c3)
100%|██████████| 3/3 [00:00<00:00, 484.97it/s]
Found cached dataset thaiqa_squad (/Users/parinzee/.cache/huggingface/datasets/thaiqa_squad/thaiqa_squad/1.0.0/fce14864b511d48464540780f328f4b415746b63f2fd934ad0b06c3eead7787b)
100%|██████████| 2/2 [00:00<00:00, 351.37it/s]
Found cached dataset xquad (/Users/parinzee/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 499.02it/s]
Found cached dataset tydiqa-goldp (/Users/parinzee/.cache/huggingface/datasets/khalidalt___tydiqa-goldp/thai/1.1.0/c0ba4613293c9a8e7cdc684809f0b2a520a66f38b23af4af5c09ea55e2b972f0)
100%|██████████| 2/2 [00:00<00:00, 222.07it/s]


In [49]:
# Merge all splits of each dataset
def merge_dataset_splits(dataset):
    splits = list(dataset.keys())
    if len(splits) == 1:
        return dataset[splits[0]]
    else:
        return concatenate_datasets([dataset[split] for split in splits])

In [50]:
iapp = merge_dataset_splits(iapp).to_pandas()
thaiqa = merge_dataset_splits(thaiqa).to_pandas()
xquad = merge_dataset_splits(xquad).to_pandas()
tydiqa = merge_dataset_splits(tydiqa).to_pandas()

## Reformat the Datasets

In [51]:
iapp = iapp[['question', 'context', 'answers']]
thaiqa = thaiqa[['question', 'context', 'answers']]
xquad = xquad[['question', 'context', 'answers']]
tydiqa = tydiqa.rename(columns={'passage_text': 'context', "question_text": "question"})[['question', 'context', 'answers']]

In [60]:
# Datasets require more processing to make them all in the correct format
def clean_text(text):
    # Remove html tags
    soup = BeautifulSoup(text, 'lxml')
    text = soup.get_text()

    # Remove semicolons
    text = re.sub(r';', '', text)

    # Remove empty parenthesis and parenthesis with only whitespace inside
    text = re.sub(r'\(\s*\)', '', text)
    text = re.sub(r'\(;\s*"(\w+)"\)', r'("\1")', text)

    # Remove reference citations for example [2]:7 or [9]:5 (present in tydiqa)
    text = re.sub(r'\[\d+\]:\d+', '', text)
    text = re.sub(r'\[\d+\]', '', text)

    # Remove more than one whitespace
    text = re.sub(r'\s+', ' ', text)

    # Strip text inside of parenthesis
    text = re.sub(r'\(\s*([^)]*)\)', r'(\1)', text)

    # Remove em dashes
    text = re.sub(u"\u2014", "", text)

    text = normalize(text)
    return text

def get_offset_begin_position(cleaned_context, answers_text, answer_begin_positions):
    new_answer_begin_positions = []

    try:
        for answer_text, answer_begin_position in zip(answers_text, answer_begin_positions):
            # Find all all instances of the answer in cleaned_context
            possible_answer_begin_positions = [index for index in range(len(cleaned_context)) if cleaned_context.find(answer_text, index)]

            # If len is 1, then we can just use the begin position of the answer in the cleaned_context
            if len(possible_answer_begin_positions) == 1:
                new_answer_begin_positions.append(possible_answer_begin_positions[0])
            
            # If len is 0, then we progressively remove the last character of the answer until we find a match
            elif len(possible_answer_begin_positions) == 0:
                while len(possible_answer_begin_positions) == 0:
                    possible_answer_begin_positions =  [index for index in range(len(cleaned_context)) if cleaned_context.find(answer_text, index)]
                    answer_text = answer_text[:-1]
                    if len(answer_text) == 0:
                        raise Exception("Answer not found in context")
                
                # If len is 1, then we can just use the begin position of the answer in the cleaned_context
                if len(possible_answer_begin_positions) == 1:
                    new_answer_begin_positions.append(possible_answer_begin_positions[0])
                elif len(possible_answer_begin_positions) > 1:
                    # Take the index closest to the original answer begin position
                    new_answer_begin_positions.append(min(possible_answer_begin_positions, key=lambda x: abs(x - answer_begin_position)))
    except:
        print("Error with answer: ", answer_text)
        print("Error with context: ", cleaned_context)
        print("Error with answer begin position: ", answer_begin_position)
        raise
    
    return new_answer_begin_positions

def process_row(row, answer_text_key, answer_start_key):
    # Normalize the text
    new_row = deepcopy(row)
    new_row["context"] = clean_text(row["context"])
    new_row["question"] = clean_text(row["question"])

    new_row["answers"] = {}
    new_row["answers"]["text"] = [clean_text(x) for x in row["answers"][answer_text_key]]

    # Reindex the dataset
    new_row["answers"]["answer_start"] = get_offset_begin_position(new_row["context"], new_row["answers"]["text"], row["answers"][answer_start_key])
    new_row["answers"]["answer_end"] = [x + len(y) for x, y in zip(new_row["answers"]["answer_start"], new_row["answers"]["text"])]

    return new_row

def sanity_check(datasets):
    # Match keys
    for dataset in tqdm(datasets):
        assert list(dataset.columns) == list(datasets[0].columns)
        assert dataset['answers'][0].keys() == datasets[0]['answers'][0].keys()
    
    print("All Keys Matched...")

    # Check theortical answers vs index
    for dataset in datasets:
        for _, row in tqdm(list(dataset.iterrows())):
            for text, begin, end in zip(row["answers"]['text'], row["answers"]['answer_start'], row["answers"]['answer_end']):
                assert text == row['context'][begin:end], f"Theoretical Answer: {text} | Indexed: {row['context'][begin:end]} | Context: {row['context']}"
    
    print("Theortical Answers Matched...")

In [53]:
iapp = iapp.apply(lambda x: process_row(x, "text", "answer_start"), axis=1)
thaiqa = thaiqa.apply(lambda x: process_row(x, "answer", "answer_begin_position"), axis=1)
xquad = xquad.apply(lambda x: process_row(x, "text", "answer_start"), axis=1)
tydiqa = tydiqa.apply(lambda x: process_row(x, "text", "start_byte"), axis=1)

  soup = BeautifulSoup(text, 'lxml')
  soup = BeautifulSoup(text, 'lxml')
  soup = BeautifulSoup(text, 'lxml')
  soup = BeautifulSoup(text, 'lxml')


In [61]:
sanity_check([iapp, thaiqa, xquad, tydiqa])

100%|██████████| 4/4 [00:00<00:00, 2888.14it/s]


All Keys Matched...


100%|██████████| 7242/7242 [00:00<00:00, 84331.84it/s]
100%|██████████| 4074/4074 [00:00<00:00, 86065.82it/s]
100%|██████████| 1190/1190 [00:00<00:00, 84205.92it/s]
100%|██████████| 4579/4579 [00:00<00:00, 85255.68it/s]

Theortical Answers Matched...





## Merge the Datasets

In [62]:
iapp["source"] = "iapp"
thaiqa["source"] = "thaiqa"
xquad["source"] = "xquad"
tydiqa["source"] = "tydiqa"

In [64]:
final = pd.concat([iapp, thaiqa, xquad, tydiqa], ignore_index=True)
final

Unnamed: 0,question,context,answers,source
0,พัทธ์ธีรา ศรุติพงศ์โภคิน เกิดวันที่เท่าไร,พัทธ์ธีรา ศรุติพงศ์โภคิน (เกิด 3 ธันวาคม พ.ศ. ...,"{'text': ['3 ธันวาคม พ.ศ. 2533'], 'answer_star...",iapp
1,พัทธ์ธีรา ศรุติพงศ์โภคิน มีฃื่อเล่นว่าอะไร,พัทธ์ธีรา ศรุติพงศ์โภคิน (เกิด 3 ธันวาคม พ.ศ. ...,"{'text': ['อร'], 'answer_start': [], 'answer_e...",iapp
2,พัทธ์ธีรา ศรุติพงศ์โภคิน ทำอาชีพอะไร,พัทธ์ธีรา ศรุติพงศ์โภคิน (เกิด 3 ธันวาคม พ.ศ. ...,"{'text': ['นักแสดงหญิงชาวไทย'], 'answer_start'...",iapp
3,พัทธ์ธีรา ศรุติพงศ์โภคิน จบการศึกษาจากประเทศอะไร,พัทธ์ธีรา ศรุติพงศ์โภคิน (เกิด 3 ธันวาคม พ.ศ. ...,"{'text': ['ประเทศนิวซีแลนด์'], 'answer_start':...",iapp
4,บิดาของคลีโอพัตราเป็นใคร,คลีโอพัตราที่ 7 ฟิโลพาเธอร์ (กรีก: Κλεοπάτρα θ...,"{'text': ['ทอเลมีที่ 12 ออเลติส'], 'answer_sta...",iapp
...,...,...,...,...
17080,หนังสือการ์ตูนชานะ นักรบเนตรอัคคี มีกี่เล่ม?,นิยาย ชานะ นักรบเนตรอัคคี แต่งโดย ยาชิจิโร ทาค...,"{'text': ['22', '26'], 'answer_start': [], 'an...",tydiqa
17081,ไบโอช็อก อินฟินิต เปิดตัวครั้งแรกเมื่อไหร่?,ไบโอช็อก อินฟินิต (English: BioShock Infinite)...,"{'text': ['26 มีนาคม พ.ศ. 2556', '26 มีนาคม พ....",tydiqa
17082,ยู ซึง-โฮ เริ่มเข้าวงการบันเทิงเมื่อไหร่?,ยู ซึง-โฮเดบิวต์เมื่อปี 1999 กับผลงานโฆษณาของ ...,"{'text': ['ปี 1999', '1999'], 'answer_start': ...",tydiqa
17083,นภัทร อินทร์ใจเอื้อมีมารดาชื่อว่าอะไร?,กันเกิดเมื่อวันที่ 23 ตุลาคม พ.ศ. 2533 จังหวัด...,"{'text': ['นางวรรณา อินทร์ใจเอื้อ', 'นางวรรณา ...",tydiqa


In [65]:
final.to_csv("data/01_prepared_merged.csv", index=False)