# Convert the original SQuAD JSON to a CSV table

In [None]:
import pandas as pd
import json

In [None]:
# Load the raw SQuAD dev-set JSON into a dict.
# The encoding is passed explicitly: JSON files are UTF-8, while open() would
# otherwise use the platform's locale encoding and could fail or garble
# non-ASCII text on systems whose default is not UTF-8.
with open('../dataset/stanford-squad/raw/dev-v2.0.json', mode='r', encoding='utf-8') as f:
    data: dict = json.load(f)

In [None]:
# Top-level 'data' entry of the loaded JSON: a list whose items each carry a
# 'title' and a list of 'paragraphs'.
themes: list = data['data']

In [None]:
# Flatten the nested structure (theme -> paragraph -> question) into a list of
# flat dicts, one per question that has at least one answer.
qa_list = []
for theme in themes:
    title: str = theme['title']
    paragraphs: list = theme['paragraphs']
    for paragraph in paragraphs:
        context: str = paragraph['context']
        qas: list = paragraph['qas']
        for qa in qas:
            question: str = qa['question']
            # Named question_id rather than id so the builtin id() is not shadowed.
            question_id: str = qa['id']
            answers: list = qa['answers']
            if not answers:
                # A question with an empty answer list contributes no row.
                continue
            # Only the first answer is kept. The original author's note says all
            # answers of a question are identical in this dataset
            # (NOTE(review): not verified here).
            answer: dict = answers[0]
            qa_list.append({
                'question_id': question_id,
                'title': title,
                'answer_context': context,
                'question': question,
                'answer_text': answer['text'],
                'answer_start': answer['answer_start'],
            })


In [None]:
# Build a DataFrame with one row per collected question/answer record, then
# print its column and dtype summary.
df = pd.DataFrame(qa_list)
df.info()

In [None]:
# Preview the first rows of the table.
df.head()

In [None]:
def get_question_type(question: str) -> int:
    """Return a numeric code for the interrogative word of *question*.

    The question is lower-cased and split into words on every non-letter
    character, so punctuation and contractions such as "what's" do not hide a
    question word. The code of the first word found in the table below is
    returned. Only whole words count, so "whole" or "nowhere" never match.

    Args:
        question: The question text.

    Returns:
        1 for what, 2 for when, 3 for where, 4 for who (including whom and
        whose), 5 for which, or 0 when no question word is present.
    """
    type_dict = {
        'what': 1,
        'when': 2,
        'where': 3,
        'who': 4,
        'whom': 4,   # inflected forms count as "who" questions
        'whose': 4,
        'which': 5,
    }
    # Turn every non-letter into a space so that split() yields clean words.
    letters_only = ''.join(ch if ch.isalpha() else ' ' for ch in question.lower())
    # Scan in reading order: the earliest question word decides the type, not
    # the order in which the keys happen to be listed in type_dict.
    for word in letters_only.split():
        if word in type_dict:
            return type_dict[word]
    return 0  # unknown type

# Label every question with its numeric type code (0 means unknown type).
df['question_type'] = df['question'].apply(get_question_type)

In [None]:
# Display the table, now including the question_type column.
df

In [None]:
# Count how many questions fall under each type code.
df['question_type'].value_counts()

In [None]:
# Drop the rows whose question type is unknown (code 0) and renumber the index
# from zero.
has_known_type = df['question_type'].ne(0)
df = df.loc[has_known_type].reset_index(drop=True)

# Reorder the columns: the type label and question/answer fields first, then
# the context, with the identifiers last.
column_order = [
    'question_type',
    'question',
    'answer_text',
    'answer_start',
    'answer_context',
    'question_id',
    'title',
]
df = df[column_order]
df.info()

In [None]:
# Display the filtered, reordered table.
df

In [None]:
# Write the table to disk without the index column. Note that despite the
# .csv extension the fields are tab-separated (sep='\t').
df.to_csv('../dataset/stanford-squad/table/dev-v2.0.csv', sep='\t', index=False)

In [None]:
# Point the GOOGLE_APPLICATION_CREDENTIALS environment variable at the local
# key file. A shell `export` is not valid Python and raises a SyntaxError in a
# notebook cell, so the variable is set through os.environ instead. This also
# makes it visible to code that runs later in this kernel.
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'key.json'