In [6]:
import os
# Restrict CUDA-visible devices to index 0 for this process.
# NOTE(review): this normally has to be set before the GPU framework
# initialises CUDA — confirm this cell is executed before `malaya` is imported.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import squad_utils as utils

In [4]:
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
# !wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json

In [1]:
import json
import time
import subprocess
import csv
from tqdm import tqdm
from collections import defaultdict
import pickle
import argparse
from tqdm import tqdm
import logging
import malaya

In [4]:
# Gather every distinct string of the SQuAD training file that needs a
# translation: article titles, context sentences, questions, answers and
# (for v2.0 only) plausible answers of impossible questions.
with open('train-v2.0.json') as hn:
    content = json.load(hn)

squad_version = content['version']
has_plausible_answers = squad_version == 'v2.0'

titles = []
context_sentences = []
questions = []
answers = []
plausible_answers = []

# Single pass over the dataset; each list is filled in document order.
for article in content['data']:
    titles.append(article['title'])
    for paragraph in article['paragraphs']:
        # Contexts are translated sentence by sentence; empty sentences are dropped.
        sentences = malaya.text.function.split_into_sentences(
            utils.remove_line_breaks(paragraph['context']), minimum_length = 2
        )
        context_sentences.extend(
            sentence for sentence in sentences if sentence
        )
        for qa in paragraph['qas']:
            if qa['question']:
                questions.append(qa['question'])
            answers.extend(
                answer['text'] for answer in qa['answers'] if answer['text']
            )
            # Plausible answers are only read for v2.0 impossible questions.
            if has_plausible_answers and qa['is_impossible']:
                plausible_answers.extend(
                    answer['text'] for answer in qa['plausible_answers']
                )

# From here on `content` is the de-duplicated set of strings to translate.
content = set(
    titles + context_sentences + questions + answers + plausible_answers
)

In [7]:
# Translate every collected string in batches, caching the result on disk so
# the expensive model run only happens when the cache file is missing.
file = 'translated-train-2.0.json'
batch_size = 10

if not os.path.exists(file):
    # Freeze the set into a list so sources and translations share an order.
    list_content = list(content)
    transformer = malaya.translation.en_ms.transformer()

    translated = []
    for offset in tqdm(range(0, len(list_content), batch_size)):
        batch = list_content[offset : offset + batch_size]
        translated += transformer.greedy_decoder(batch)

    # The cache stores the source strings next to their translations.
    with open(file, 'w') as fopen:
        json.dump([list_content, translated], fopen)

else:
    # Cache hit: reuse the stored [sources, translations] pair.
    with open(file) as fopen:
        list_content, translated = json.load(fopen)

In [8]:
# Build (or load from cache) the lookup
#   source string -> {'translation': ..., 'alignment': ...}
# that the dataset translation step indexes by the original text.
file = 'content_translations_alignments-train-2.0.json'

if not os.path.exists(file):
    # One alignment entry is expected per (source, translation) pair;
    # the semantics of the remaining arguments live in squad_utils.
    computed_alignments = utils.compute_alignment(
        list_content, 'en', translated, 'ms', 'forward', 'train-v2.0.json', 'out'
    )

    content_translations_alignments = {
        source: {'translation': target, 'alignment': alignment}
        for source, target, alignment in zip(
            list_content, translated, computed_alignments
        )
    }

    with open(file, 'w') as fopen:
        json.dump(content_translations_alignments, fopen)

else:
    # Cache hit: reuse the previously computed lookup.
    with open(file) as fopen:
        content_translations_alignments = json.load(fopen)

In [9]:
# Rewrite the SQuAD file in place (in memory): titles, contexts, questions and
# answers are replaced by their translations, and every answer span is
# re-located inside the translated context.
answers_from_alignment = True

# Reload the original file: `content` was reused for the set of strings above.
with open('train-v2.0.json') as hn:
    content = json.load(hn)

squad_version = content['version']

for data in tqdm(content['data']):
    data['title'] = content_translations_alignments[data['title']]['translation']
    for paragraphs in data['paragraphs']:
        # Keep the untranslated context: answer extraction needs it.
        context = paragraphs['context']

        # Must mirror the splitting used when the lookup was built, including
        # the empty-sentence filter; otherwise a dropped sentence would raise
        # KeyError on the lookups below.
        context_sentences = [
            s
            for s in malaya.text.function.split_into_sentences(
                utils.remove_line_breaks(context), minimum_length = 2
            )
            if s
        ]

        context_translated = ' '.join(
            content_translations_alignments[s]['translation']
            for s in context_sentences
        )
        # Merge the per-sentence alignments into one context-level alignment.
        context_alignment_tok = utils.compute_context_alignment(
            [
                content_translations_alignments[s]['alignment']
                for s in context_sentences
            ]
        )

        paragraphs['context'] = context_translated
        for qa in paragraphs['qas']:
            qa['question'] = content_translations_alignments[qa['question']][
                'translation'
            ]

            # SQuAD v2.0 impossible questions carry 'plausible_answers';
            # everything else (v2.0 answerable questions and all of v1.1)
            # carries 'answers'. Both are translated the same way.
            if squad_version == 'v2.0' and qa['is_impossible']:
                answers_to_translate = qa['plausible_answers']
            else:
                answers_to_translate = qa['answers']

            for answer in answers_to_translate:
                answer_translated = content_translations_alignments[
                    answer['text']
                ]['translation']
                # Returns the final answer text and its start offset in the
                # translated context; `answer` is passed before being mutated.
                answer_translated, answer_translated_start = utils.extract_answer_translated(
                    answer,
                    answer_translated,
                    context,
                    context_translated,
                    context_alignment_tok,
                    answers_from_alignment,
                )
                answer['text'] = answer_translated
                answer['answer_start'] = answer_translated_start


100%|██████████| 442/442 [12:06<00:00,  1.64s/it]


In [11]:
# Build `content_cleaned`: a copy of the translated dataset that keeps only
# answers (or plausible answers) with non-empty text, and drops questions and
# paragraphs that end up with nothing. Running totals feed the statistics cell.


def _answers_from_context(context, answers):
    """Re-slice each answer's text out of `context` at its recorded start.

    Returns new {'text', 'answer_start'} dicts; the slice length is the length
    of the answer's current text.
    """
    return [
        {
            'text': context[
                answer['answer_start'] : answer['answer_start']
                + len(answer['text'])
            ],
            'answer_start': answer['answer_start'],
        }
        for answer in answers
    ]


content_translated = content
content_cleaned = {'version': content['version'], 'data': []}
total_answers = 0
total_correct_plausible_answers = 0
total_correct_answers = 0
is_v2 = squad_version == 'v2.0'

for idx_data, data in tqdm(enumerate(content_translated['data'])):
    paragraphs_cleaned = []
    content_cleaned['data'].append(
        {'title': data['title'], 'paragraphs': paragraphs_cleaned}
    )
    for par in data['paragraphs']:
        qas_cleaned = []
        for qa in par['qas']:
            # v2.0 impossible questions are judged on 'plausible_answers';
            # v2.0 answerable questions and all v1.1 questions on 'answers'.
            use_plausible = is_v2 and qa['is_impossible']
            candidates = qa['plausible_answers'] if use_plausible else qa['answers']
            kept = [answer for answer in candidates if answer['text']]

            total_answers += len(candidates)
            if use_plausible:
                total_correct_plausible_answers += len(kept)
            else:
                total_correct_answers += len(kept)

            # A question with no usable answer is dropped entirely.
            if not kept:
                continue

            # Key order matches the SQuAD layout written by earlier runs.
            qa_cleaned = {'question': qa['question']}
            if use_plausible:
                qa_cleaned['answers'] = []
                qa_cleaned['plausible_answers'] = kept
            else:
                # Only the non-empty answers are re-extracted, so an empty
                # translation never reaches the cleaned output.
                qa_cleaned['answers'] = _answers_from_context(
                    par['context'], kept
                )
            qa_cleaned['id'] = qa['id']
            if is_v2:
                qa_cleaned['is_impossible'] = qa['is_impossible']
            qas_cleaned.append(qa_cleaned)

        # Add the paragraph only if there are non-empty question-answer examples inside
        if qas_cleaned:
            paragraphs_cleaned.append(
                {'context': par['context'], 'qas': qas_cleaned}
            )

442it [00:01, 424.34it/s]


In [12]:
# Persist the cleaned, translated dataset as JSON.
output_path = 'ms-train-2.0.json'
with open(output_path, 'w') as fopen:
    json.dump(content_cleaned, fopen)

In [13]:
# Report how many answers kept a non-empty text after translation.
# Plausible answers only exist (and are only counted) for SQuAD v2.0.
if squad_version == 'v2.0':
    total_correct = total_correct_answers + total_correct_plausible_answers
else:
    total_correct = total_correct_answers

# An empty dataset has no answers at all; report 0.0% instead of dividing by zero.
if total_answers:
    accuracy = round((total_correct / total_answers) * 100, 2)
else:
    accuracy = 0.0

summary = (
    'Percentage of translated examples (correct answers/total answers): {}/{} = {}%\n'
    'No. of answers: {}'.format(
        total_correct, total_answers, accuracy, total_correct_answers
    )
)
if squad_version == 'v2.0':
    summary += '\nNo. of plausible answers: {}'.format(
        total_correct_plausible_answers
    )
print(summary)


Percentage of translated examples (correct answers/total answers): 130318/130319 = 100.0%
No. of answers: 86821
No. of plausible answers: 43497
