# Backtranslate the samples

In [115]:
import ast
import json
import os
from pprint import pprint
from typing import Dict

from google.cloud import translate

In [116]:
# Input/output locations, expressed as paths relative to the working directory.
ROOT_FP = "../data/"
TRANSLATED_DATA_FP = f"{ROOT_FP}translations/"

# Name and full path of the file holding the samples to backtranslate.
samples_to_translate_filename = "samples_to_translate.jsonl"
samples_to_translate_fp = os.path.join(ROOT_FP, samples_to_translate_filename)

In [117]:
# Export the credentials file location through the process environment
# before the translation client is created.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="../credentials.json")

In [118]:
# Project and location that together form the "parent" resource name sent
# with every translation request.
PROJECT_ID = "nlp2022-final-project"
LOCATION = "global"
PARENT = "/".join(("projects", PROJECT_ID, "locations", LOCATION))

# Single shared client reused by all translation helpers in this notebook.
GOOGLE_CLIENT = translate.TranslationServiceClient()

In [119]:
# Language code passed as the source language of the original questions.
ROOT_SOURCE_LANG = "en-US"
# Pivot language codes: text is translated into the pivot and then back into
# the source language.
SPANISH_PIVOT_LANG = "es"
JAPANESE_PIVOT_LANG = "ja"

## Load samples

In [120]:
# Parse every non-blank line of the samples file into one Python object.
#
# NOTE(review): despite the ".jsonl" extension the lines are parsed as Python
# literals (ast.literal_eval), not as JSON. Presumably the file was written
# with str(dict); confirm that before switching to json.loads.
#
# The file is streamed line by line instead of being read whole with
# readlines(), and whitespace-only lines are skipped because
# ast.literal_eval raises on them.
with open(samples_to_translate_fp, "r", encoding="utf-8") as f:
    samples_to_translate_lst = [
        ast.literal_eval(line) for line in f if line.strip()
    ]

In [121]:
# Display the first loaded sample to inspect its fields.
samples_to_translate_lst[0]

{'id': '56de57394396321400ee2830',
 'title': 'Institute_of_technology',
 'context': 'In Turkey and the Ottoman Empire, the oldest technical university is Istanbul Technical University. Its graduates contributed to a wide variety of activities in scientific research and development. In 1950s, 2 technical universities were opened in Ankara and Trabzon. In recent years, Yildiz University is reorganized as Yildiz Technical University and 2 institutes of technology were founded in Kocaeli and Izmir. In 2010, another technical university named Bursa Technical University was founded in Bursa. Moreover, a sixth technical university is about to be opened in Konya named Konya Technical University.',
 'question': 'When Konya Technical University opens, how many total institutes of technology will there be in Turkey and the Ottoman Empire?',
 'answers': {'text': ['six'], 'answer_start': [520]}}

## Create helper functions

In [122]:
def translate_text(
    text: str, source_lang: str, target_lang: str
) -> str:
    """Translate ``text`` from ``source_lang`` into ``target_lang``.

    Sends ``text`` as a single plain-text item to the module-level
    ``GOOGLE_CLIENT`` under ``PARENT`` and returns the ``translated_text``
    of the first translation in the response.
    """
    request = {
        "parent": PARENT,
        "contents": [text],
        "mime_type": "text/plain",
        "source_language_code": source_lang,
        "target_language_code": target_lang,
    }
    translations = GOOGLE_CLIENT.translate_text(request=request).translations
    # Exactly one string was submitted, so only the first result is relevant.
    return translations[0].translated_text

In [123]:
def backtranslate_text(
    text: str, source_lang: str, pivot_lang: str
) -> Dict[str, str]:
    """Translate ``text`` into ``pivot_lang`` and back into ``source_lang``.

    Args:
        text: The text to backtranslate.
        source_lang: Language code of ``text``; also the target of the
            second (return) translation.
        pivot_lang: Language code of the intermediate translation.

    Returns:
        A dict with two keys: ``"translation"`` (the text in the pivot
        language) and ``"backtranslation"`` (that pivot text translated
        back into ``source_lang``).
    """
    source_to_pivot_translation = translate_text(
        text=text, source_lang=source_lang, target_lang=pivot_lang
    )
    # The second hop starts from the pivot text, not from the original.
    pivot_to_source_translation = translate_text(
        text=source_to_pivot_translation,
        source_lang=pivot_lang,
        target_lang=source_lang,
    )
    return {
        "translation": source_to_pivot_translation,
        "backtranslation": pivot_to_source_translation,
    }

In [124]:
def translate_sample(
    object_dict: Dict, source_lang: str, pivot_lang: str
) -> Dict:
    """Backtranslate a sample's question and return the enriched sample.

    Args:
        object_dict: The sample; its ``"question"`` value is the text that
            gets backtranslated.
        source_lang: Language code of the question.
        pivot_lang: Language code used for the intermediate translation.

    Returns:
        A new dict containing every entry of ``object_dict`` plus the
        entries produced by ``backtranslate_text`` and the ``"source_lang"``
        and ``"pivot_lang"`` used. ``object_dict`` itself is not modified.

    Raises:
        KeyError: If ``object_dict`` has no ``"question"`` key.
    """
    backtranslation_dict = backtranslate_text(
        text=object_dict["question"],
        source_lang=source_lang,
        pivot_lang=pivot_lang,
    )
    # Later entries win on key collisions, so the backtranslation and
    # language fields override same-named keys from the input sample.
    return {
        **object_dict,
        **backtranslation_dict,
        "source_lang": source_lang,
        "pivot_lang": pivot_lang,
    }


In [125]:
def write_json(object_dict: Dict, write_path: str):
    """Serialize ``object_dict`` to JSON and write it to ``write_path``.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(write_path, mode="w") as out_file:
        payload = json.dumps(object_dict)
        out_file.write(payload)


In [126]:
def process_sample(
    object_dict: Dict,
    source_lang: str,
    pivot_lang: str,
    write_dir: str
) -> Dict:
    """Backtranslate one sample, save it as JSON, and return it.

    The output file is named after the sample's ``"id"`` (``<id>.json``)
    and is written inside ``write_dir``. The returned dict is the same
    object that was written to disk.
    """
    enriched_sample = translate_sample(
        object_dict=object_dict,
        source_lang=source_lang,
        pivot_lang=pivot_lang,
    )
    output_path = os.path.join(write_dir, f"{object_dict['id']}.json")
    write_json(enriched_sample, output_path)
    return enriched_sample

## Backtranslate from English through Spanish and back

In [134]:
# Number of samples, counted from the start of the loaded list, to
# backtranslate through Spanish.
SPANISH_TEST_COUNT = 1000

In [135]:
# Directory that receives one "<id>.json" file per Spanish-pivot sample.
spanish_write_dir = "../data/translations/es"

In [136]:
# Create the output directory on first use so that listing it here (and
# writing into it later) does not fail with FileNotFoundError.
os.makedirs(spanish_write_dir, exist_ok=True)
# File names already present in the Spanish output directory.
current_spanish_translations = os.listdir(spanish_write_dir)

In [140]:
# Backtranslate the first SPANISH_TEST_COUNT questions through Spanish,
# writing one JSON file per sample into spanish_write_dir.
#
# The output directory is listed right here, as a set, rather than reusing a
# listing captured in an earlier cell: re-running this cell after an
# interruption then skips every sample that is already on disk.
already_translated = set(os.listdir(spanish_write_dir))

# Slicing (instead of indexing with range) ends the loop cleanly when fewer
# than SPANISH_TEST_COUNT samples were loaded.
for i, sample in enumerate(samples_to_translate_lst[:SPANISH_TEST_COUNT]):
    filename = f"{sample['id']}.json"
    if i % 50 == 0:
        print(f"Sample {i} out of {SPANISH_TEST_COUNT}")
    if filename in already_translated:
        continue
    try:
        backtranslated_sample = process_sample(
            object_dict=sample,
            source_lang=ROOT_SOURCE_LANG,
            pivot_lang=SPANISH_PIVOT_LANG,
            write_dir=spanish_write_dir,
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on with the
        # next sample; a failed sample is retried on the next run because no
        # file was recorded for it in the listing above.
        print(f"Error {e} on sample {i} in the Spanish backtranslation")
        

## Backtranslate from English through Japanese and back

In [152]:
# Number of samples, counted from the start of the loaded list, to
# backtranslate through Japanese.
JAPANESE_TEST_COUNT = 1000

In [153]:
# Directory that receives one "<id>.json" file per Japanese-pivot sample.
japanese_write_dir = "../data/translations/ja"

In [143]:
# Create the output directory on first use so that listing it here (and
# writing into it later) does not fail with FileNotFoundError.
os.makedirs(japanese_write_dir, exist_ok=True)
# File names already present in the Japanese output directory.
current_japanese_translations = os.listdir(japanese_write_dir)

In [154]:
# Backtranslate the first JAPANESE_TEST_COUNT questions through Japanese,
# writing one JSON file per sample into japanese_write_dir.
#
# The output directory is listed right here, as a set, rather than reusing a
# listing captured in an earlier cell: re-running this cell after an
# interruption then skips every sample that is already on disk.
already_translated = set(os.listdir(japanese_write_dir))

# Slicing (instead of indexing with range) ends the loop cleanly when fewer
# than JAPANESE_TEST_COUNT samples were loaded.
for i, sample in enumerate(samples_to_translate_lst[:JAPANESE_TEST_COUNT]):
    filename = f"{sample['id']}.json"
    if i % 50 == 0:
        print(f"Sample {i} out of {JAPANESE_TEST_COUNT}")
    if filename in already_translated:
        continue
    try:
        processed_dict = process_sample(
            object_dict=sample,
            source_lang=ROOT_SOURCE_LANG,
            pivot_lang=JAPANESE_PIVOT_LANG,
            write_dir=japanese_write_dir,
        )
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on with the
        # next sample; a failed sample is retried on the next run because no
        # file was recorded for it in the listing above.
        print(f"Error {e} on sample {i} in the Japanese backtranslation")

Sample 0 out of 1000
Sample 50 out of 1000
Sample 100 out of 1000
Sample 150 out of 1000
Sample 200 out of 1000
Sample 250 out of 1000
Sample 300 out of 1000
Sample 350 out of 1000
Sample 400 out of 1000
Sample 450 out of 1000
Sample 500 out of 1000
Sample 550 out of 1000
Sample 600 out of 1000
Sample 650 out of 1000
Sample 700 out of 1000
Sample 750 out of 1000
