In [22]:
from tqdm.auto import tqdm
import json
from typing import List
import random
import re
import ast

In [25]:
def encode_string(text):
    return text.replace("\r", r"\r").replace("\n", r"\n").replace("\t", r"\t")


def decode_string(text):
    return text.replace("\\r", "\r").replace("\\n", "\n").replace("\\t", "\t")


def prepare_data_for_translation(original_json: str, output_path: str):
    with open(original_json, "r", encoding="utf-8") as original_file, open(
        output_path, "w", encoding="utf-8"
    ) as output_file:
        data = json.load(original_file)
        for example in tqdm(data):
            instruction = example["instruction"].strip()
            model_input = example["input"].strip()
            model_output = example["output"].strip()
            if len(instruction) > 0:
                print(
                    encode_string(instruction),
                    file=output_file,
                )
            if len(model_input) > 0:
                print(
                    encode_string(model_input),
                    file=output_file,
                )
            if len(model_output) > 0:
                print(
                    encode_string(model_output),
                    file=output_file,
                )


def is_numbers(text):
    return re.search("[a-zA-Z]", text) is None


def is_code(text):
    try:
        ast.parse(text)
    except SyntaxError:
        return False
    return True


def build_from_translation(original_json: str, translations: str, output_json: str):
    with open(original_json, "r", encoding="utf-8") as original_file, open(
        translations, "r", encoding="utf-8"
    ) as translation_file, open(output_json, "w", encoding="utf-8") as output_file:
        data = json.load(original_file)
        translations = translation_file.readlines()
        translations = [decode_string(line) for line in translations]
        output_dict = []
        i = 0
        for example in tqdm(data):
            translated_example = {"instruction": "", "input": "", "output": ""}
            instruction = example["instruction"].strip()
            model_input = example["input"].strip()
            model_output = example["output"].strip()

            if len(instruction) > 0:
                if is_numbers(instruction) or is_code(instruction):
                    translated_example["instruction"] = instruction
                else:
                    translated_example["instruction"] = translations[i]
                i += 1
            if len(model_input) > 0:
                if is_numbers(model_input) or is_code(model_input):
                    translated_example["input"] = model_input
                else:
                    translated_example["input"] = translations[i]
                i += 1
            if len(model_output) > 0:
                if is_numbers(model_output) or is_code(model_output):
                    translated_example["output"] = model_output
                else:
                    translated_example["output"] = translations[i]
                i += 1

            output_dict.append(translated_example)

        json.dump(output_dict, output_file, indent=4, ensure_ascii=False)


def merge_examples(json_paths: List[str], output_path: str):
    merged_examples = []
    for json_path in json_paths:
        with open(json_path, "r", encoding="utf-8") as json_path:
            data = json.load(json_path)
            merged_examples.extend(data)

    # shuffle
    random.shuffle(merged_examples)
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(merged_examples, output_file, indent=4, ensure_ascii=False)

In [26]:
prepare_data_for_translation(
    "../data/alpaca_data_cleaned.json", "../data/en.sentences.txt"
)

  0%|          | 0/51942 [00:00<?, ?it/s]

In [17]:
build_from_translation(
    "../data/alpaca_data_cleaned.json",
    "../data/en.sentences.txt",
    "../data/en.sanitycheck.json",
)

  0%|          | 0/51942 [00:00<?, ?it/s]

In [27]:
build_from_translation(
    "../data/alpaca_data_cleaned.json",
    "../data/spa_Latn.sentences.txt",
    "../data/spa_Latn.json",
)

  0%|          | 0/51942 [00:00<?, ?it/s]

IndexError: list index out of range

False