[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/masa3141/japanese-alpaca-lora/blob/master/notebooks/translate.ipynb)


# Translate
We translated [alpaca_data.json](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json) into Japanese using the ChatGPT API.
We paid around US $45 to translate the full dataset into Japanese. The translated data is available as [japanese_alpaca_data.json](https://github.com/masa3141/japanese-alpaca-lora/blob/main/data/japanese_alpaca_data.json).

In [None]:
# This notebook calls openai.ChatCompletion, which only exists in the pre-1.0
# client, so pin the install below 1.0.
!pip install "openai<1"

In [None]:
import openai
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import os

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so the secret does
# not have to be written into the notebook. Falls back to an empty string when
# the variable is unset; in that case, set the key here before translating.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [None]:
# Storing the data in a mounted Google Drive is recommended.
# -p keeps this cell from failing when the directories already exist, since
# the notebook is meant to be executed several times.
!mkdir -p translated_data translated_data/data translated_data/error

In [None]:
# A single run rarely translates every item, so the notebook has to be run
# several times. Record the output files that already exist so that later runs
# only translate the items that are still missing.
translated_files = {*os.listdir('translated_data/data')}

In [None]:
def translate_text(value):
    """Translate ``value`` into Japanese with the ChatGPT API.

    The text is sent single-quoted between ``<start>`` and ``<end>`` markers,
    which the system prompt tells the model to translate. Any markers the
    model echoes back are removed from the reply.

    Args:
        value: Source text to translate.

    Returns:
        The content of the first choice, with ``<start>``/``<end>`` removed
        and leading/trailing whitespace stripped.

    Raises:
        Any exception raised by the OpenAI client is propagated to the caller.
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "日本語に翻訳するAIアシスタントです。<start><end>で囲まれた文章を日本語に翻訳しなさい。"},
            {"role": "user", "content": f"<start>'{value}'<end>\n 日本語訳: "},
        ],
        max_tokens=1024,
        temperature=0,
    )
    reply = response.choices[0]["message"]["content"]
    # Remove the markers first and strip last; stripping first would leave
    # behind any whitespace that sat between a marker and the translation.
    return reply.replace("<start>", "").replace("<end>", "").strip()

def translate_item(item):
    """Return a copy of ``item`` with every non-empty value translated.

    Keys are kept as they are and in their original order. Truthy values are
    passed through ``translate_text``; falsy values (empty string, ``None``,
    ...) are stored as an empty string without calling the API.
    """
    return {
        field: (translate_text(text) if text else '')
        for field, text in item.items()
    }

def save_item(item, file_name):
    """Write ``item`` to ``file_name`` as pretty-printed JSON.

    Args:
        item: JSON-serialisable object to store.
        file_name: Path of the file to create or overwrite.
    """
    # ensure_ascii=False emits the Japanese text verbatim, so the file must be
    # opened as UTF-8 explicitly instead of relying on the platform default.
    with open(file_name, 'w', encoding='utf-8') as f:
        json.dump(item, f, ensure_ascii=False, indent=4)

def translate_save(item, i):
    """Translate item number ``i`` and store it, unless it already exists.

    Items whose output file name is listed in ``translated_files`` are
    skipped. Any exception raised while translating or saving is printed, and
    an empty marker file with the same name is created (or left in place)
    under ``translated_data/error``; the exception is not re-raised.
    """
    file_name = f"translated_{i}.json"
    if file_name in translated_files:
        return

    try:
        save_item(translate_item(item), f"translated_data/data/{file_name}")
    except Exception as err:
        print(f"{file_name}: {err}")
        # Append mode creates the marker if it is missing and leaves an
        # existing one untouched.
        open(f"translated_data/error/{file_name}", 'a').close()

In [None]:
# Upload alpaca_data.json to the working directory before running this cell.
# The file is read explicitly as UTF-8 so that loading does not depend on the
# platform's default encoding.
with open('alpaca_data.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [None]:
# Translate all items concurrently: one task per item on a 100-thread pool,
# with a progress bar that advances as tasks finish.
with ThreadPoolExecutor(max_workers=100) as executor:
    futures = set()
    for i, item in enumerate(data):
        futures.add(executor.submit(translate_save, item, i))

    for done in tqdm(as_completed(futures), desc="Translating", total=len(futures)):
        # result() re-raises anything that escaped translate_save.
        done.result()


## The translation does not fully succeed in one attempt, so the cell above has to be executed multiple times. Please repeat it until all files are translated. It took US $45 and 5 hours.

## After finishing translation, merge those files into one file

In [None]:
def merge_json_files(data_folder, num_items=52002):
    """Load the per-item translation files and return them as one list.

    Files are read in index order, from ``translated_0.json`` up to
    ``translated_{num_items - 1}.json``.

    Args:
        data_folder: Directory containing the ``translated_{i}.json`` files.
        num_items: Number of consecutive item files to merge. Defaults to
            52002, the count this notebook was originally hard-coded with.

    Returns:
        A list holding the parsed content of each file, ordered by index.

    Raises:
        FileNotFoundError: If one of the expected files does not exist, which
            means that item has not been translated yet.
    """
    merged_data = []
    for i in range(num_items):
        file_path = os.path.join(data_folder, f"translated_{i}.json")
        with open(file_path, 'r', encoding="utf-8") as file:
            merged_data.append(json.load(file))
    return merged_data

def write_merged_json_file(output_file, merged_data):
    """Serialise ``merged_data`` to ``output_file`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as they are instead of being escaped.
    """
    with open(output_file, mode='w', encoding="utf-8") as out:
        json.dump(merged_data, out, ensure_ascii=False, indent=2)

# Combine the per-item files into the final dataset file.
output_file = 'japanese_alpaca_data.json'
data_folder = 'translated_data/data'

merged_data = merge_json_files(data_folder=data_folder)
write_merged_json_file(output_file=output_file, merged_data=merged_data)