# Find and suggest synonyms for untranslated entries in Itak-uoeroskip

## Find untranslated entries

In [12]:
from typing import TypedDict, cast
import requests

req = requests.get("https://itak.aynu.org/api/gdoc")

class Entry(TypedDict):
    sheetName: str
    Aynu: str
    日本語: str
    English: str
    中文: str

itakuoeroskip = cast(list[Entry], req.json()["table"])

not_translated_in_english_or_chinese: list[Entry] = []
for entry in itakuoeroskip:
    if not entry.get("English", "") or not entry.get("中文", ""):
        not_translated_in_english_or_chinese.append(entry)


with open("../output/itakuoeroskip_untranslated.txt", "w") as f:
    for entry in not_translated_in_english_or_chinese:
        f.write(
            entry.get("日本語", "")
            + "\t"
            + entry.get("English", "")
            + "\t"
            + entry.get("中文", "")
            + "\t"
            + entry.get("Ainu", "")
            + "\n"
        )


In [13]:
print(len(not_translated_in_english_or_chinese))
for entry in not_translated_in_english_or_chinese:
    print(entry)

298
{'日本語': '年下のN1', 'English': '', '中文': '', 'Aynu': 'poniwne N1', 'sheetName': 'general_modifier'}
{'日本語': '壮年のN1、大人のN1', 'English': '', '中文': '', 'Aynu': 'sukup N1, rupne N1', 'sheetName': 'general_modifier'}
{'日本語': '年老いたN1', 'English': '', '中文': '', 'Aynu': 'onne N1', 'sheetName': 'general_modifier'}
{'日本語': '挨拶', 'English': 'greetings', '中文': '', 'Aynu': 'uerenkarap', 'sheetName': 'general_noun'}
{'日本語': 'N1を参考する、N1を参照する', 'English': 'refer to N1', '中文': '', 'Aynu': 'N1 a=toykonukar', 'sheetName': 'general_verb'}
{'日本語': 'N1にN2を任せる', 'English': '', '中文': '', 'Aynu': 'N1 N2 a=ekosi', 'sheetName': 'general_verb'}
{'日本語': 'N1を許す、N1許可する、N1を承認する', 'English': 'allow N1', '中文': '', 'Aynu': 'a=kosawnu, ekoramusawnu', '註 / Notes': '< アア, < 田村', 'sheetName': 'general_verb'}
{'日本語': 'V1もの', 'English': '', '中文': '', 'Aynu': 'V1-pe', 'sheetName': 'general_expression'}
{'日本語': 'V1人、V1者', 'English': '', '中文': '', 'Aynu': 'V1-kur', 'sheetName': 'general_expression'}
{'日本語': 'V1こと、V1ところ', 'Englis

## AI Translated Synonym Suggestion

In [4]:
from openai import OpenAI
import os
import dotenv

dotenv.load_dotenv()

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

model = "gpt-4o-mini-2024-07-18"

In [15]:
from pydantic import BaseModel

class MultiLangDictionaryEntry(BaseModel):
    japanese_expanded_synonyms: list[str]
    english_synonyms: list[str]
    traditional_chinese_synonyms: list[str]


# System instructions to the AI
system_prompt = """You are a helpful assistant that builds multi-language dictionary entries.
Given a list of Japanese words that approximate the nuance of a concept, 
you will:
1. Suggest additional Japanese synonyms (5 to 10 words) that best fit the given nuance.
2. Suggest suitable English synonyms (5 to 10 words) capturing the same concept.
3. Suggest suitable Chinese (in Traditional script used in Taiwan and Hong kong) synonyms (5 to 10 words) capturing the same concept.

Instructions:
- Do not repeat the original Japanese words that are provided.
- Provide synonyms as concise words or short phrases.
- Ensure that the synonyms reflect the same nuance or meaning.
- Output only the fields as specified in the schema.
- If there is placeholders like N1, V1, etc., keep them in other languages as well.
"""

def create_user_prompt(entry: Entry) -> str:
    japanese = entry.get("日本語", "")
    genre = entry.get("sheetName", "") or "general"
    return f"Input Japanese words from genre '{genre}': \"{japanese}\"."

### Example

In [47]:
# user_prompt = """Input Japanese words: "N1とN2を組み合わせる、N1とN2をつなげる、N1とN2を結びつける"."""

# completion = client.beta.chat.completions.parse(
#     model=model,
#     messages=[
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": user_prompt},
#     ],
#     response_format=MultiLangDictionaryEntry,
# )

# entry = completion.choices[0].message.parsed
# entry

MultiLangDictionaryEntry(japanese_expanded_synonyms=['N1とN2を連結する', 'N1とN2を融合する', 'N1とN2を関連づける', 'N1とN2を結合する', 'N1とN2を調和させる', 'N1とN2を一つにする', 'N1とN2を接続する'], english_synonyms=['combine N1 and N2', 'connect N1 and N2', 'link N1 and N2', 'join N1 and N2', 'merge N1 and N2', 'associate N1 with N2', 'unite N1 and N2'], traditional_chinese_synonyms=['將N1和N2結合', '連接N1與N2', '鏈接N1和N2', '聯合N1與N2', '合併N1與N2', '關聯N1和N2', '統一N1和N2'])

### Repeated calling

In [74]:
from typing import cast


LIMIT = 10

result: list[tuple[Entry, MultiLangDictionaryEntry]] = []

for untranslated in not_translated_in_english_or_chinese[:LIMIT]:
    if "日本語" in untranslated:
        if untranslated.get("日本語", ""):
            user_prompt = create_user_prompt(untranslated)

            completion = client.beta.chat.completions.parse(
                model="gpt-4o-mini-2024-07-18",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                response_format=MultiLangDictionaryEntry,
            )

            entry = cast(MultiLangDictionaryEntry, completion.choices[0].message.parsed)
            result.append((untranslated, entry))

    else:
        print("no japanese", untranslated)

KeyboardInterrupt: 

In [82]:
import re
from StarCC import PresetConversion
convert = PresetConversion(src="cn", dst="hk", with_phrase=False)

for untranslated, entry in result:
    print(f"{untranslated.get("Aynu", "")} ({untranslated.get("sheetName", "")})")
    print("-" * 100)
    print(
        f"{untranslated.get('日本語', '')} → {'、'.join(entry.japanese_expanded_synonyms)}"
    )
    print(
        f"英語: {untranslated.get('English', '')} → {', '.join(re.sub(r"^to ", "", r) for r in entry.english_synonyms)}"
    )
    print(
        f"中文: {untranslated.get('中文', '')} → {'，'.join(convert(e) for e in entry.traditional_chinese_synonyms)}"
    )
    print("=" * 100)

### Batch API

In [105]:
import pprint
import json
from openai.lib._pydantic import to_strict_json_schema

# Prepare batch file data
batch_data = [
    {
        "custom_id": str(index),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": create_user_prompt(entry)},
            ],
            "max_tokens": 1000,
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "multi_lang_dictionary_entry",
                    "schema": to_strict_json_schema(MultiLangDictionaryEntry),
                    "strict": True,
                },
            },
        },
    }
    for index, entry in enumerate(not_translated_in_english_or_chinese)
]

len(batch_data)
pprint.pprint(batch_data[0])

batch_file_path = "batch_requests.jsonl"
with open(batch_file_path, "w", encoding="utf-8") as batch_file:
    for item in batch_data:
        batch_file.write(json.dumps(item, ensure_ascii=False) + "\n")

print(
    f"Batch file created at {batch_file_path}. Please upload it to the OpenAI Batch API endpoint."
)

{'body': {'max_tokens': 1000,
          'messages': [{'content': 'You are a helpful assistant that builds '
                                   'multi-language dictionary entries.\n'
                                   'Given a list of Japanese words that '
                                   'approximate the nuance of a concept, \n'
                                   'you will:\n'
                                   '1. Suggest additional Japanese synonyms (5 '
                                   'to 10 words) that best fit the given '
                                   'nuance.\n'
                                   '2. Suggest suitable English synonyms (5 to '
                                   '10 words) capturing the same concept.\n'
                                   '3. Suggest suitable Chinese (in '
                                   'Traditional script used in Taiwan and Hong '
                                   'kong) synonyms (5 to 10 words) capturing '
                           

In [97]:
# test batch by directly sending request
import requests
import json
response = requests.post(
    "https://api.openai.com/v1/chat/completions",
    headers={"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"},
    json=batch_data[0]["body"],
)
print(
    MultiLangDictionaryEntry.model_validate(
        json.loads(response.json()["choices"][0]["message"]["content"])
    )
)

{'id': 'chatcmpl-AeDeNPz07hr13jMRb9X1q4L6EdMpm', 'object': 'chat.completion', 'created': 1734148955, 'model': 'gpt-4o-mini-2024-07-18', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': '{"japanese_expanded_synonyms":["若いN1","年齢差のあるN1","下のN1","弟N1","妹N1","後輩N1","小さいN1","年少のN1"],"english_synonyms":["younger N1","junior N1","subordinate N1","younger sibling N1","younger counterpart N1","lesser N1","age-difference N1","minor N1"],"traditional_chinese_synonyms":["年輕的N1","年少的N1","下級的N1","弟弟N1","妹妹N1","後輩N1","年齡差的N1","小的N1"]}', 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 296, 'completion_tokens': 153, 'total_tokens': 449, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'system_fingerprint': 'fp_6fc10e10eb'}


In [106]:
# Upload and create the batch
batch_input_file = client.files.create(
    file=open(batch_file_path, "rb"), purpose="batch"
)

batch_input_file_id = batch_input_file.id
batch = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "Multi-language dictionary batch job"},
)

print("batch created", batch_input_file_id)

batch created file-B62AxwJyyqYWdh5xFeFR4u


In [6]:
batch_id = batch.id
# batch_id = "batch_675d04f173e48190a5b271e7269326f1"

In [7]:
from openai.types.batch import Batch
from typing import cast

batch_status = cast(Batch, client.batches.retrieve(batch_id=batch_id))
print(batch_status)

Batch(id='batch_675d04f173e48190a5b271e7269326f1', completion_window='24h', created_at=1734149361, endpoint='/v1/chat/completions', input_file_id='file-B62AxwJyyqYWdh5xFeFR4u', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1734151738, error_file_id=None, errors=None, expired_at=None, expires_at=1734235761, failed_at=None, finalizing_at=1734151713, in_progress_at=1734149362, metadata={'description': 'Multi-language dictionary batch job'}, output_file_id='file-EBSfgLQggnhGg334gt9GUt', request_counts=BatchRequestCounts(completed=298, failed=0, total=298))


In [8]:
from openai.types.batch_request_counts import BatchRequestCounts
request_counts = cast(BatchRequestCounts, batch_status.request_counts)
print(f"Batch status: {batch_status.status}, {request_counts.completed}/{request_counts.total} ({request_counts.failed})")


Batch status: completed, 298/298 (0)


In [9]:
if batch_status.status == "completed":
    print("Batch completed")
    output_file_id = batch_status.output_file_id
    if output_file_id:
        output_file_response = client.files.content(output_file_id)
        output_file_path = "batch_responses.jsonl"
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.write(output_file_response.text)
        print(f"Batch results saved to {output_file_path}")
    else:
        print("No output file found for completed batch.")
elif batch_status.status == "in_progress":
    print("Batch is still in progress")
else:
    print(f"Batch ended with status: {batch_status.status}")


Batch completed
Batch results saved to batch_responses.jsonl


In [16]:
import json
# Instructions to handle responses
response_file_path = "batch_responses.jsonl"  # Replace with the actual response file path after batch completion
try:
    with open(response_file_path, "r", encoding="utf-8") as response_file:
        responses = [json.loads(line) for line in response_file]

    # Parse and map results
    result = []
    for response, original_entry in zip(responses, not_translated_in_english_or_chinese):
        if "response" in response and response["response"]["status_code"] == 200:
            parsed = MultiLangDictionaryEntry.model_validate(
                json.loads(response["response"]["body"]["choices"][0]["message"]["content"])
            )
            result.append((original_entry, parsed))

    # Output result
    for original, parsed in result:
        print("Original Entry:", original)
        print("Parsed Entry:", parsed.dict())

except FileNotFoundError:
    print(
        f"Response file not found: {response_file_path}. Please ensure the responses are downloaded."
    )

Original Entry: {'日本語': '年下のN1', 'English': '', '中文': '', 'Aynu': 'poniwne N1', 'sheetName': 'general_modifier'}
Parsed Entry: {'japanese_expanded_synonyms': ['年下の人', '若い人', '後輩', '青年', '未成年', '若者', '子供', '少年', '青年層'], 'english_synonyms': ['younger person', 'youth', 'junior', 'youngster', 'minor', 'young generation', 'child', 'teenager', 'youthful'], 'traditional_chinese_synonyms': ['年輕人', '青春', '後輩', '少年', '未成年', '小孩', '青少年', '年輕世代', '青年']}
Original Entry: {'日本語': '壮年のN1、大人のN1', 'English': '', '中文': '', 'Aynu': 'sukup N1, rupne N1', 'sheetName': 'general_modifier'}
Parsed Entry: {'japanese_expanded_synonyms': ['中年のN1', '成熟したN1', '大柄なN1', '逞しいN1', '力強いN1', '実年齢のN1', '円熟したN1', '世代のN1', '進化したN1'], 'english_synonyms': ['middle-aged N1', 'mature N1', 'robust N1', 'sturdy N1', 'strong N1', 'aged N1', 'seasoned N1', 'developed N1', 'vintage N1'], 'traditional_chinese_synonyms': ['中年N1', '成熟的N1', '健壯的N1', '堅固的N1', '強壯的N1', '年長的N1', '經驗豐富的N1', '發展的N1', '經典的N1']}
Original Entry: {'日本語': '年老いたN1

/tmp/ipykernel_704715/3969079110.py:20: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  print("Parsed Entry:", parsed.dict())


In [18]:
import re
from StarCC import PresetConversion
import sys

convert = PresetConversion(src="cn", dst="hk", with_phrase=False)

sys.stdout = open("../output/untranslated_output.txt", "w", encoding="utf-8")
for untranslated, entry in result:
    print(f"{untranslated.get("Aynu", "")} ({untranslated.get("sheetName", "")})")
    print("-" * 100)
    print(
        f"{untranslated.get('日本語', '')} → {'、'.join(entry.japanese_expanded_synonyms)}"
    )
    print(
        f"英語: {untranslated.get('English', '')} → {', '.join(re.sub(r"^to ", "", r) for r in entry.english_synonyms)}"
    )
    print(
        f"中文: {untranslated.get('中文', '')} → {'，'.join(convert(e) for e in entry.traditional_chinese_synonyms)}"
    )
    print("=" * 100)
sys.stdout.close()
sys.stdout = sys.__stdout__