In [2]:
import json
import asyncio
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import google.generativeai as genai
from dotenv import load_dotenv
import os
# Load environment variables (python-dotenv) before reading the API key below.
load_dotenv()


# Configure the Gemini client. `os.environ[...]` raises KeyError when
# GEMINI_API_KEY is not set, so a missing key fails here rather than later.
genai.configure(api_key=os.environ['GEMINI_API_KEY'])
# Module-level model handle; translate_entry() calls
# `model.generate_content_async` on it.
model = genai.GenerativeModel("gemini-1.5-flash")


In [4]:
def _build_translation_prompt(entry):
    """Build the English-to-Catalan translation prompt for one dataset entry.

    The endings are rendered as JSON (not as a Python repr) so that the reply
    the model mirrors back can be parsed with ``json.loads``.

    Raises:
        KeyError: if ``entry`` lacks one of the expected keys.
    """
    endings_json = json.dumps(entry['endings'], ensure_ascii=False)
    return f"""
        Translate the following text from English to Catalan:
        1. activity_label: {entry['activity_label']}
        2. ctx_a: {entry['ctx_a']}
        3. ctx_b: {entry['ctx_b']}
        4. ctx: {entry['ctx']}
        5. endings: {endings_json}

        The output format should be:
        1: TRANSLATION OF ACTIVITY LABEL
        2: TRANSLATION OF CTX_A
        3: TRANSLATION OF CTX_B
        4: TRANSLATION OF CTX
        5: TRANSLATION OF ENDINGS

        As you can see, each translation is separated by a new line and indicated using 1: .
        Keep item 5 as a JSON list of strings on a single line.
        """


def _parse_translation(raw_text, original_endings):
    """Turn the model's numbered reply into a dict of translated fields.

    Lines are matched by their ``N:`` label (N in 1..5) instead of by position,
    so blank or unlabelled lines are ignored. Only the first colon is treated
    as the separator, so translations that themselves contain ``": "`` are kept
    whole. The first occurrence of each label wins.

    Args:
        raw_text: full text of the model reply.
        original_endings: the untranslated ``endings`` value, used to check
            that the translated list has the same number of items.

    Returns:
        dict with keys ``activity_label``, ``ctx_a``, ``ctx_b``, ``ctx``
        (strings) and ``endings`` (list of strings).

    Raises:
        ValueError / SyntaxError: if a label is missing or the endings cannot
            be parsed into a list of strings of the expected length.
    """
    import ast  # local import: only needed for the single-quote fallback below

    text_keys = {'1': 'activity_label', '2': 'ctx_a', '3': 'ctx_b', '4': 'ctx'}
    by_label = {}
    for line in raw_text.splitlines():
        label, separator, value = line.partition(':')
        label = label.strip()
        if separator and (label in text_keys or label == '5') and label not in by_label:
            by_label[label] = value.strip()

    missing = [label for label in ('1', '2', '3', '4', '5') if label not in by_label]
    if missing:
        raise ValueError(f"model reply is missing numbered lines: {missing}")

    try:
        endings = json.loads(by_label['5'])
    except json.JSONDecodeError:
        # The model sometimes answers with a Python-style list (single quotes);
        # literal_eval only evaluates literals, so this is safe on model output.
        endings = ast.literal_eval(by_label['5'])

    if not isinstance(endings, list) or not all(isinstance(item, str) for item in endings):
        raise ValueError(f"translated endings are not a list of strings: {endings!r}")
    if isinstance(original_endings, list) and len(endings) != len(original_endings):
        raise ValueError(
            f"expected {len(original_endings)} endings, got {len(endings)}"
        )

    parsed = {key: by_label[label] for label, key in text_keys.items()}
    parsed['endings'] = endings
    return parsed


async def translate_entry(entry, semaphore):
    """Translate one dataset entry to Catalan using the module-level ``model``.

    Args:
        entry: dict with keys ``activity_label``, ``ctx_a``, ``ctx_b``, ``ctx``
            and ``endings``.
        semaphore: ``asyncio.Semaphore`` (or compatible async context manager)
            bounding how many requests run at once.

    Returns:
        The same ``entry`` object with the five fields replaced by their
        translations, or ``None`` if anything failed (the error is printed).
        ``entry`` is only modified once the whole reply has parsed, so a failed
        entry is never left half-translated.
    """
    async with semaphore:
        try:
            # Built inside the try so a malformed entry (missing key) is
            # reported and skipped instead of aborting the whole gather().
            prompt = _build_translation_prompt(entry)

            response = await model.generate_content_async(
                prompt,
                safety_settings={
                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_ONLY_HIGH,
                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_ONLY_HIGH
                }
            )

            # `response.text` is the SDK's accessor for the reply text; the
            # previous `response.results[0].text` is not a response attribute.
            translated = _parse_translation(response.text, entry['endings'])

            entry.update(translated)
            return entry
        except Exception as e:
            print(f"{entry}.ERROR: {e}")
            return None

async def translate_dataset(input_file, output_file, max_concurrency=40):
    """Translate every entry of a JSON Lines file and write the successes.

    Args:
        input_file: path to a JSON Lines file, one entry per line. Blank lines
            are skipped.
        output_file: path the translated entries are written to, one JSON
            object per line, in input order. Overwritten if it exists.
        max_concurrency: maximum number of ``translate_entry`` calls allowed
            to run at the same time (default 40).

    Entries for which ``translate_entry`` returns ``None`` are left out of the
    output file.
    """
    semaphore = asyncio.Semaphore(max_concurrency)

    # Explicit UTF-8 so the result does not depend on the platform's default
    # encoding; the file is closed before the (long) network phase starts.
    with open(input_file, 'r', encoding='utf-8') as infile:
        entries = [json.loads(line) for line in infile if line.strip()]

    results = await asyncio.gather(
        *(translate_entry(entry, semaphore) for entry in entries)
    )
    translated_entries = [entry for entry in results if entry is not None]

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for entry in translated_entries:
            outfile.write(json.dumps(entry) + '\n')

In [5]:
async def main():
    """Run translate_dataset on the notebook's fixed input and output paths."""
    await translate_dataset('input_data.jsonl', 'translated_data.jsonl')

def _report_translation_outcome(task):
    """Print how the background translation task ended.

    Without this callback an exception raised inside main() is not reported
    when it happens, because the task is scheduled here and never awaited.
    """
    if task.cancelled():
        print("Translation task was cancelled.")
        return
    error = task.exception()
    if error is not None:
        print(f"Translation task failed: {error!r}")
    else:
        print("Translation task finished.")


# Schedule main() on the notebook's already-running event loop so the cell
# returns immediately. The task is bound to a name so it can be inspected,
# cancelled, or awaited later.
loop = asyncio.get_running_loop()
translation_task = loop.create_task(main())
translation_task.add_done_callback(_report_translation_outcome)
translation_task

<Task pending name='Task-5' coro=<main() running at C:\Users\Usuario\AppData\Local\Temp\ipykernel_21284\55873255.py:1>>

In [7]:
# NOTE(review): calling main() without awaiting it only builds a coroutine
# object (see the cell output below); the body of main() never runs. In a
# notebook cell, write `await main()` to actually execute it.
main()

<coroutine object main at 0x000001FF0D9F1620>