# Phase 1
* Phase 1 of this project involves translating 20 language directions, from and to English, across all three datasets.

## Setup
* We define the setup code that defines the folders and instantiates the tasks before running them.

In [None]:
from scripts.task import TranslationTask
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
from scripts.translators import GPTClient, DeeplClient
from scripts.logger import MyLogger
from os.path import join

# Enumerate every translation direction with English on one side:
# first (lang, 'en') for each key in sorted order, then the mirrored ('en', lang).
langs = Opus100Manager.EURO_ISO_2_PAIR.keys()
possible = [(lang, 'en') for lang in sorted(langs)]
extended = [(tgt, src) for src, tgt in possible]
possible += extended
en_pairs = possible

# Root folder for all tasks and the sub-folder holding this phase's translations
main_folder = 'tasks'
sub_folder = join(main_folder, 'phase1')

# One data manager per dataset. The dataset id is the last '/'-separated
# component of the manager's name; it names the storage folder and keys `tasks`.
dms = [EuroParlManager(), FloresPlusManager(), Opus100Manager()]
dm_ids = [manager.name.rsplit('/', 1)[-1] for manager in dms]
dm_folders = [join(sub_folder, dataset_id) for dataset_id in dm_ids]
tasks = {dataset_id: {} for dataset_id in dm_ids}

# Both translation clients share a single logger writing to one JSONL file
log_path = join(main_folder, 'phase1_log.jsonl')
logger = MyLogger(logfile=log_path)
cli_gpt = GPTClient(logger=logger)
cli_deepl = DeeplClient(logger=logger)
clients = [cli_gpt, cli_deepl]

# Cell output: True when exactly 20 directions were generated
len(en_pairs) == 20

* To recap: one task translates 400 sentences for each of the 20 language pairs
* We do this for 3 datasets and 2 translators, so 6 tasks in total

In [None]:
# Create one TranslationTask per (dataset, translator) combination and
# register it under tasks[dataset id][client model name].
num_of_tasks = 0
for dm, folder, dm_id in zip(dms, dm_folders, dm_ids):
    for client in clients:
        model_name = client.model
        # Translations of each client go into their own folder below the dataset folder
        output_folder = join(folder, model_name)
        task = TranslationTask(
            target_pairs=en_pairs,
            dm=dm,
            client=client,
            logger=logger,
            mt_folder=output_folder,
            num_of_sents=400,
            acceptable_range=(360, 480),
        )
        tasks[dm_id][model_name] = task
        num_of_tasks += 1

# Cell output: True when all 3 datasets x 2 translators were instantiated
num_of_tasks == 6

In [None]:
import random

# Sanity check: pick one random (dataset, translator) combination and
# display the attributes of the task stored for it.
model_names = [c.model for c in clients]
some_client_name = random.choice(model_names)
some_dm_id = random.choice(dm_ids)

print(some_dm_id, some_client_name)
picked_task = tasks[some_dm_id][some_client_name]
vars(picked_task)

## Execution
* Most of the logging will be stored in `phase1_log.jsonl`
* We just run each task per cell

In [None]:
# Display the dataset ids under which tasks are registered
tasks.keys()

### GPT4.1

In [None]:
# Look up the task registered for the 'europarl' dataset and the GPT client, then run it
europarl_tasks = tasks['europarl']
gpt_europarl_task = europarl_tasks[cli_gpt.model]
gpt_europarl_task.run()