# Phase 1
* Phase 1 of this project involves translation in 20 language directions, from and to English, across all three datasets.

## Setup
* We define the setup code that defines the folders and instantiates the tasks before running them.

In [1]:
from scripts.task import TranslationTask
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
from scripts.translators import GPTClient, DeeplClient
from scripts.logger import MyLogger
from os.path import join

# Define all English including pairs: start from the EuroParl pairs, add the
# reverse direction of every pair, then keep the directions that involve 'en'
possible = [tuple(pair.split('-')) for pair in EuroParlManager.EP_PAIRS]
extended = [(pair[1], pair[0]) for pair in possible]
possible = possible + extended
en_pairs = [pair for pair in possible if 'en' in pair]

# Define folder hierarchy of where translations should be stored
main_folder = 'tasks'
sub_folder = join(main_folder, 'phase1')

# Define the data managers and folders for translation storage
dms = [EuroParlManager(), FloresPlusManager(), Opus100Manager()]
# The dataset id is the last '/'-separated component of the manager's name
dm_ids = [dm.name.split('/')[-1] for dm in dms]
dm_folders = [join(sub_folder, dm_id) for dm_id in dm_ids]
# Nested registry, one empty dict per dataset id; the task-building cell
# stores each task under tasks[dm_id][client.model]
tasks = {dm_id : {} for dm_id in dm_ids}

# Define the clients and logger (both clients share the same logger)
logger = MyLogger(logfile=join(sub_folder, 'phase1_log.jsonl'))
client_gpt = GPTClient(logger=logger)
client_deepl = DeeplClient(logger=logger)
clients = [client_gpt, client_deepl]

# Check if there are indeed 20 distinct pairs. The assert makes a wrong pair
# list stop "Run All" here instead of only displaying False; the trailing
# expression keeps the check visible as the cell output.
assert len(set(en_pairs)) == len(en_pairs) == 20, (
    f'Expected 20 distinct English pairs, got {len(en_pairs)} '
    f'({len(set(en_pairs))} distinct)'
)
len(en_pairs) == 20

True

* To repeat: each task translates 20 language pairs, 400 sentences each
* We do this for 3 datasets and 2 translators, so 6 tasks in total

In [2]:
# Build one TranslationTask per (dataset, translator) combination and register
# it under tasks[dm_id][client.model]
for dm, folder, dm_id in zip(dms, dm_folders, dm_ids):
    for client in clients:
        task = TranslationTask(
            target_pairs=en_pairs,
            dm=dm,
            client=client,
            logger=logger,
            # One storage folder per dataset and per client model
            mt_folder=join(folder, client.model),
            num_of_sents=400
        )
        tasks[dm_id][client.model] = task

# Count the tasks that are actually registered rather than the loop
# iterations: two clients reporting the same `model` would overwrite each
# other in `tasks`, and an iteration counter could never notice that.
num_of_tasks = sum(len(tasks_by_model) for tasks_by_model in tasks.values())

# Fail fast (stops "Run All") if the registry is not the expected
# 3 datasets x 2 translators; the trailing expression keeps the cell output.
assert num_of_tasks == 6, f'Expected 6 registered tasks, got {num_of_tasks}'
num_of_tasks == 6

True

In [3]:
import random
# Sanity check: pick one translator and one dataset at random (in that order)
# and inspect the attributes of the matching task
some_client_name = random.choice(clients).model
some_dm_id = random.choice(dm_ids)

# Show which combination was drawn, then dump the task's instance attributes
print(some_dm_id, some_client_name)
vars(tasks[some_dm_id][some_client_name])

opus-100 deepl_document


{'store': 'tasks\\phase1\\opus-100\\deepl_document',
 'pairs': [('en', 'es'),
  ('da', 'en'),
  ('en', 'fr'),
  ('de', 'en'),
  ('el', 'en'),
  ('en', 'it'),
  ('en', 'pt'),
  ('en', 'nl'),
  ('en', 'sv'),
  ('en', 'fi'),
  ('es', 'en'),
  ('en', 'da'),
  ('fr', 'en'),
  ('en', 'de'),
  ('en', 'el'),
  ('it', 'en'),
  ('pt', 'en'),
  ('nl', 'en'),
  ('sv', 'en'),
  ('fi', 'en')],
 'dm': <scripts.data_management.Opus100Manager at 0x199bf7f0410>,
 'logger': <scripts.logger.MyLogger at 0x199c11dbec0>,
 'num_of_sents': 400,
 'client': <scripts.translators.DeeplClient at 0x199c12fb5f0>,
 'is_retry': False}

## Execution
* Most of the logging will be stored in `phase1_log.jsonl`
* We run one task per cell

In [5]:
# Show the dataset ids that form the first level of the task registry
tasks.keys()

dict_keys(['europarl', 'flores_plus', 'opus-100'])

### GPT4.1

In [None]:
# Run the task registered for dataset id 'flores_plus' and client model 'gpt-4.1'
# NOTE(review): presumably this issues the actual translation requests to the
# GPT client, so it may be slow and billable; confirm before re-running
tasks['flores_plus']['gpt-4.1'].run()