# Logging
* Showcasing how the Logger is used in combination with the translation task
* We use the MockClient that was created for testing the TranslationTask


In [1]:
# Remove any leftover 'tmp_test' output folder so the demo starts from a clean state.
!rm -rf tmp_test

In [2]:
from test_translation_task import MockClient
from scripts.task import TranslationTask
from scripts.data_management import Opus100Manager
from scripts.logger import MyLogger, RetryLog
from io import StringIO
import json

# Language pairs to translate, written as (source, target) codes.
pairs = [('de', 'en'), ('en', 'de'), ('fr', 'en'), ('en', 'fr')]
dm = Opus100Manager()

# Log into an in-memory buffer so the entries can be inspected further down.
logfile = StringIO()
logger = MyLogger(logfile=logfile)

# The mock client is scripted to misbehave: a planned error for de-en and
# planned fails for en-de and en-fr.
cli = MockClient(
    logger=logger,
    planned_fails=[pairs[idx] for idx in (1, 3)],
    planned_errors=pairs[:1],
)

task = TranslationTask(
    client=cli,
    dm=dm,
    logger=logger,
    target_pairs=pairs,
    num_of_sents=400,
    mt_folder='tmp_test',
    # The default retry_delay is 30s; use 0 so the demo does not wait between retries.
    retry_delay=0,
)

In [3]:
# Execute the task. The mock client was given a planned error for de-en and planned
# fails for en-de and en-fr, so the progress output below includes retries.
task.run()

[⚠️]: Error MockError
[⏲️]: Retrying de-en...
[✔️]: 400 translated from de to en
[❌]: Output for en-de is not acceptable!
[⏲️]: Retrying en-de...
[✔️]: 400 translated from en to de
[✔️]: 400 translated from fr to en
[❌]: Output for en-fr is not acceptable!
[⏲️]: Retrying en-fr...
[✔️]: 400 translated from en to fr


In [4]:
# Every line in the log buffer is one JSON document; decode them all, then show each entry.
raw_lines = logfile.getvalue().splitlines()
log_data = list(map(json.loads, raw_lines))
for entry in log_data:
    print(entry)

{'git_hash': 'd0b94fe', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746105396.328032, 'id': '7bfb1773-a532-48ea-b242-2bea240a82e0', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-05-01 15:16:36.401987+02:00', 'in_chars': 32731, 'in_tokens': 8295}, 'verdict': {'failure': 'Translation failed', 'error': 'MockError'}}
{'git_hash': 'd0b94fe', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746105396.4209864, 'id': 'beb84349-0159-4c9c-97bd-71575ad245ec', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-05-01 15:16:36.491985+02:00', 'in_chars': 32731, 'in_tokens': 8295, 'end': 1746105396.5129905, 'end_timestamp': '2025-05-01 15:16:36.512990+02:00', 'time': 0.09200406074523926, 'out_chars'

In [5]:
import os

# os.listdir() returns entries in arbitrary order; sort them so the listing
# is reproducible regardless of platform or filesystem.
sorted(os.listdir('tmp_test'))

['de-en.txt',
 'en-de.txt',
 'en-de_fail1.txt',
 'en-fr.txt',
 'en-fr_fail1.txt',
 'fr-en.txt']

* We keep the ones that we consider as failures for documentation purposes
* If the failure was caused by an error, there is nothing left to document, so there is no fail file for `de-en`

In [6]:
# Pick the translation ids of the third-to-last and the last log entry.
# NOTE(review): presumably these are the final fr-en and en-fr entries, matching the
# pairs handed to RetryLog in the next section; confirm against the printed log.
log_ids = [log_data[idx]['translation']['id'] for idx in (-3, -1)]
log_ids

['18f431ca-b266-4fa9-81a9-86ea0c928a85',
 '740c37ac-5efa-4918-925a-4e16a334b7c7']

## Manual Retry
* In case we still think we need to retry the call, we have to start a new task and specify the log ids of the translations we want to compute again
* This provides an adequate level of transparency

In [7]:
# Describe the manual retry: which pairs, which earlier log entries they refer to, and why.
retry = RetryLog(
    pairs=pairs[-2:],
    log_ids=log_ids,
    reasons=[
        'BLEU score single digit',
        'Insufficient output, failed to detect automatically',
    ],
)

# A second logger writes into the same buffer and carries the retry information.
new_logger = MyLogger(logfile=logfile, retry=retry)

# This mock client has no planned fails or errors.
cli = MockClient(logger=new_logger)

task = TranslationTask(
    client=cli,
    dm=dm,
    logger=new_logger,
    target_pairs=pairs[-2:],
    num_of_sents=400,
    mt_folder='tmp_test',
    manual_retry=True,
)

In [8]:
# Delete the files written by the first run before starting the manual retry.
# NOTE(review): presumably so the retry writes fresh outputs; confirm the task does not need the earlier files.
!rm -rf tmp_test

In [9]:
# Run the manual-retry task for the two selected pairs; this mock client has no planned fails or errors.
task.run()

[✔️]: 400 translated from fr to en
[✔️]: 400 translated from en to fr


In [10]:
# Re-read the shared log buffer, which now also holds the entries written by the manual retry.
log_data = []
for raw_line in logfile.getvalue().splitlines():
    log_data.append(json.loads(raw_line))

for entry in log_data:
    print(entry)

{'git_hash': 'd0b94fe', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746105396.328032, 'id': '7bfb1773-a532-48ea-b242-2bea240a82e0', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-05-01 15:16:36.401987+02:00', 'in_chars': 32731, 'in_tokens': 8295}, 'verdict': {'failure': 'Translation failed', 'error': 'MockError'}}
{'git_hash': 'd0b94fe', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746105396.4209864, 'id': 'beb84349-0159-4c9c-97bd-71575ad245ec', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-05-01 15:16:36.491985+02:00', 'in_chars': 32731, 'in_tokens': 8295, 'end': 1746105396.5129905, 'end_timestamp': '2025-05-01 15:16:36.512990+02:00', 'time': 0.09200406074523926, 'out_chars'

* The last log entries have a `manual_retry` field, indicating that they were added post translation

## Limit
* To avoid burning through our money, we set a limit on the number of retries

In [11]:
# Clear the 'tmp_test' output folder before the retry-limit demo.
!rm -rf tmp_test

In [12]:
# Fresh pairs, data manager, log buffer and logger for this section.
pairs = [('de', 'en'), ('en', 'de')]
dm = Opus100Manager()
logfile = StringIO()
logger = MyLogger(logfile=logfile)

# en-de is scripted with one planned error and two planned fails; de-en has none.
cli = MockClient(
    logger=logger,
    planned_fails=[pairs[1]] * 2,
    planned_errors=pairs[1:2],
)

task = TranslationTask(
    client=cli,
    dm=dm,
    logger=logger,
    target_pairs=pairs,
    num_of_sents=400,
    mt_folder='tmp_test',
    retry_delay=0,
    # Upper bound on retries for a pair.
    max_retries=2,
)

task.run()

[✔️]: 400 translated from de to en
[⚠️]: Error MockError
[⏲️]: Retrying en-de...
[❌]: Output for en-de is not acceptable!
[⏲️]: Retrying en-de...
[❌]: Output for en-de is not acceptable!
[⏩]: Failed 2 times, skipping en-de...


In [13]:
# Final cleanup: remove the temporary output folder created by the demo.
!rm -rf tmp_test