# Logging
* Showcasing how the Logger is used in combination with the translation task
* We use the MockClient that was created for testing the TranslationTask


In [1]:
# Delete the 'tmp_test' folder (used as mt_folder below) if it exists, so the demo starts clean.
!rm -rf tmp_test

In [2]:
from test_translation_task import MockClient
from scripts.task import TranslationTask
from scripts.data_management import Opus100Manager
from scripts.logger import MyLogger, Retry
from io import StringIO
import json

# (source, target) language pairs translated in this demo.
pairs = [('de', 'en'), ('en', 'de'), ('fr', 'en'), ('en', 'fr')]

# The log is written to an in-memory buffer so later cells can read it back.
logfile = StringIO()

dm = Opus100Manager()
logger = MyLogger(logfile=logfile)

# The mock client gets planned fails for en-de and en-fr (pairs[1] and pairs[3]).
cli = MockClient(
    logger=logger,
    planned_fails=[pairs[1], pairs[3]],
)

task = TranslationTask(
    target_pairs=pairs, dm=dm, client=cli, logger=logger,
    mt_folder='tmp_test', num_of_sents=400,
)

In [3]:
# Translate all four pairs; the two pairs listed in planned_fails return only 200 of 400 lines.
task.run()

400 translated from de to en
200 translated from en to de
400 translated from fr to en
200 translated from en to fr


In [4]:
# Every line of the in-memory log file is parsed as one JSON document.
log_data = list(map(json.loads, logfile.getvalue().splitlines()))
for entry in log_data:
    print(entry)

{'git_hash': '36662df', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746022517.5436447, 'id': '931686fc-5b69-4ea1-84a4-bfaad44a30c2', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-04-30 16:15:17.600645+02:00', 'in_chars': 32731, 'in_tokens': 8295, 'end': 1746022517.6126444, 'end_timestamp': '2025-04-30 16:15:17.612644+02:00', 'time': 0.0689997673034668, 'out_chars': 32731, 'out_lines': 400, 'out_sents': 478, 'out_tokens': 15578}}
{'git_hash': '36662df', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'en', 'tgt_lang': 'de', 'start': 1746022517.6855094, 'id': 'd931772d-7dd6-44bc-b6fc-c450d0cf64ff', 'in_lines': 400, 'in_sents': 446, 'start_timestamp': '2025-04-30 16:15:17.734513+02:00', 'in_chars': 29414, 'in_tokens': 6994, 'e

In [5]:
def is_incomplete(translation):
    """Return True if a translation log entry has fewer output than input lines.

    Entries without an 'out_lines' field (e.g. calls that ended in an error)
    are not counted as incomplete, instead of raising a KeyError.
    """
    out_lines = translation.get('out_lines')
    return out_lines is not None and out_lines < translation['in_lines']


# Ids of the runs that need a manual retry. Comparing against 'in_lines'
# instead of a hard-coded 200 keeps the filter valid if num_of_sents changes.
log_ids = [
    entry['translation']['id']
    for entry in log_data
    if is_incomplete(entry['translation'])
]
log_ids

['d931772d-7dd6-44bc-b6fc-c450d0cf64ff',
 '63553348-01a7-4854-86f3-d0a192c55ef9']

In [6]:
# Both pairs are retried for the same reason; log_ids refers to the failed attempts.
retry_pairs = (pairs[1], pairs[3])
retry_reason = 'only half of expected output'
retry = Retry(
    pairs=list(retry_pairs),
    log_ids=log_ids,
    reasons=[retry_reason for _ in retry_pairs],
)

# A second logger writes to the same in-memory log file and carries the retry information.
new_logger = MyLogger(logfile=logfile, retry=retry)
cli = MockClient(logger=new_logger)  # no planned fails this time
task = TranslationTask(
    target_pairs=list(retry_pairs),
    dm=dm,
    client=cli,
    logger=new_logger,
    mt_folder='tmp_test',
    num_of_sents=400,
    manual_retry=True,
)

In [7]:
# Delete 'tmp_test' again before the manual retry (presumably to drop the files written by the first run).
!rm -rf tmp_test

In [8]:
# Re-run only the two retried pairs; this time both return all 400 lines.
task.run()

400 translated from en to de
400 translated from en to fr


In [9]:
# Re-read the whole in-memory log; it now also holds the entries written during the manual retry.
log_data = list(map(json.loads, logfile.getvalue().splitlines()))
for entry in log_data:
    print(entry)

{'git_hash': '36662df', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746022517.5436447, 'id': '931686fc-5b69-4ea1-84a4-bfaad44a30c2', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-04-30 16:15:17.600645+02:00', 'in_chars': 32731, 'in_tokens': 8295, 'end': 1746022517.6126444, 'end_timestamp': '2025-04-30 16:15:17.612644+02:00', 'time': 0.0689997673034668, 'out_chars': 32731, 'out_lines': 400, 'out_sents': 478, 'out_tokens': 15578}}
{'git_hash': '36662df', 'dataset': {'name': 'Helsinki-NLP/opus-100', 'num_of_sents': 400, 'start_idx': 0, 'split': 'test[:500]'}, 'translation': {'translator': 'mock', 'src_lang': 'en', 'tgt_lang': 'de', 'start': 1746022517.6855094, 'id': 'd931772d-7dd6-44bc-b6fc-c450d0cf64ff', 'in_lines': 400, 'in_sents': 446, 'start_timestamp': '2025-04-30 16:15:17.734513+02:00', 'in_chars': 29414, 'in_tokens': 6994, 'e

* When a translation has to be retried manually, the retry is logged like this, so that the log keeps a reference to the failed attempt.

## Automatic Retry

* If the translation fails mid-translation due to an error (i.e. the API call fails as a whole, rather than succeeding and returning malformed output), the code automatically retries the call. In those cases we do not add any additional log information, because the retry is fully justified.

In [10]:
# Delete 'tmp_test' once more so the automatic-retry demo starts without the folder from the previous section.
!rm -rf tmp_test

In [11]:
# Fresh setup for the automatic-retry demo: new in-memory log file, logger, client and task.
dm = Opus100Manager()
pairs = [('de', 'en'), ('en', 'de'), ('fr', 'en'), ('en', 'fr')]
logfile = StringIO()
logger = MyLogger(logfile=logfile)

# Planned errors: two for en-de (pairs[1]) and one for fr-en (pairs[2]).
cli = MockClient(
    logger=logger,
    planned_errors=[pairs[1], pairs[1], pairs[2]],
)

# Default retry_delay is 30s, we set it to 0
task = TranslationTask(
    target_pairs=pairs, dm=dm, client=cli, logger=logger,
    mt_folder='tmp_test', num_of_sents=400, retry_delay=0,
)

In [12]:
# Run all four pairs; each planned error is followed by an automatic retry (see the output below).
task.run()

400 translated from de to en
Error:
 MockError
Waiting 0 seconds before retrying...
Retrying...
Error:
 MockError
Waiting 0 seconds before retrying...
Retrying...
400 translated from en to de
Error:
 MockError
Waiting 0 seconds before retrying...
Retrying...
400 translated from fr to en
400 translated from en to fr


In [13]:
# Parse the log again and show only the 'translation' part of each entry.
log_data = list(map(json.loads, logfile.getvalue().splitlines()))
for entry in log_data:
    print(entry['translation'])

{'translator': 'mock', 'src_lang': 'de', 'tgt_lang': 'en', 'start': 1746022518.7894006, 'id': '7fe14658-cc5e-4c7a-bb15-3310efee5c0c', 'in_lines': 400, 'in_sents': 444, 'start_timestamp': '2025-04-30 16:15:18.844398+02:00', 'in_chars': 32731, 'in_tokens': 8295, 'end': 1746022518.8553987, 'end_timestamp': '2025-04-30 16:15:18.855398+02:00', 'time': 0.06599807739257812, 'out_chars': 32731, 'out_lines': 400, 'out_sents': 478, 'out_tokens': 15578}
{'translator': 'mock', 'src_lang': 'en', 'tgt_lang': 'de', 'start': 1746022518.9935327, 'id': '83efbbbc-84ae-4914-bb64-ada712519cbf', 'in_lines': 0, 'in_sents': 0, 'start_timestamp': '2025-04-30 16:15:18.995533+02:00', 'in_chars': 0, 'in_tokens': 0, 'error_msg': 'Translation en to de failed', 'error': 'MockError'}
{'translator': 'mock', 'src_lang': 'en', 'tgt_lang': 'de', 'start': 1746022519.0685322, 'id': '909caf92-f010-4689-8c79-84a50f8425b7', 'in_lines': 0, 'in_sents': 0, 'start_timestamp': '2025-04-30 16:15:19.072548+02:00', 'in_chars': 0, 'in

* The log shows that en-de failed twice and fr-en failed once before each succeeded on retry; the automatic logging is sufficient for such cases.
* A manual retry is only needed when the call does not raise an error but still returns unusable output (e.g. only half of the expected lines).