# Post-Processing
* Manual, unstructured process: a proof of concept for how post-processing could be done.

In [5]:
from os.path import join
# Root folder of the 'proc1' task, with one sub-folder per corpus.
main_folder = join('tasks', 'proc1')
ep, opus, flores = (
    join(main_folder, corpus_name)
    for corpus_name in ('europarl', 'opus-100', 'flores_plus')
)

In [6]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
# Folder that receives the alignment output, again with one sub-folder per
# corpus (same corpus names as under `main_folder`).
tmp_folder = 'tmp'
tmp_ep, tmp_opus, tmp_flores = (
    join(tmp_folder, corpus_name)
    for corpus_name in ('europarl', 'opus-100', 'flores_plus')
)

# The next three lists are index-aligned: EuroParl, Opus-100, Flores+.
dms = [
    EuroParlManager(),
    Opus100Manager(),
    FloresPlusManager(),
]
mt_folders = [ep, opus, flores]
align_folders = [tmp_ep, tmp_opus, tmp_flores]

# Sub-folder name per translator, used below both input and output roots.
tl_folders = ['deepl_document', 'gpt-4.1-2025-04-14']

In [7]:
from scripts.post_process import direct_triplet_align
from scripts.util import load_sents
# Direct alignment of every stored translation.
#
# For each language pair, corpus and translator, the stored MT output is
# loaded and passed to `direct_triplet_align` together with 400
# source/reference sentences of that corpus. The result is written to
# <align_folder>/<translator>.
pairs = Opus100Manager.get_pairs()
for s, t in pairs:
    for dm, mt_store_folder, align_folder in zip(dms, mt_folders, align_folders):
        # The source/reference sentences depend only on the corpus and the
        # language pair, so they are fetched once here instead of once per
        # translator inside the inner loop.
        # NOTE(review): assumes get_sentence_pairs is deterministic and that
        # direct_triplet_align does not modify the lists it receives - confirm.
        src_sents, tgt_sents = dm.get_sentence_pairs(s, t, num_of_sents=400)
        for tl_folder in tl_folders:
            mt_folder = join(mt_store_folder, tl_folder)
            align_store_folder = join(align_folder, tl_folder)
            mt_sents = load_sents(mt_folder, s, t)
            direct_triplet_align(
                mt_sents=mt_sents,
                ref_sents=tgt_sents,
                src_sents=src_sents,
                src_lang=s,
                ref_lang=t,
                folder_path=align_store_folder
            )

In [8]:
import os
def _scan_jsonl_folder(folder):
    """List ``folder`` and map its ``*.jsonl`` files to their paths.

    Args:
        folder: Directory to scan (non-recursive).

    Returns:
        tuple: ``(names, label2file)`` where ``names`` is the raw
        ``os.listdir(folder)`` result and ``label2file`` maps each file name
        without its trailing ``.jsonl`` to ``join(folder, file_name)``.
        Entries that do not end in ``.jsonl`` (hidden files, checkpoint
        folders, ...) are left out of the mapping, and the mapping is built in
        sorted file-name order so it does not depend on directory order.
    """
    suffix = '.jsonl'
    names = os.listdir(folder)
    label2file = {
        # Strip the suffix only at the end of the name, not wherever it occurs.
        name[:-len(suffix)]: join(folder, name)
        for name in sorted(names)
        if name.endswith(suffix)
    }
    return names, label2file


# One (listing, label -> file) pair per corpus and translator.
flores_deepl, flores_deepl_l2f = _scan_jsonl_folder(join(tmp_flores, 'deepl_document'))
flores_gpt, flores_gpt_l2f = _scan_jsonl_folder(join(tmp_flores, 'gpt-4.1-2025-04-14'))
opus_deepl, opus_deepl_l2f = _scan_jsonl_folder(join(tmp_opus, 'deepl_document'))
opus_gpt, opus_gpt_l2f = _scan_jsonl_folder(join(tmp_opus, 'gpt-4.1-2025-04-14'))
ep_deepl, ep_deepl_l2f = _scan_jsonl_folder(join(tmp_ep, 'deepl_document'))
ep_gpt, ep_gpt_l2f = _scan_jsonl_folder(join(tmp_ep, 'gpt-4.1-2025-04-14'))

In [9]:
from scripts.scoring import ResultProducer
# One ResultProducer per corpus/translator combination, each built from the
# label -> file mapping of the matching folder (same order as before:
# EuroParl, Opus-100, Flores+; DeepL first, then GPT-4.1).
(
    rp_ep_deepl, rp_ep_gpt,
    rp_opus_deepl, rp_opus_gpt,
    rp_flores_deepl, rp_flores_gpt,
) = (
    ResultProducer(label2files=mapping)
    for mapping in (
        ep_deepl_l2f, ep_gpt_l2f,
        opus_deepl_l2f, opus_gpt_l2f,
        flores_deepl_l2f, flores_gpt_l2f,
    )
)

In [13]:
# Compute the scores for every producer.
# Index layout: 0-1 EuroParl, 2-3 Opus-100, 4-5 Flores+ (DeepL, then GPT-4.1).
rps = [
    rp_ep_deepl, rp_ep_gpt,
    rp_opus_deepl, rp_opus_gpt,
    rp_flores_deepl, rp_flores_gpt,
]
for rp in rps:
    rp.compute_results()

In [14]:
rp_ep_gpt.display_results()  # EuroParl, GPT-4.1 (same object as rps[1])

    Label       BLEU       chrF
0   da-en  34.265758  60.810718
1   de-en   7.483574  25.967502
2   el-en  34.108317  61.503165
3   en-da  34.598924  61.089082
4   en-de  27.088722  58.309062
5   en-el  28.540416  55.179982
6   en-es  36.721217  62.622892
7   en-fi   5.877664  29.960764
8   en-fr  33.207951  61.092516
9   en-it  27.096256  57.742429
10  en-nl  27.868022  56.994260
11  en-pt  23.898661  53.313087
12  en-sv  27.720709  58.059838
13  es-en   2.409827  20.934189
14  fi-en  31.805305  58.531921
15  fr-en  33.892693  60.612404
16  it-en  29.144336  57.073480
17  nl-en  29.693938  56.231841
18  pt-en  33.078499  59.841007
19  sv-en  33.809473  60.638831


In [None]:
rp_opus_gpt.display_results()  # Opus, GPT-4.1 (same object as rps[3])

    Label       BLEU       chrF
0   da-en  37.540643  59.712757
1   de-en  10.654713  22.085397
2   el-en  33.192299  54.431516
3   en-da  35.490453  61.309857
4   en-de  30.982175  54.541909
5   en-el  28.310343  52.370445
6   en-es  39.294456  61.688325
7   en-fi  20.247985  51.417513
8   en-fr  35.979858  60.723943
9   en-it  32.272354  57.125565
10  en-nl  30.348308  54.975438
11  en-pt  29.557788  55.337756
12  en-sv  30.547284  58.334569
13  es-en  43.104300  63.976862
14  fi-en  31.019251  50.670106
15  fr-en  39.515843  62.198332
16  it-en  36.832604  57.883906
17  nl-en  31.012149  53.106790
18  pt-en   0.105828   9.550674
19  sv-en  34.220010  56.374436


In [20]:
import json
# Load the log records of the 'proc1' task (one JSON object per line) and
# print every record whose number of output lines differs from its number of
# input lines.
#
# The file is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding, and blank lines (e.g. a trailing empty line)
# are skipped instead of making json.loads fail.
with open(join('tasks', 'proc1.jsonl'), 'r', encoding='utf-8') as f:
    logs = [json.loads(ln) for ln in f if ln.strip()]

for log in logs:
    if log['out_lines'] != log['in_lines']:
        print(log['src_lang'], log['tgt_lang'], log['out_lines'])

de en 398
es en 401
en fi 401
de en 372
pt en 402


In [22]:
# Translations to re-process, keyed '<src>-<tgt>-<corpus>'. Each value is the
# GPT-4.1 sub-folder of the corpus the label refers to.
target_files = {
    label: join(corpus_folder, 'gpt-4.1-2025-04-14')
    for label, corpus_folder in (
        ('de-en-ep', ep),
        ('es-en-ep', ep),
        ('en-fi-ep', ep),
        ('de-en-opus', opus),
        ('pt-en-opus', opus),
    )
}

In [23]:
from scripts.post_process import align_sents
from scripts.util import load_sents
# Re-align the selected translations against their source sentences.
# Labels have the form '<src>-<tgt>-<corpus>'; the corpus part picks both the
# data manager and the output folder.
out_folder = {'ep': 'tmp_ep', 'opus': 'tmp_opus'}
# NOTE: from here on `dms` is a dict keyed by corpus, no longer the list
# defined near the top of the notebook.
dms = {'ep': EuroParlManager(), 'opus': Opus100Manager()}

for label, mt_dir in target_files.items():
    s, t, p = label.split('-')
    dm = dms[p]
    # Only the source side is needed here; the reference half is discarded.
    src_sents, _ = dm.get_sentence_pairs(s, t, num_of_sents=400)
    mt_sents = load_sents(mt_dir, s, t)
    align_sents(src_sents, mt_sents, s, t, out_folder[p])

Source language: de, Number of sentences: 435
Target language: en, Number of sentences: 435
Embedding source and target text using paraphrase-multilingual-MiniLM-L12-v2 ...
Performing first-step alignment ...
Performing second-step alignment ...
Finished! Successfully aligned 435 de sentences to 435 en sentences

Source language: es, Number of sentences: 417
Target language: en, Number of sentences: 411
Embedding source and target text using paraphrase-multilingual-MiniLM-L12-v2 ...
Performing first-step alignment ...
Performing second-step alignment ...
Finished! Successfully aligned 417 es sentences to 411 en sentences

Source language: en, Number of sentences: 414
Target language: fi, Number of sentences: 415
Embedding source and target text using paraphrase-multilingual-MiniLM-L12-v2 ...
Performing first-step alignment ...
Performing second-step alignment ...
Finished! Successfully aligned 414 en sentences to 415 fi sentences

Source language: de, Number of sentences: 444
Target la

In [37]:
from scripts.post_process import post_triplet_align
def _read_stripped_lines(path):
    """Return every line of the text file at ``path``, whitespace-stripped."""
    # NOTE(review): opened with the platform default encoding, as before -
    # confirm this matches the encoding the files were written with.
    with open(path, 'r') as fh:
        return [ln.strip() for ln in fh]


# Label '<src>-<tgt>-<corpus>' -> folder holding the aligned files for it.
target_files = {
    'de-en-ep': 'tmp_ep',
    'es-en-ep': 'tmp_ep',
    'en-fi-ep': 'tmp_ep',
    'de-en-opus': 'tmp_opus',
    'pt-en-opus': 'tmp_opus',
}

dms = {'ep': EuroParlManager(), 'opus': Opus100Manager()}

for label, folder in target_files.items():
    s, t, p = label.split('-')
    dm = dms[p]
    src_sents, tgt_sents = dm.get_sentence_pairs(s, t, num_of_sents=400)

    # '<src>-<tgt>.<tgt>' is read as the aligned MT side and
    # '<src>-<tgt>.<src>' as the aligned source side.
    mt_sents_a = _read_stripped_lines(join(folder, f'{s}-{t}.{t}'))
    src_sents_a = _read_stripped_lines(join(folder, f'{s}-{t}.{s}'))

    post_triplet_align(
        src_sents_org=src_sents,
        src_sents_ali=src_sents_a,
        ref_sents_org=tgt_sents,
        mt_sents_ali=mt_sents_a,
        src_lang=s,
        ref_lang=t,
        folder_path=folder
    )

372 sents aligned for de and en
396 sents aligned for es and en
392 sents aligned for en and fi
339 sents aligned for de and en
381 sents aligned for pt and en


In [38]:
# Score the re-aligned translations. Each label '<src>-<tgt>-<corpus>' maps to
# '<src>-<tgt>.jsonl' inside the 'tmp_<corpus>' folder.
l2f = {
    f'{lang_pair}-{corpus}': join(f'tmp_{corpus}', f'{lang_pair}.jsonl')
    for lang_pair, corpus in (
        ('de-en', 'ep'),
        ('es-en', 'ep'),
        ('en-fi', 'ep'),
        ('de-en', 'opus'),
        ('pt-en', 'opus'),
    )
}

rp = ResultProducer(label2files=l2f)
rp.compute_results()
rp.display_results()

        Label       BLEU       chrF
0    de-en-ep  33.140530  59.340320
1    es-en-ep  36.063764  62.694386
2    en-fi-ep  19.333943  55.350342
3  de-en-opus  35.711588  56.957870
4  pt-en-opus  38.110482  59.786798
