In [1]:
from os.path import join
def load_sents(src_lang : str, tgt_lang: str, folder: str) -> list[str]:
    src_file = f'{src_lang}-{tgt_lang}.{src_lang}'
    tgt_file = f'{src_lang}-{tgt_lang}.{tgt_lang}'
    src_path = join(folder, src_file)
    tgt_path = join(folder, tgt_file)
    
    with open(src_path, 'r') as f:
        src_sents = [ln.strip() for ln in f]
    
    with open(tgt_path, 'r') as f:
        tgt_sents = [ln.strip() for ln in f]
    
    return src_sents, tgt_sents

In [2]:
src_tgt = 'src_tgt_alignments'
src_mt = 'src_mt_alignments'

In [3]:
import json
with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

alignment_required = prefix2file.keys()
selected = [o for o in alignment_required if not o.startswith('opus')]
len(selected)

439

## Triplets

In [4]:
from scripts.post_process import post_triplet_align
selected.append('ep-gpt-it-el')  # add the missing piece
align_cnt = {}
for p in selected:
    dm, tl, src_lang, tgt_lang = p.split(
        '-')[0], p.split('-')[1], p.split('-')[2], p.split('-')[3]
    src_sents_o, tgt_sents_o = load_sents(
        src_lang, tgt_lang, join(src_tgt, f'{dm}'))
    src_sents_a, mt_sents_a = load_sents(
        src_lang, tgt_lang, join(src_mt, f'{dm}-{tl}'))

    cnt = post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=tgt_sents_o,
        mt_sents_ali=mt_sents_a,
        src_lang=src_lang,
        ref_lang=tgt_lang,
        folder_path=join('triplets', f'{dm}-{tl}')
    )
    align_cnt[f'{dm}-{tl}-{src_lang}-{tgt_lang}'] = cnt

384 sents aligned for da and en
377 sents aligned for de and en
392 sents aligned for el and en
382 sents aligned for en and da
386 sents aligned for en and de
380 sents aligned for en and el
377 sents aligned for en and es
381 sents aligned for en and fi
378 sents aligned for en and fr
366 sents aligned for en and it
377 sents aligned for en and nl
379 sents aligned for en and pt
378 sents aligned for en and sv
390 sents aligned for es and en
386 sents aligned for fi and en
383 sents aligned for fr and en
378 sents aligned for it and en
339 sents aligned for nl and en
389 sents aligned for pt and en
375 sents aligned for sv and en
386 sents aligned for da and en
368 sents aligned for de and en
393 sents aligned for el and en
382 sents aligned for en and da
386 sents aligned for en and de
380 sents aligned for en and el
377 sents aligned for en and es
381 sents aligned for en and fi
379 sents aligned for en and fr
366 sents aligned for en and it
377 sents aligned for en and nl
379 sent

In [5]:
max_pair = max(align_cnt.items(), key=lambda x: x[1])
min_pair = min(align_cnt.items(), key=lambda x: x[1])
max_pair, min_pair

(('flores-gpt-da-de', 428), ('ep-deepl-nl-it', 328))

In [6]:
flores_gpt = {k:v for k,v in align_cnt.items() if k.startswith('flores-gpt')}
flores_deepl = {k: v for k,v in align_cnt.items() if k.startswith('flores-deepl')}

ep_gpt = {k: v for k,v in align_cnt.items() if k.startswith('ep-gpt')}
ep_deepl = {k: v for k,v in align_cnt.items() if k.startswith('ep-deepl')}

In [7]:
acs = [flores_gpt, flores_deepl, ep_gpt, ep_deepl]
for ac in acs:
    max_pair = max(ac.items(), key=lambda x: x[1])
    min_pair = min(ac.items(), key=lambda x: x[1])
    mean = sum(ac.values()) / len(ac)
    print('Max', max_pair)
    print('Min', min_pair)
    print('Mean', f'{mean:.2f}')
    print()


Max ('flores-gpt-da-de', 428)
Min ('flores-gpt-nl-es', 390)
Mean 417.58

Max ('flores-deepl-da-fi', 428)
Min ('flores-deepl-nl-es', 390)
Mean 417.23

Max ('ep-gpt-el-fi', 396)
Min ('ep-gpt-nl-it', 329)
Mean 374.53

Max ('ep-deepl-el-fi', 395)
Min ('ep-deepl-nl-it', 328)
Mean 377.38



In [10]:
import re
def ok(pat, key):
    return re.search(pat, key)

pat1 = r'flores-gpt-\w\w-en'
pat2 = r'flores-deepl-\w\w-en'
pat3 = r'ep-gpt-\w\w-en'
pat4 = r'ep-deepl-\w\w-en'


In [11]:
flores_gpt_en = {k: v for k, v in align_cnt.items() if k.startswith('flores-gpt-en') or ok(pat1, k)}
flores_deepl_en = {k: v for k, v in align_cnt.items()
                if k.startswith('flores-deepl') or ok(pat2, k)}

ep_gpt_en = {k: v for k, v in align_cnt.items() if k.startswith('ep-gpt') or ok(pat3, k)}
ep_deepl_en = {k: v for k, v in align_cnt.items(
) if k.startswith('ep-deepl') or ok(pat4, k)}

In [12]:
acs = [flores_gpt_en, flores_deepl_en, ep_gpt_en, ep_deepl_en]
for ac in acs:
    max_pair = max(ac.items(), key=lambda x: x[1])
    min_pair = min(ac.items(), key=lambda x: x[1])
    mean = sum(ac.values()) / len(ac)
    print('Max', max_pair)
    print('Min', min_pair)
    print('Mean', f'{mean:.2f}')
    print()

Max ('flores-gpt-en-da', 427)
Min ('flores-gpt-nl-en', 394)
Mean 420.90

Max ('flores-deepl-da-fi', 428)
Min ('flores-deepl-nl-es', 390)
Mean 417.23

Max ('ep-gpt-el-fi', 396)
Min ('ep-gpt-nl-it', 329)
Mean 374.53

Max ('ep-deepl-el-fi', 395)
Min ('ep-deepl-nl-it', 328)
Mean 377.38

