## Direct Triplets
* No alignment is performed on the text; we simply place source, reference and translation side by side based on their order (line by line) in the dataset and in the translation output.
* I.e., assume that translators preserved the alignment.

In [1]:
from scripts.util import get_env_variables
from os.path import join
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
# Destination folder for the direct (unaligned) triplets.
triplet_folder = get_env_variables('TRIPLETS')
dst_path = join(triplet_folder, 'direct_triplets')

# For every corpus: an instantiated data manager and the language pairs
# reported by that manager class.
_manager_classes = {
    'opus': Opus100Manager,
    'ep': EuroParlManager,
    'flores': FloresPlusManager,
}
parts = {
    name: {'dm': manager_cls(), 'pairs': manager_cls.get_pairs()}
    for name, manager_cls in _manager_classes.items()
}
translators = ['gpt', 'deepl']

In [2]:
from scripts.post_process import direct_triplet_align, load_sents_from_file
# Build direct triplets for every dataset / language pair / translator and
# remember the count returned by direct_triplet_align for each file.
fn2align_cnt_direct = {}
for dataset, content in parts.items():
    dm, pairs = content['dm'], content['pairs']
    for s, t in pairs:
        for translator in translators:
            # File names follow '<dataset>-<translator>-<src>-<tgt>'.
            filename = f'{dataset}-{translator}-{s}-{t}'
            mt_sents = load_sents_from_file(
                folder='translations', filename=filename)
            src_sents, tgt_sents = dm.get_sentence_pairs(
                s, t, num_of_sents=400)
            fn2align_cnt_direct[filename] = direct_triplet_align(
                mt_sents=mt_sents,
                src_sents=src_sents,
                ref_sents=tgt_sents,
                folder_path=dst_path,
                filename=filename,
            )

In [3]:
from collections import defaultdict
# Histogram: triplet count -> number of files that ended up with that count.
cnt_freq = defaultdict(int)
for align_cnt in fn2align_cnt_direct.values():
    cnt_freq[align_cnt] += 1

for align_cnt, n_files in sorted(cnt_freq.items()):
    print(align_cnt, n_files)

372 1
398 1
399 20
400 458


In [4]:
import json
# Persist the per-file counts; a later cell reloads them from this file.
with open('direct_cnt.json', 'w') as out_file:
    json.dump(fn2align_cnt_direct, out_file, indent=4)

* Direct alignment only removes empty strings if there are any
* Most of the time, we have 400 aligned triplets

## Aligned Triplets WITHOUT Sentence Splitting
* Alignments were computed in this notebook: [Alignments_No_Sent_Split.ipynb](https://colab.research.google.com/drive/1867NBRM7ixgiVmeznqRf4oh9nYDd4D5S?usp=sharing)

In [5]:
from scripts.post_process import post_triplet_align, load_aligned_sents_from_file
from scripts.util import get_env_variables
import os
from os.path import join
aligned_folder = get_env_variables('ALIGNMENTS')
triplet_folder = get_env_variables('TRIPLETS')
# Source-to-translation alignments computed WITHOUT sentence splitting.
src2hyp_split_fo = join(aligned_folder, 'source2translations_no_sent_split')
dst_path = join(triplet_folder, 'aligned_triplets_no_sent_split')
# Only '*.jsonl' entries are alignment files: anything else in the folder
# (hidden files, checkpoints) would break the later fn.split('-').
# Strip the suffix from the end only, and sort because os.listdir returns
# entries in arbitrary order.
suffix = '.jsonl'
filenames = sorted(
    entry[:-len(suffix)]
    for entry in os.listdir(src2hyp_split_fo)
    if entry.endswith(suffix)
)
len(filenames)

480

In [6]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager

# Data manager per corpus, selected through the dataset prefix of a file name.
dms = {
    'ep': EuroParlManager(),
    'flores': FloresPlusManager(),
    'opus': Opus100Manager()
}

fn2align_cnt_no_sent_split = {}
fn2discard_no_sent_split = {}
cases = ['ep-gpt', 'ep-deepl', 'flores-gpt',
         'flores-deepl', 'opus-gpt', 'opus-deepl']
case2align_cnts_no_sent_split = {case: [] for case in cases}

for fn in filenames:
    dataset, translator, s, t = fn.split('-')
    # Aligned source/translation sentences produced without sentence splitting.
    src_sents_a, mt_sents_a = load_aligned_sents_from_file(
        fn, folder=src2hyp_split_fo)
    # Original source/reference sentences as delivered by the dataset.
    src_sents_o, ref_sents_o = dms[dataset].get_sentence_pairs(
        s, t, num_of_sents=400)
    align_cnt, discarded = post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=ref_sents_o,
        mt_sents_ali=mt_sents_a,
        folder_path=dst_path,
        filename=fn)

    fn2align_cnt_no_sent_split[fn] = align_cnt
    fn2discard_no_sent_split[fn] = discarded
    case2align_cnts_no_sent_split[f'{dataset}-{translator}'].append(align_cnt)

# Per dataset/translator combination: min, max and mean triplet count.
for case, counts in case2align_cnts_no_sent_split.items():
    print(
        case,
        f'min: {min(counts)}',
        f'max: {max(counts)}',
        f'mean: {sum(counts) / len(counts):.2f}',
        '',
        sep='\n',
    )

ep-gpt
min: 383
max: 398
mean: 396.31

ep-deepl
min: 369
max: 398
mean: 395.77

flores-gpt
min: 372
max: 400
mean: 396.65

flores-deepl
min: 374
max: 400
mean: 396.61

opus-gpt
min: 372
max: 400
mean: 397.85

opus-deepl
min: 395
max: 400
mean: 398.70



In [7]:
from collections import defaultdict
# Histogram: triplet count -> number of files that ended up with that count.
cnt_freq = defaultdict(int)
for align_cnt in fn2align_cnt_no_sent_split.values():
    cnt_freq[align_cnt] += 1

for align_cnt, n_files in sorted(cnt_freq.items()):
    print(align_cnt, n_files)

369 1
372 2
374 19
383 2
384 1
391 1
392 2
393 8
394 13
395 38
396 36
397 144
398 47
399 32
400 134


In [8]:
import json
# Persist the per-file counts; a later cell reloads them from this file.
with open('aligned_cnt_no_sent_split.json', 'w') as out_file:
    json.dump(fn2align_cnt_no_sent_split, out_file, indent=4)

### Investigating Loss of Text
* If we apply `bertalign` WITHOUT sentence splitting to all translations, we lose text in some cases.
* The 'biggest' loss is 391-369=22 sentences for `ep-deepl-sv-de`

In [9]:
# Number of lines in the raw translation file.
!cat translations/ep-deepl-sv-de.txt | wc -l

400


In [10]:
# Number of lines in the alignment file (source -> translation, no sentence splitting).
!cat $src2hyp_split_fo/ep-deepl-sv-de.jsonl | wc -l

391


In [11]:
# Number of lines in the triplet file written by post_triplet_align.
!cat $dst_path/ep-deepl-sv-de.jsonl | wc -l

369


In [12]:
from scripts.data_management import EuroParlManager
from scripts.post_process import load_aligned_sents_from_file, load_sents_from_file
# Load the raw material for the worst case (ep-deepl-sv-de): the translated
# lines plus the original source/reference sentences from the dataset.
mt_sents = load_sents_from_file(filename='ep-deepl-sv-de', folder='translations')
dm = EuroParlManager()
src_sents, tgt_sents = dm.get_sentence_pairs('sv', 'de', num_of_sents=400)
# Show a small window of the original source sentences.
src_sents[100:106]

['Vi skall rösta om begäran från PPE-DE-gruppen som syftar till att stryka den muntliga frågan om kapitalskatt från föredragningslistan.',
 '(Parlamentet avslog begäran med 164 röster för, 166 emot. 7 ledamöter avstod från att rösta.)',
 'Fru talman! Jag skulle vilja tacka Poettering för att han just gjort reklam för denna debatt.',
 'Tack.',
 'Fru talman!',
 'Jag undrar om även min röst har räknats, trots att den inte kunde avges på elektronisk väg, eftersom jag inte har något kort?']

In [13]:
# Reference sentences for the same index window.
tgt_sents[100:106]

['Wir stimmen jetzt über den Antrag der PPE/DE-Fraktion ab, die mündliche Anfrage über die Kapitalsteuer von der Tagesordnung abzusetzen.',
 '(Das Parlament lehnt den Antrag mit 164 Ja-Stimmen, 166 Nein-Stimmen und 7 Enthaltungen ab.)',
 'Frau Präsidentin, ich möchte Herrn Poettering für das Rühren der Werbetrommel zugunsten dieser Aussprache danken.',
 'Vielen Dank.',
 'Frau Präsidentin!',
 'Ist meine Stimme mitgezählt worden? Ich konnte sie nämlich nicht elektronisch abgeben, weil ich die Karte nicht habe.']

In [14]:
# Translated lines for the same index window.
mt_sents[100:106]

['Wir werden über den Antrag der Fraktion der Europäischen Volkspartei (Christdemokraten) und europäischer Demokraten abstimmen, die mündliche Anfrage zur Gesellschaftssteuer von der Tagesordnung abzusetzen.',
 '(Das Parlament lehnt den Antrag mit 164 gegen 166 Stimmen ab. 7 Abgeordnete enthalten sich der Stimme).',
 'Frau Präsidentin, ich möchte Herrn Poettering dafür danken, dass er diese Aussprache soeben angekündigt hat.',
 'Vielen Dank, Herr Pöttering.',
 'Frau Präsidentin, ich möchte Folgendes fragen',
 'Ich frage mich, ob meine Stimme auch gezählt worden ist, obwohl sie nicht elektronisch abgegeben werden konnte, weil ich keine Karte habe?']

* The direct alignment for this specific instance is correct, however, we already observe a slight difference between reference and translation
* `Vielen Dank` vs `Vielen Dank, Herr Pöttering.`

In [15]:
# Same file after alignment (no sentence splitting): aligned source and
# translation sentences.
src_sents_ali, mt_sents_ali = load_aligned_sents_from_file(
    'ep-deepl-sv-de', folder=src2hyp_split_fo)
src_sents_ali[100:107]

['(Parlamentet avslog begäran med 164 röster för, 166 emot. 7 ledamöter avstod från att rösta.)',
 'Fru talman! Jag skulle vilja tacka Poettering för att han just gjort reklam för denna debatt.',
 '',
 'Tack. Fru talman!',
 'Jag undrar om även min röst har räknats, trots att den inte kunde avges på elektronisk väg, eftersom jag inte har något kort?',
 'Jag röstade "för".',
 'Om man lägger till de två kolleger som yttrat sig blir resultatet...']

In [16]:
# Aligned translation sentences for the same index window.
mt_sents_ali[100:107]

['(Das Parlament lehnt den Antrag mit 164 gegen 166 Stimmen ab. 7 Abgeordnete enthalten sich der Stimme).',
 'Frau Präsidentin, ich möchte Herrn Poettering dafür danken, dass er diese Aussprache soeben angekündigt hat.',
 'Vielen Dank, Herr Pöttering.',
 'Frau Präsidentin, ich möchte Folgendes fragen',
 'Ich frage mich, ob meine Stimme auch gezählt worden ist, obwohl sie nicht elektronisch abgegeben werden konnte, weil ich keine Karte habe?',
 'Ich habe mit "Ja" gestimmt.',
 'Wenn Sie die beiden Kollegen, die sich zu Wort gemeldet haben, hinzuzählen, lautet das Ergebnis...']

* Alignment with `bertalign` works but is dependent on the translation. It merged the source sentences together to get a 2-1 alignment with:
```
'Tack. Fru talman!' <->  'Vielen Dank, Herr Pöttering.',
```
* As a consequence, we see an empty string in the aligned source sentences
* Furthermore, since the original source sentences do not contain the string `'Tack. Fru talman!'`, it will be discarded, as triplet alignments are created by matching the sentences from the original source strings to the aligned source strings.

## Aligned Triplets WITH Sentence Splitting
* The alignments were computed in this notebook: [Alignments.ipynb](https://colab.research.google.com/drive/1xlwQPctsOGjZB2NpB9WNtzWPae_Oj4gt?usp=sharing)
* Note: To make alignments with sentence splitting, we had to align source to reference AND source to translation, whereas before only source to translation was enough, as we could use the alignments provided by the respective datasets directly. 

In [17]:
from scripts.post_process import post_triplet_align, load_aligned_sents_from_file
import os
from scripts.util import get_env_variables
from os.path import join
aligned_folder = get_env_variables('ALIGNMENTS')
triplet_folder = get_env_variables('TRIPLETS')
# Aligned source to translation
src2hyp_split_fo = join(aligned_folder, 'source2translations_sent_split')
# Aligned source to reference
src2ref_split_fo = join(aligned_folder, 'source2reference_sent_split')
dst_path = join(triplet_folder, 'aligned_triplets_sent_split')
# Only '*.jsonl' entries are alignment files: anything else in the folder
# (hidden files, checkpoints) would break the later fn.split('-').
# Strip the suffix from the end only, and sort because os.listdir returns
# entries in arbitrary order.
suffix = '.jsonl'
filenames = sorted(
    entry[:-len(suffix)]
    for entry in os.listdir(src2hyp_split_fo)
    if entry.endswith(suffix)
)
len(filenames)

480

In [18]:
fn2align_cnt_sent_split = {}
cases = ['ep-gpt', 'ep-deepl', 'flores-gpt',
         'flores-deepl', 'opus-gpt', 'opus-deepl']
case2align_cnts_sent_split = {case: [] for case in cases}

for fn in filenames:
    dataset, translator, s, t = fn.split('-')
    # Source -> translation alignment for this translator.
    src_sents_a, mt_sents_a = load_aligned_sents_from_file(
        fn, folder=src2hyp_split_fo)
    # Source -> reference alignment; its file name carries no translator part.
    src_sents_o, ref_sents_o = load_aligned_sents_from_file(
        f'{dataset}-{s}-{t}', folder=src2ref_split_fo)
    align_cnt, discarded = post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=ref_sents_o,
        mt_sents_ali=mt_sents_a,
        folder_path=dst_path,
        filename=fn)

    fn2align_cnt_sent_split[fn] = align_cnt
    case2align_cnts_sent_split[f'{dataset}-{translator}'].append(align_cnt)

# Per dataset/translator combination: min, max and mean triplet count.
for case, counts in case2align_cnts_sent_split.items():
    print(
        case,
        f'min: {min(counts)}',
        f'max: {max(counts)}',
        f'mean: {sum(counts) / len(counts):.2f}',
        '',
        sep='\n',
    )

ep-gpt
min: 329
max: 396
mean: 374.74

ep-deepl
min: 328
max: 395
mean: 377.38

flores-gpt
min: 390
max: 428
mean: 417.58

flores-deepl
min: 390
max: 428
mean: 417.23

opus-gpt
min: 361
max: 399
mean: 382.95

opus-deepl
min: 361
max: 398
mean: 383.85



In [19]:
from collections import defaultdict
# Histogram: triplet count -> number of files that ended up with that count.
cnt_freq = defaultdict(int)
for align_cnt in fn2align_cnt_sent_split.values():
    cnt_freq[align_cnt] += 1

for align_cnt, n_files in sorted(cnt_freq.items()):
    print(align_cnt, n_files)

328 1
329 1
333 2
334 1
335 1
336 1
338 3
339 5
342 2
343 2
344 2
345 1
347 1
348 2
349 1
351 2
356 1
357 1
360 2
361 4
362 2
363 1
364 2
365 1
366 6
367 2
368 3
369 4
370 3
371 1
372 5
373 2
374 5
375 5
376 4
377 11
378 14
379 7
380 7
381 7
382 7
383 12
384 13
385 4
386 18
387 8
388 12
389 11
390 16
391 13
392 6
393 4
394 7
395 3
396 3
397 3
398 4
399 2
400 2
401 1
403 4
404 1
405 3
406 4
407 1
408 2
409 2
410 1
411 6
412 7
413 8
414 4
415 5
416 3
417 6
418 10
419 13
420 14
421 20
422 13
423 30
424 20
425 10
426 8
427 5
428 3


In [20]:
import json
# Persist the per-file counts; a later cell reloads them from this file.
with open('aligned_cnt_sent_split.json', 'w') as out_file:
    json.dump(fn2align_cnt_sent_split, out_file, indent=4)

## Scoring
* For all three triplet formations (no alignment, alignment without sentence splitting, and alignment with sentence splitting), we compute BLEU scores.

In [21]:
import os
import shutil

# Start from an empty temporary results folder.
results_folder = get_env_variables('TMP_RESULTS')
# Never run a recursive delete on an unset/empty path.
if not results_folder:
    raise ValueError('TMP_RESULTS is not set; refusing to delete an unspecified folder')
# Pure-Python replacement for `rm -rf` + `mkdir`: no shell quoting issues with
# spaces or special characters in the path, and it works on every platform.
shutil.rmtree(results_folder, ignore_errors=True)
os.makedirs(results_folder, exist_ok=True)

* Compute BLEU scores to show the differences between the alignment approaches; these are not final results.

In [22]:
from scripts.scoring import ResultProducer
from scripts.util import get_env_variables
import os
from os.path import join

triplet_folder = get_env_variables('TRIPLETS')
results_folder = get_env_variables('TMP_RESULTS')

# Result label -> folder holding the triplet files that are scored under it.
result_types = {'direct_results':join(triplet_folder, 'direct_triplets'), 
                'aligned_results_no_sent_split': join(triplet_folder, 'aligned_triplets_no_sent_split'),
                'aligned_results_sent_split': join(triplet_folder, 'aligned_triplets_sent_split')}

suffix = '.jsonl'
for rt, triplet_dir in result_types.items():
    # Score only '*.jsonl' triplet files; stray entries (hidden files,
    # checkpoints) are skipped. Sorted so the label order does not depend on
    # the arbitrary order returned by os.listdir.
    l2f = {
        f[:-len(suffix)]: join(triplet_dir, f)
        for f in sorted(os.listdir(triplet_dir))
        if f.endswith(suffix)
    }
    output_path = join(results_folder, f'{rt}.csv')
    rp = ResultProducer(label2files=l2f)
    rp.compute_and_store_results(output_path)

In [23]:
from scripts.presentation import parse_results_from_file
# Load the three result CSVs written by the scoring cell, in a fixed order.
direct_df, aligned_df_no_sent_split, aligned_df_sent_split = (
    parse_results_from_file(join(results_folder, f'{result_type}.csv'))
    for result_type in (
        'direct_results',
        'aligned_results_no_sent_split',
        'aligned_results_sent_split',
    )
)

## Difference between Alignment WITH and WITHOUT Sentence Splitting

In [24]:
import json
import pandas as pd
def _read_json(path):
    """Return the parsed content of the JSON file at ``path``."""
    with open(path, 'r') as fh:
        return json.load(fh)


# Reload the per-file triplet counts stored by the three sections above.
fn2align_cnt_no_sent_split = _read_json('aligned_cnt_no_sent_split.json')
fn2align_cnt_sent_split = _read_json('aligned_cnt_sent_split.json')
fn2align_cnt_direct = _read_json('direct_cnt.json')

In [25]:
fn_cnts = {
    'direct':fn2align_cnt_direct, 
    'sent_split':fn2align_cnt_sent_split,
    'no_sent_split':fn2align_cnt_no_sent_split
}


def _counts_to_frame(fn2cnt):
    """Turn a {'<dataset>-<translator>-<src>-<tgt>': count} dict into a DataFrame."""
    columns = ['dataset', 'translator', 'src_lang', 'tgt_lang', 'align_cnt']
    records = []
    for fn, cnt in fn2cnt.items():
        dataset, translator, src_lang, tgt_lang = fn.split('-')
        records.append({
            'dataset': dataset,
            'translator': translator,
            'src_lang': src_lang,
            'tgt_lang': tgt_lang,
            'align_cnt': cnt,
        })
    return pd.DataFrame(records, columns=columns)


# One count frame per triplet formation, keyed like fn_cnts.
dfs = {approach: _counts_to_frame(fn2cnt) for approach, fn2cnt in fn_cnts.items()}

In [26]:
import pandas as pd
key_cols = ['dataset', 'translator', 'src_lang', 'tgt_lang']
# BLEU_x comes from the no-sentence-split results, BLEU_y from the sentence-split ones.
merged = pd.merge(aligned_df_no_sent_split, aligned_df_sent_split, on=key_cols)
# Attach the triplet counts of both aligned variants (the direct one is not needed here).
for approach, cnt_df in dfs.items():
    if approach != 'direct':
        renamed = cnt_df.rename(columns={'align_cnt': f'{approach}-align_cnt'})
        merged = pd.merge(merged, renamed, on=key_cols)
merged['BLEU_DIFF'] = merged['BLEU_x'] - merged['BLEU_y']

In [27]:
# Most negative differences first, original row labels kept.
sorted_df = merged.sort_values('BLEU_DIFF')
shown_cols = ['BLEU_DIFF', 'no_sent_split-align_cnt', 'sent_split-align_cnt']
sorted_df.head(30)[shown_cols]

Unnamed: 0,BLEU_DIFF,no_sent_split-align_cnt,sent_split-align_cnt
452,-3.396974,400,362
85,-2.51144,397,345
80,-2.482981,397,343
474,-2.446469,400,377
454,-2.430071,395,374
194,-2.413389,397,338
199,-2.409091,395,333
472,-2.404507,400,364
190,-2.350724,397,343
459,-2.226007,398,381


* Negative Differences mean that Alignment WITH Sentence Splitting yielded higher BLEU scores than WITHOUT
* However, the alignment counts suggest that this may not be caused by the alignment itself but by the fact that the sentence-split variant is scored on fewer sentences overall.
* Additionally, the highest difference we observe is -3.4, not too tragic.

## Difference between Direct and Alignment WITHOUT Sentence Splitting

In [28]:
import json
# Every translation run was given 400 sentences; a run whose logged number of
# output lines differs is treated as a misalignment suspect.
EXPECTED_LINES = 400

with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# prefix ('<dataset>-<translator>-<src>-<tgt>') -> logged number of output lines
suspects = {
    prefix: info['log']['out_lines']
    for prefix, info in prefix2file.items()
    if info['log']['out_lines'] != EXPECTED_LINES
}
len(suspects)

23

* We have 23 suspects that we assume to be misaligned
* We mark these 23

In [29]:
import pandas as pd
key_cols = ['dataset', 'translator', 'src_lang', 'tgt_lang']
# BLEU_x comes from the direct results, BLEU_y from the no-sentence-split alignment.
merged = pd.merge(direct_df, aligned_df_no_sent_split, on=key_cols)
# Attach the triplet counts of both variants (the sentence-split one is not needed here).
for approach, cnt_df in dfs.items():
    if approach != 'sent_split':
        renamed = cnt_df.rename(columns={'align_cnt': f'{approach}-align_cnt'})
        merged = pd.merge(merged, renamed, on=key_cols)
merged['BLEU_DIFF'] = merged['BLEU_x'] - merged['BLEU_y']

In [30]:
# Flag every row whose (dataset, translator, src_lang, tgt_lang) matches a suspect label.
merged['suspect'] = False
key_cols = ['dataset', 'translator', 'src_lang', 'tgt_lang']
for label in suspects:
    dataset, translator, s, t = label.split('-')
    # Row-wise comparison of the four key columns against the label parts.
    is_match = (merged[key_cols] == [dataset, translator, s, t]).all(axis=1)
    merged.loc[is_match, 'suspect'] = True

In [31]:
# Most negative differences first, renumbered from 0.
sorted_df = merged.sort_values('BLEU_DIFF').reset_index(drop=True)
shown_cols = ['BLEU_DIFF', 'direct-align_cnt', 'no_sent_split-align_cnt', 'suspect']
sorted_df.head(30)[shown_cols]

Unnamed: 0,BLEU_DIFF,direct-align_cnt,no_sent_split-align_cnt,suspect
0,-37.587794,399,400,True
1,-34.075059,399,397,True
2,-32.72526,399,400,True
3,-31.679778,399,400,True
4,-29.89395,399,398,True
5,-25.13161,398,394,True
6,-24.583561,399,400,True
7,-24.301205,372,372,True
8,-22.202973,399,400,True
9,-20.890602,399,400,True


* Biggest differences occur only for the ones that we marked, the ones that we suspect to have low BLEU scores due to misalignment!

In [32]:
# Same ranking restricted to the suspect rows, with their identifying columns.
sus = merged.loc[merged['suspect'] == True]
s = sus.sort_values('BLEU_DIFF').reset_index(drop=True)
shown_cols = ['BLEU_DIFF', 'direct-align_cnt', 'no_sent_split-align_cnt',
              'dataset', 'translator', 'src_lang', 'tgt_lang']
s.head(30)[shown_cols]

Unnamed: 0,BLEU_DIFF,direct-align_cnt,no_sent_split-align_cnt,dataset,translator,src_lang,tgt_lang
0,-37.587794,399,400,opus,gpt,pt,en
1,-34.075059,399,397,ep,gpt,es,en
2,-32.72526,399,400,flores,gpt,it,fr
3,-31.679778,399,400,flores,gpt,fi,pt
4,-29.89395,399,398,flores,gpt,fi,da
5,-25.13161,398,394,ep,gpt,de,en
6,-24.583561,399,400,flores,gpt,fi,nl
7,-24.301205,372,372,opus,gpt,de,en
8,-22.202973,399,400,flores,gpt,fi,es
9,-20.890602,399,400,flores,gpt,fi,el


* If we compare alignment WITHOUT sentence splitting to direct triplets (i.e., no additional alignment), we observe that large differences occur mainly for the files we already marked as suspects.

## Consider LaBSE
* LaBSE alignments were computed in this notebook: [Alignments_No_Sent_Split_LaBSE.ipynb](https://colab.research.google.com/drive/1ieADAugVQ2nVq0Sqr9a299rsTjs9eMsB?usp=sharing)
* LaBSE is computationally more expensive and a bit trickier to use, even with Google Colab; it occasionally runs into memory issues. However, since we used paraphrase-multilingual-MiniLM-L12-v2 for everything, we can now compare it against LaBSE on only the sentences that we definitely plan to align and see whether it makes a big difference.

In [33]:
from scripts.post_process import post_triplet_align, load_aligned_sents_from_file
import os
from scripts.util import get_env_variables
from os.path import join
aligned_folder = get_env_variables('ALIGNMENTS')
triplet_folder = get_env_variables('TRIPLETS')
# Source-to-translation alignments (no sentence splitting) computed with LaBSE.
src2hyp_split_fo = join(aligned_folder, 'source2translations_no_sent_split_LaBSE')
dst_path = join(triplet_folder, 'aligned_triplets_no_sent_split_LaBSE')
# Only '*.jsonl' entries are alignment files: anything else in the folder
# (hidden files, checkpoints) would break the later fn.split('-').
# Strip the suffix from the end only, and sort because os.listdir returns
# entries in arbitrary order.
suffix = '.jsonl'
filenames = sorted(
    entry[:-len(suffix)]
    for entry in os.listdir(src2hyp_split_fo)
    if entry.endswith(suffix)
)
len(filenames)

23

In [34]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager

# Data manager per corpus, selected through the dataset prefix of a file name.
dms = {
    'ep': EuroParlManager(),
    'flores': FloresPlusManager(),
    'opus': Opus100Manager()
}

fn2align_cnt_no_sent_split_LaBSE = {}
# NOTE: the following containers are created but not filled in this cell.
fn2discard_no_sent_split_LaBSE = {}
cases = ['ep-gpt', 'ep-deepl', 'flores-gpt',
         'flores-deepl', 'opus-gpt', 'opus-deepl']
case2align_cnts_no_sent_split_LaBSE = {case: [] for case in cases}

for fn in filenames:
    dataset, translator, s, t = fn.split('-')
    # LaBSE-based source/translation alignment.
    src_sents_a, mt_sents_a = load_aligned_sents_from_file(
        fn, folder=src2hyp_split_fo)
    # Original source/reference sentences as delivered by the dataset.
    src_sents_o, ref_sents_o = dms[dataset].get_sentence_pairs(
        s, t, num_of_sents=400)
    align_cnt, discarded = post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=ref_sents_o,
        mt_sents_ali=mt_sents_a,
        folder_path=dst_path,
        filename=fn)

    fn2align_cnt_no_sent_split_LaBSE[fn] = align_cnt


# Report the files whose triplet count differs between the LaBSE run and the
# earlier no-sentence-split run: name, LaBSE count, earlier count.
for fn, labse_cnt in fn2align_cnt_no_sent_split_LaBSE.items():
    paraphrase_cnt = fn2align_cnt_no_sent_split[fn]
    if labse_cnt != paraphrase_cnt:
        print(fn, labse_cnt, paraphrase_cnt)

ep-gpt-de-fi 396 394
ep-gpt-el-nl 396 395
ep-gpt-sv-fi 393 394
flores-gpt-fi-da 400 398


* We observe only few disagreements in terms of alignment count between LaBSE and paraphrase-multilingual-MiniLM-L12-v2

In [37]:
from scripts.post_process import load_aligned_sents_from_file
from scripts.scoring import compute_bleu
from scripts.util import get_env_variables
import pandas as pd
triplet_folder = get_env_variables('TRIPLETS')
paraphrase = join(triplet_folder, 'aligned_triplets_no_sent_split')
laBSE = join(triplet_folder, 'aligned_triplets_no_sent_split_LaBSE')

# One row per suspect file: BLEU on the triplets from either embedding model
# and whether the two scores differ at all.
columns = ['Label', 'BLEU Paraphrase', 'BLEU LaBSE', 'Difference']
rows = []
for fn in sorted(suspects):
    ref_sents_p, mt_sents_p = load_aligned_sents_from_file(
        fn, folder=paraphrase, src_label='ref', tgt_label='mt')
    ref_sents_l, mt_sents_l = load_aligned_sents_from_file(
        fn, folder=laBSE, src_label='ref', tgt_label='mt')
    bleu_p = compute_bleu(ref_sents_p, mt_sents_p)
    bleu_l = compute_bleu(ref_sents_l, mt_sents_l)
    rows.append({
        'Label': fn,
        'BLEU Paraphrase': bleu_p,
        'BLEU LaBSE': bleu_l,
        'Difference': bleu_l != bleu_p,
    })

df = pd.DataFrame(rows, columns=columns)
#print(df.to_latex(index=False, float_format="%.3f"))
df

Unnamed: 0,Label,BLEU Paraphrase,BLEU LaBSE,Difference
0,ep-gpt-da-it,24.79466,24.79466,False
1,ep-gpt-de-en,32.615184,32.615184,False
2,ep-gpt-de-fi,20.382731,20.356405,True
3,ep-gpt-el-it,25.871847,25.871847,False
4,ep-gpt-el-nl,25.018855,25.031535,True
5,ep-gpt-en-fi,18.9836,18.9836,False
6,ep-gpt-es-el,30.406654,30.406654,False
7,ep-gpt-es-en,36.493097,36.493097,False
8,ep-gpt-es-nl,27.348489,27.348489,False
9,ep-gpt-fi-it,22.02456,22.02456,False


In [36]:
# Number of files whose BLEU score differs between the two embedding models.
int((df['Difference'] == True).sum())

6

* 6 out of 23 alignments yielded slightly different BLEU scores.
* 4 out of 6 alignments were likely different because LaBSE aligned differently
* **OVERALL CONCLUSION**: Using paraphrase-multilingual-MiniLM-L12-v2 for alignment instead of LaBSE should not be an issue. 

## Final Triplets
* After doing these various alignment experiments, we can finally create our final triplets that we use for evaluation. 

In [1]:
import json
from os.path import join
# Every translation run was given 400 sentences; a run whose logged number of
# output lines differs is treated as a misalignment suspect.
EXPECTED_LINES = 400

with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# prefix ('<dataset>-<translator>-<src>-<tgt>') -> logged number of output lines
suspects = {
    prefix: info['log']['out_lines']
    for prefix, info in prefix2file.items()
    if info['log']['out_lines'] != EXPECTED_LINES
}
len(suspects)

23

In [2]:
from scripts.util import get_env_variables
from os.path import join
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
from scripts.post_process import direct_triplet_align, load_sents_from_file, post_triplet_align, load_aligned_sents_from_file

triplet_folder = get_env_variables('TRIPLETS')
aligned_folder = get_env_variables('ALIGNMENTS')
dst_path = join(triplet_folder, 'final_triplets')

# File name -> number of triplets written, for direct and aligned files alike.
fn2align_cnt = {}

# Part 1: direct triplets for every file that is NOT a suspect.
parts = {
    'opus': {'dm': Opus100Manager(), 'pairs': Opus100Manager.get_pairs()},
    'ep': {'dm': EuroParlManager(), 'pairs': EuroParlManager.get_pairs()},
    'flores': {'dm': FloresPlusManager(), 'pairs': FloresPlusManager.get_pairs()}
}
translators = ['gpt', 'deepl']
for dataset, content in parts.items():
    dm, pairs = content['dm'], content['pairs']
    for s, t in pairs:
        for translator in translators:
            filename = f'{dataset}-{translator}-{s}-{t}'
            if filename not in suspects:
                mt_sents = load_sents_from_file(
                    folder='translations', filename=filename)
                src_sents, tgt_sents = dm.get_sentence_pairs(
                    s, t, num_of_sents=400)
                fn2align_cnt[filename] = direct_triplet_align(
                    mt_sents=mt_sents,
                    src_sents=src_sents,
                    ref_sents=tgt_sents,
                    folder_path=dst_path,
                    filename=filename,
                )

# Part 2: suspects get their triplets from the alignment computed without
# sentence splitting instead.
src2hyp_no_split_fo = join(aligned_folder, 'source2translations_no_sent_split')
for fn in suspects:
    dataset, translator, s, t = fn.split('-')
    src_sents_o, ref_sents_o = parts[dataset]['dm'].get_sentence_pairs(
        s, t, num_of_sents=400)
    src_sents_a, mt_sents_a = load_aligned_sents_from_file(
        fn, folder=src2hyp_no_split_fo)
    align_cnt, discarded = post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=ref_sents_o,
        mt_sents_ali=mt_sents_a,
        folder_path=dst_path,
        filename=fn)
    fn2align_cnt[fn] = align_cnt

In [5]:
from collections import Counter
# Histogram of the final triplet counts: count -> number of files.
counts = Counter(fn2align_cnt.values())
for align_cnt in sorted(counts):
    print(align_cnt, counts[align_cnt])


372 1
394 4
395 5
396 2
397 3
398 1
399 1
400 463
