# Evaluation
* Here we compute matrices of BLEU scores, first without any alignment, then with alignment. 
* `Preparation.ipynb` must be run beforehand; it creates the files in the `translations` folder.

In [1]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
# Sanity check: FloresPlusManager and EuroParlManager should report the same
# set of pairs; the comparison result is the cell output.
set1, set2 = (
    set(manager.get_pairs())
    for manager in (FloresPlusManager, EuroParlManager)
)
set1 == set2

True

In [2]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager
# Per dataset key: a data manager instance ('dm') and the pairs it reports ('pairs').
parts = {
    dataset_key: {'dm': manager_cls(), 'pairs': manager_cls.get_pairs()}
    for dataset_key, manager_cls in (
        ('opus', Opus100Manager),
        ('ep', EuroParlManager),
        ('flores', FloresPlusManager),
    )
}

# Translator keys as used in the translation file names.
translators = ['gpt', 'deepl']

In [3]:
from scripts.post_process import direct_triplet_align, load_sents_from_file
from os.path import join

# For every dataset, pair and translator: load the stored machine translations
# together with 400 source/reference sentences and pass them to
# direct_triplet_align with 'direct_triplets' as its folder.
# NOTE(review): presumably one triplet file is written per call - confirm in
# scripts.post_process.
for dataset, content in parts.items():
    dm, pairs = content['dm'], content['pairs']
    for s, t in pairs:
        for translator in translators:
            prefix = f'{dataset}-{translator}-'
            mt_sents = load_sents_from_file(
                folder='translations',
                filename=f'{prefix}{s}-{t}',
            )
            src_sents, tgt_sents = dm.get_sentence_pairs(s, t, num_of_sents=400)
            direct_triplet_align(
                mt_sents=mt_sents,
                src_sents=src_sents,
                ref_sents=tgt_sents,
                src_lang=s,
                ref_lang=t,
                folder_path='direct_triplets',
                prefix=prefix,
            )

In [4]:
from scripts.scoring import ResultProducer
import os
from os.path import join

# os.listdir returns entries in arbitrary order; sort so repeated runs feed
# the files to ResultProducer in the same order.
files = sorted(os.listdir('direct_triplets'))
os.makedirs('direct_results', exist_ok=True)

# One result CSV is produced per '<dataset>-<translator>' combination.
result_setup = ['ep-gpt', 'ep-deepl', 'flores-gpt', 'flores-deepl', 'opus-gpt', 'opus-deepl']

for rs in result_setup:
    # Match on the full '<rs>-' prefix (the same string that is stripped for
    # the label) so a setup name can never pick up files of another setup
    # that merely starts with the same characters.
    prefix = f'{rs}-'
    l2f = {}
    for triplet_file in files:
        if not triplet_file.startswith(prefix):
            continue
        # Strip the prefix and the extension only at the ends of the name,
        # never in the middle as str.replace would.
        label = triplet_file[len(prefix):]
        if label.endswith('.jsonl'):
            label = label[:-len('.jsonl')]
        l2f[label] = join('direct_triplets', triplet_file)

    rp = ResultProducer(label2files=l2f)
    rp.compute_results()
    rp.store_results(join('direct_results', f'{rs}.csv'))

In [5]:
import os
# `join` is used below; import it here so this cell does not depend on an
# earlier cell having imported it.
from os.path import join
from scripts.scoring import create_matrix_from_csv

# Map every result file name (without '.csv') to its path and the matrix
# that create_matrix_from_csv builds from it.
res_files = os.listdir('direct_results')
res2df = {}
for res_file in res_files:
    csv_path = join('direct_results', res_file)
    res2df[res_file.replace('.csv', '')] = {
        'file': csv_path,
        'df': create_matrix_from_csv(csv_path),
    }

In [6]:
# ep-deepl score matrix, rounded to one decimal for display.
res2df['ep-deepl']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.1,28.3,40.9,37.1,25.5,35.7,27.6,30.9,31.0,31.2
de,35.0,,26.4,37.7,36.1,24.6,37.8,27.2,28.3,31.0,30.0
el,34.4,30.0,,39.0,38.2,23.6,37.3,28.7,28.5,34.3,29.3
en,37.5,32.5,30.8,,41.1,24.4,38.5,29.2,32.2,34.2,32.3
es,37.1,32.9,30.2,43.3,,24.9,39.9,30.3,30.3,35.8,30.5
fi,32.6,28.7,25.1,35.5,32.0,,32.5,24.3,27.0,27.9,26.5
fr,32.9,30.3,27.8,38.3,37.7,23.0,,28.8,29.0,33.2,28.5
it,29.1,26.5,25.5,33.2,34.7,19.8,33.3,,26.3,30.0,24.2
nl,28.7,25.3,22.1,31.2,29.1,19.5,29.8,23.7,,26.4,23.6
pt,32.7,30.7,28.7,37.2,39.1,23.2,39.1,28.8,28.7,,27.9


In [7]:
# ep-gpt score matrix, rounded to one decimal for display.
res2df['ep-gpt']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.1,27.4,34.3,33.5,21.6,31.9,19.9,26.5,26.2,29.3
de,34.5,,23.4,7.5,34.4,4.8,31.3,24.3,25.3,27.4,27.3
el,32.5,27.9,,34.1,37.6,19.8,36.1,11.5,12.9,30.0,27.2
en,34.6,27.1,28.5,,36.7,5.9,33.2,27.1,27.9,23.9,27.7
es,36.4,32.4,15.4,2.4,,19.2,36.0,28.3,15.1,32.2,28.0
fi,29.0,26.4,22.9,31.8,29.8,,30.5,7.2,6.7,24.9,24.4
fr,33.0,28.9,28.1,33.9,37.5,18.7,,27.1,27.2,27.4,27.2
it,26.9,22.8,24.6,29.1,31.7,13.2,28.6,,23.2,25.9,21.7
nl,28.5,22.6,21.5,29.7,27.4,16.1,26.9,22.0,,22.9,22.6
pt,32.5,28.4,28.8,33.1,35.7,5.8,33.8,25.8,26.0,,24.4


* Now we check whether the pairs we assumed to be misaligned (those whose translation output does not contain exactly 400 lines) indeed have very low BLEU scores.


In [8]:
from os.path import join
import json
# Print the ep-gpt score of every pair whose translation log reports an
# output line count other than 400.
with open(join('translations', 'info.json'), 'r') as info_file:
    prefix2file = json.load(info_file)

for prefix, info in prefix2file.items():
    if not prefix.startswith('ep-gpt'):
        continue
    if info['log']['out_lines'] == 400:
        continue
    # Fields 2 and 3 of the dash-separated prefix are used as row and column labels.
    fields = prefix.split('-')
    src_lang, tgt_lang = fields[2], fields[3]
    print(f"{res2df['ep-gpt']['df'].loc[src_lang, tgt_lang]:.1f}")

7.5
5.9
2.4
19.9
4.8
11.5
12.9
15.4
15.1
7.2
6.7
5.8
6.6
13.2


In [9]:
# flores-deepl score matrix, rounded to one decimal for display.
res2df['flores-deepl']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,37.9,27.1,54.8,26.4,26.4,44.5,30.9,29.3,35.0,39.8
de,41.1,,25.4,49.2,24.9,25.9,41.5,30.7,28.6,33.3,37.3
el,34.6,31.6,,41.8,24.5,21.6,38.6,28.0,24.6,29.8,31.0
en,50.6,44.4,30.6,,28.7,29.8,52.4,34.8,32.4,42.3,47.1
es,30.4,27.6,20.5,35.8,,19.9,35.6,27.2,24.4,27.2,27.6
fi,33.2,30.7,22.0,38.2,21.4,,37.0,26.4,24.8,27.9,30.1
fr,38.4,34.6,25.4,49.2,26.1,24.9,,30.6,27.3,33.6,35.3
it,31.3,29.5,22.1,37.3,24.2,20.6,36.9,,23.9,27.8,29.1
nl,31.4,29.9,20.5,36.5,23.1,21.5,35.0,26.2,,27.5,28.2
pt,39.7,36.6,26.3,53.7,25.9,24.6,44.2,31.0,27.1,,37.1


In [10]:
# flores-gpt score matrix, rounded to one decimal for display.
res2df['flores-gpt']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,38.9,25.1,51.7,26.8,25.4,43.6,29.6,28.9,42.0,39.0
de,38.7,,24.6,48.8,26.1,25.6,40.5,29.1,28.9,38.3,36.0
el,34.8,32.4,,43.6,25.2,22.5,39.1,25.6,26.4,35.9,32.4
en,49.3,43.6,29.0,,29.2,29.4,51.9,32.7,30.9,51.4,46.4
es,28.6,28.3,0.2,36.0,,19.2,32.3,21.7,23.1,24.3,27.3
fi,0.3,30.0,0.2,39.7,0.7,,36.2,25.8,0.6,0.6,27.6
fr,36.7,34.9,24.7,49.7,26.6,24.4,,26.5,26.7,40.0,35.3
it,29.4,31.2,21.1,39.2,22.6,20.9,0.6,,23.4,29.2,28.7
nl,28.5,28.8,19.0,37.0,23.4,18.4,33.1,24.2,,30.5,25.8
pt,39.1,36.8,24.2,55.5,25.3,24.4,44.8,27.2,27.6,,36.8


In [11]:
from os.path import join
import json
# Print the flores-gpt score of every pair whose translation log reports an
# output line count other than 400.
with open(join('translations', 'info.json'), 'r') as info_file:
    prefix2file = json.load(info_file)

for prefix, info in prefix2file.items():
    if not prefix.startswith('flores-gpt'):
        continue
    if info['log']['out_lines'] == 400:
        continue
    # Fields 2 and 3 of the dash-separated prefix are used as row and column labels.
    fields = prefix.split('-')
    src_lang, tgt_lang = fields[2], fields[3]
    print(f"{res2df['flores-gpt']['df'].loc[src_lang, tgt_lang]:.2f}")

0.23
0.29
0.22
0.65
0.60
0.58
0.62


In [12]:
# opus-deepl score matrix, rounded to one decimal for display.
res2df['opus-deepl']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,,,40.5,,,,,,,
de,,,,36.3,,,,,,,
el,,,,34.7,,,,,,,
en,38.9,32.4,29.2,,40.0,25.1,40.3,34.2,32.1,34.2,34.0
es,,,,44.8,,,,,,,
fi,,,,32.6,,,,,,,
fr,,,,41.6,,,,,,,
it,,,,37.8,,,,,,,
nl,,,,33.9,,,,,,,
pt,,,,40.2,,,,,,,


In [13]:
# opus-gpt score matrix, rounded to one decimal for display.
res2df['opus-gpt']['df'].round(decimals=1)

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,,,37.5,,,,,,,
de,,,,10.7,,,,,,,
el,,,,33.2,,,,,,,
en,35.5,31.0,28.3,,39.3,20.2,36.0,32.3,30.3,29.6,30.5
es,,,,43.1,,,,,,,
fi,,,,31.0,,,,,,,
fr,,,,39.5,,,,,,,
it,,,,36.8,,,,,,,
nl,,,,31.0,,,,,,,
pt,,,,0.1,,,,,,,


In [14]:
from os.path import join
import json
# Print the opus-gpt score of every pair whose translation log reports an
# output line count other than 400.
with open(join('translations', 'info.json'), 'r') as info_file:
    prefix2file = json.load(info_file)

for prefix, info in prefix2file.items():
    if not prefix.startswith('opus-gpt'):
        continue
    if info['log']['out_lines'] == 400:
        continue
    # Fields 2 and 3 of the dash-separated prefix are used as row and column labels.
    fields = prefix.split('-')
    src_lang, tgt_lang = fields[2], fields[3]
    print(f"{res2df['opus-gpt']['df'].loc[src_lang, tgt_lang]:.2f}")

10.65
0.11


In [None]:
# Take the two EuroParl matrices (GPT first, DeepL second) and flatten each
# into a 1-D array for the positional comparison in the following cells.
check1, check2 = (res2df[key]['df'] for key in ('ep-gpt', 'ep-deepl'))
check1_flat, check2_flat = (
    matrix.to_numpy().flatten() for matrix in (check1, check2)
)

In [31]:
# Label-based lookup of the ('da', 'el') cell in both matrices.
tuple(matrix.loc['da', 'el'] for matrix in (check1, check2))

(27.403131020989917, 28.320273995031453)

In [None]:
# Position 2 of each flattened array; it should equal the .loc['da', 'el']
# lookup, showing that equal positions in both arrays refer to the same cell.
tuple(flat[2] for flat in (check1_flat, check2_flat))

(27.403131020989917, 28.320273995031453)

In [None]:
import numpy as np
from scipy.stats import pearsonr

# Use a separate name here: rebinding `parts` would overwrite the dict of
# data managers defined earlier in the notebook and break re-running the
# cells that iterate over `parts.items()`.
dataset_names = ['ep', 'flores', 'opus']

for part in dataset_names:
    gpt_df = res2df[f'{part}-gpt']['df']
    # Align the DeepL matrix to the GPT matrix's row/column labels so that
    # equal positions in the flattened arrays always refer to the same pair,
    # even if the two CSVs were read in a different order.
    deepl_df = res2df[f'{part}-deepl']['df'].reindex_like(gpt_df)

    gpt_flat = gpt_df.values.flatten()
    deepl_flat = deepl_df.values.flatten()

    # Keep only the cells that have a score in both matrices (NaN otherwise).
    mask = ~np.isnan(gpt_flat) & ~np.isnan(deepl_flat)

    # Compute Pearson correlation between the GPT and DeepL scores
    corr, pval = pearsonr(gpt_flat[mask], deepl_flat[mask])
    print(f"Pearson correlation: {corr:.2f}")
    print(f"p-value: {pval:.3f}")

Pearson correlation: 0.61
p-value: 0.000
Pearson correlation: 0.80
p-value: 0.000
Pearson correlation: 0.30
p-value: 0.206
