# Evaluation
* Here we compute matrices of BLEU scores, first without any alignment, then with alignment. 
* `Preparation.ipynb` was run beforehand to create the files in the `translations` folder

## Direct Evaluation
* Compute Scores immediately after translation
* Assumption: Alignment was preserved by the translator, so we only need to create triplets.

In [1]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager

# Sanity check: do the FLORES+ and EuroParl managers report the same set of pairs?
set1, set2 = (set(manager.get_pairs()) for manager in (FloresPlusManager, EuroParlManager))
set1 == set2

True

In [2]:
from scripts.data_management import EuroParlManager, FloresPlusManager, Opus100Manager

# Dataset prefix -> its data manager instance ('dm') and the pairs it reports ('pairs').
parts = {
    name: {'dm': manager_cls(), 'pairs': manager_cls.get_pairs()}
    for name, manager_cls in (
        ('opus', Opus100Manager),
        ('ep', EuroParlManager),
        ('flores', FloresPlusManager),
    )
}

# Translator prefixes used in the translation file names.
translators = ['gpt', 'deepl']

In [3]:
from scripts.post_process import direct_triplet_align, load_sents_from_file
from os.path import join

# For every dataset / pair / translator combination, load the machine
# translation together with the source and reference sentences and hand all
# three to `direct_triplet_align`, which is given 'direct_triplets' as its
# output folder.
for dataset, content in parts.items():
    dm = content['dm']
    for s, t in content['pairs']:
        for translator in translators:
            prefix = f'{dataset}-{translator}-'
            mt_sents = load_sents_from_file(
                folder='translations',
                filename=f'{prefix}{s}-{t}',
            )
            # 400 is the number of sentences requested per pair.
            src_sents, tgt_sents = dm.get_sentence_pairs(s, t, num_of_sents=400)
            direct_triplet_align(
                mt_sents=mt_sents,
                src_sents=src_sents,
                ref_sents=tgt_sents,
                src_lang=s,
                ref_lang=t,
                folder_path='direct_triplets',
                prefix=prefix,
            )

In [4]:
from scripts.scoring import ResultProducer
import os
from os.path import join

files = os.listdir('direct_triplets')
os.makedirs('direct_results', exist_ok=True)

# One result file per dataset/translator combination.
result_setup = [
    f'{dataset_name}-{translator_name}'
    for dataset_name in ('ep', 'flores', 'opus')
    for translator_name in ('gpt', 'deepl')
]

for rs in result_setup:
    # Map each label (file name without the `rs-` prefix and `.jsonl` suffix)
    # to the path of its triplet file.
    l2f = {}
    for f in files:
        if not f.startswith(rs):
            continue
        label = f.replace(f'{rs}-', '').replace('.jsonl', '')
        l2f[label] = join('direct_triplets', f)
    rp = ResultProducer(label2files=l2f)
    rp.compute_results()
    rp.store_results(join('direct_results', f'{rs}.csv'))

In [29]:
import os
from os.path import join  # imported here so the cell does not rely on an earlier cell
from scripts.scoring import create_matrix_from_csv

# os.listdir returns entries in arbitrary order and may contain entries that
# are not result files (e.g. `.ipynb_checkpoints`), so keep only the CSV
# files and sort them for a deterministic iteration/print order.
res_files = sorted(f for f in os.listdir('direct_results') if f.endswith('.csv'))

# "<dataset>-<translator>" -> {'file': path of the CSV, 'df': score matrix}
res2df = {f.replace('.csv', ''):{'file':join('direct_results', f), 'df':None} for f in res_files}

for rs, content in res2df.items():
    file_path = content['file']
    df = create_matrix_from_csv(file_path)
    res2df[rs]['df'] = df

In [30]:
# Print every direct-evaluation score matrix, rounded to one decimal place,
# with a blank line after each one.
for key in res2df:
    out = res2df[key]['df'].round(1)
    print(key, out, '', sep='\n')

ep-deepl
      da    de    el    en    es    fi    fr    it    nl    pt    sv
da   NaN  34.1  28.3  40.9  37.1  25.5  35.7  27.6  30.9  31.0  31.2
de  35.0   NaN  26.4  37.7  36.1  24.6  37.8  27.2  28.3  31.0  30.0
el  34.4  30.0   NaN  39.0  38.2  23.6  37.3  28.7  28.5  34.3  29.3
en  37.5  32.5  30.8   NaN  41.1  24.4  38.5  29.2  32.2  34.2  32.3
es  37.1  32.9  30.2  43.3   NaN  24.9  39.9  30.3  30.3  35.8  30.5
fi  32.6  28.7  25.1  35.5  32.0   NaN  32.5  24.3  27.0  27.9  26.5
fr  32.9  30.3  27.8  38.3  37.7  23.0   NaN  28.8  29.0  33.2  28.5
it  29.1  26.5  25.5  33.2  34.7  19.8  33.3   NaN  26.3  30.0  24.2
nl  28.7  25.3  22.1  31.2  29.1  19.5  29.8  23.7   NaN  26.4  23.6
pt  32.7  30.7  28.7  37.2  39.1  23.2  39.1  28.8  28.7   NaN  27.9
sv  36.1  30.6  27.7  39.9  35.0  22.4  34.8  27.4  29.3  31.2   NaN

ep-gpt
      da    de    el    en    es    fi    fr    it    nl    pt    sv
da   NaN  34.1  27.4  34.3  33.5  21.6  31.9  19.9  26.5  26.2  29.3
de  34.5   NaN  2

* Now we can check whether the pairs we assumed to be misaligned (translations whose output does not contain exactly 400 lines) have very low BLEU scores


In [40]:
from os.path import join
import json

# Translation logs, keyed by "<dataset>-<translator>-<src>-<tgt>".
with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# "<dataset>-<translator>" -> list of ("<src>-<tgt>", direct score) for every
# translation whose output did not contain exactly 400 lines.
mismatches = {
    'ep-gpt':[],
    'flores-gpt':[],
    'opus-gpt':[],
}

for prefix, info in prefix2file.items():
    dataset, translator, s, t = prefix.split('-')
    key = f'{dataset}-{translator}'
    outlines = info['log']['out_lines']
    if outlines!=400:
        score = res2df[key]['df'].loc[s, t]
        # setdefault instead of plain indexing: a mismatch from a
        # dataset/translator combination that is not pre-seeded above
        # (e.g. a DeepL run) is then reported instead of raising KeyError.
        mismatches.setdefault(key, []).append((f'{s}-{t}', score))

# Report each group with the highest direct score first.
for key in mismatches:
    print(key)
    for item in sorted(mismatches[key], key=lambda x: x[1], reverse=True):
        label, score = item
        print(f'{label}: {score:.2f}')
    print()

ep-gpt
da-it: 19.91
es-el: 15.40
es-nl: 15.05
it-fi: 13.18
el-nl: 12.86
el-it: 11.54
de-en: 7.48
fi-it: 7.24
fi-nl: 6.73
sv-fi: 6.63
en-fi: 5.88
pt-fi: 5.85
de-fi: 4.77
es-en: 2.41

flores-gpt
fi-es: 0.65
it-fr: 0.62
fi-nl: 0.60
fi-pt: 0.58
fi-da: 0.29
es-el: 0.23
fi-el: 0.22

opus-gpt
de-en: 10.65
pt-en: 0.11



In [15]:
from os.path import join
import json

with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# Print the direct score of every 'flores-gpt' translation whose logged
# output line count differs from 400.
for prefix, info in prefix2file.items():
    if not prefix.startswith('flores-gpt'):
        continue
    outlines = info['log']['out_lines']
    if outlines == 400:
        continue
    fields = prefix.split('-')
    src_lang, tgt_lang = fields[2], fields[3]
    score = res2df['flores-gpt']['df'].loc[src_lang, tgt_lang]
    print(f'{score:.2f}')

0.23
0.29
0.22
0.65
0.60
0.58
0.62


In [16]:
from os.path import join
import json

with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# Print the direct score of every 'opus-gpt' translation whose logged
# output line count differs from 400.
for prefix, info in prefix2file.items():
    if not prefix.startswith('opus-gpt'):
        continue
    outlines = info['log']['out_lines']
    if outlines == 400:
        continue
    fields = prefix.split('-')
    src_lang, tgt_lang = fields[2], fields[3]
    score = res2df['opus-gpt']['df'].loc[src_lang, tgt_lang]
    print(f'{score:.2f}')

10.65
0.11


In [17]:
# EuroParl score matrices of both translators, plus flattened 1-D copies of
# their values for an element-wise comparison.
check1, check2 = (res2df[f'ep-{name}']['df'] for name in ('gpt', 'deepl'))
check1_flat, check2_flat = check1.values.flatten(), check2.values.flatten()

In [18]:
# Score at row 'da' / column 'el' in each of the two matrices.
tuple(frame.loc['da', 'el'] for frame in (check1, check2))

(27.403131020989917, 28.320273995031453)

In [19]:
# `flatten()` lays the matrix out row by row in a 1-D array, so index 2 is
# the first row / third column, i.e. the ('da', 'el') cell.
tuple(flat[2] for flat in (check1_flat, check2_flat))

(27.403131020989917, 28.320273995031453)

In [20]:
import numpy as np
from scipy.stats import pearsonr

# Deliberately NOT named `parts`: that name is bound to the dict of data
# managers used by the triplet-creation cell, and rebinding it to a list
# would break that cell on a re-run.
datasets = ['ep', 'flores', 'opus']

# Correlate the GPT and DeepL score matrices of each dataset entry by entry.
for part in datasets:
    gpt = f'{part}-gpt'
    deepl = f'{part}-deepl'
    gpt_df = res2df[gpt]['df']
    deepl_df = res2df[deepl]['df']

    gpt_flat = gpt_df.values.flatten()
    deepl_flat = deepl_df.values.flatten()

    # Keep only positions where both matrices hold a score (the diagonal is NaN).
    mask = ~np.isnan(gpt_flat) & ~np.isnan(deepl_flat)

    # Compute Pearson correlation
    corr, pval = pearsonr(gpt_flat[mask], deepl_flat[mask])
    # Label the output so the three results can be told apart.
    print(part)
    print(f"Pearson correlation: {corr:.2f}")
    print(f"p-value: {pval:.3f}")
    print()

Pearson correlation: 0.61
p-value: 0.000
Pearson correlation: 0.80
p-value: 0.000
Pearson correlation: 0.30
p-value: 0.206


## Post-Processed Evaluation
* Alignment was conducted on Google Colab using `bertalign`
* The notebook can be found [here](https://colab.research.google.com/drive/1xlwQPctsOGjZB2NpB9WNtzWPae_Oj4gt?usp=sharing)

In [5]:

from scripts.post_process import post_triplet_align, load_aligned_sents_from_file
import os
from os.path import join

# Folders holding the source->translation and source->reference alignments.
src2hyp_fo = 'source2translations'
src2ref_fo = 'source2reference'

# File stems of the form "<dataset>-<translator>-<src>-<tgt>".
# os.listdir returns entries in arbitrary order and may include entries that
# are not alignment files (e.g. `.ipynb_checkpoints`), which would break the
# four-way split of the stem later on; keep only `.jsonl` files and sort them.
files = sorted(
    f.replace('.jsonl', '') for f in os.listdir(src2hyp_fo) if f.endswith('.jsonl')
)


In [6]:
# For each source->translation alignment file, load the matching
# source->reference alignment (same dataset and pair, no translator in its
# name) and pass both to `post_triplet_align`, which is given 'post_triplets'
# as its output folder.
for fi in files:
    dataset, translator, s, t = fi.split('-')
    src_sents_a, mt_sents_a = load_aligned_sents_from_file(fi, folder=src2hyp_fo)
    src_sents_o, ref_sents_o = load_aligned_sents_from_file(
        f'{dataset}-{s}-{t}', folder=src2ref_fo
    )
    post_triplet_align(
        src_sents_org=src_sents_o,
        src_sents_ali=src_sents_a,
        ref_sents_org=ref_sents_o,
        mt_sents_ali=mt_sents_a,
        src_lang=s,
        ref_lang=t,
        folder_path='post_triplets',
        prefix=f'{dataset}-{translator}-',
    )

384 sents aligned for da and de
385 sents aligned for da and el
384 sents aligned for da and en
383 sents aligned for da and es
387 sents aligned for da and fi
384 sents aligned for da and fr
363 sents aligned for da and it
384 sents aligned for da and nl
382 sents aligned for da and pt
386 sents aligned for da and sv
372 sents aligned for de and da
366 sents aligned for de and el
377 sents aligned for de and en
369 sents aligned for de and es
379 sents aligned for de and fi
378 sents aligned for de and fr
361 sents aligned for de and it
372 sents aligned for de and nl
369 sents aligned for de and pt
370 sents aligned for de and sv
394 sents aligned for el and da
391 sents aligned for el and de
392 sents aligned for el and en
391 sents aligned for el and es
395 sents aligned for el and fi
391 sents aligned for el and fr
376 sents aligned for el and it
387 sents aligned for el and nl
387 sents aligned for el and pt
389 sents aligned for el and sv
382 sents aligned for en and da
386 sent

In [7]:
from scripts.scoring import ResultProducer
import os
from os.path import join

files = os.listdir('post_triplets')
os.makedirs('post_results', exist_ok=True)

# One result file per dataset/translator combination.
result_setup = ['ep-gpt', 'ep-deepl', 'flores-gpt',
                'flores-deepl', 'opus-gpt', 'opus-deepl']

for rs in result_setup:
    # Triplet files belonging to this combination, keyed by the file name
    # stripped of the `rs-` prefix and the `.jsonl` suffix.
    matching = [f for f in files if f.startswith(rs)]
    l2f = {
        f.replace(f'{rs}-', '').replace('.jsonl', ''): join('post_triplets', f)
        for f in matching
    }
    rp = ResultProducer(label2files=l2f)
    rp.compute_results()
    rp.store_results(join('post_results', f'{rs}.csv'))

In [21]:
import os
from os.path import join  # imported here so the cell does not rely on an earlier cell
from scripts.scoring import create_matrix_from_csv

# os.listdir returns entries in arbitrary order and may contain entries that
# are not result files (e.g. `.ipynb_checkpoints`), so keep only the CSV
# files and sort them for a deterministic iteration/print order.
res_files = sorted(f for f in os.listdir('post_results') if f.endswith('.csv'))

# "<dataset>-<translator>" -> {'file': path of the CSV, 'df': score matrix}
post_res2df = {f.replace('.csv', ''): {'file': join(
    'post_results', f), 'df': None} for f in res_files}

for rs, content in post_res2df.items():
    file_path = content['file']
    df = create_matrix_from_csv(file_path)
    post_res2df[rs]['df'] = df

In [22]:
# Print every post-alignment score matrix, rounded to one decimal place,
# with a blank line after each one.
for key in post_res2df:
    out = post_res2df[key]['df'].round(1)
    print(key, out, '', sep='\n')

ep-deepl
      da    de    el    en    es    fi    fr    it    nl    pt    sv
da   NaN  34.1  28.3  41.0  36.2  25.8  36.0  27.9  31.1  31.2  31.5
de  35.3   NaN  26.9  37.8  36.4  24.7  38.1  27.5  28.5  31.1  30.4
el  34.3  30.2   NaN  39.3  38.4  23.6  37.7  28.8  28.8  34.5  29.5
en  37.5  33.0  31.3   NaN  41.0  24.7  38.8  29.0  32.4  34.6  32.9
es  36.8  33.0  30.4  43.3   NaN  25.0  39.7  30.4  30.2  35.8  30.8
fi  32.6  28.9  25.4  35.8  31.7   NaN  32.6  24.8  27.3  28.0  27.0
fr  32.7  30.1  27.6  38.4  37.5  22.4   NaN  28.5  28.8  33.3  28.4
it  29.4  27.3  26.2  33.9  35.4  20.0  34.1   NaN  26.4  30.8  24.7
nl  31.1  27.2  23.4  33.4  31.1  21.9  31.7  25.2   NaN  28.4  25.6
pt  32.7  30.6  28.7  37.2  38.9  23.4  39.2  29.1  29.1   NaN  28.0
sv  36.1  31.0  27.9  40.1  35.0  22.3  34.9  27.6  29.4  31.3   NaN

ep-gpt
      da    de    el    en    es    fi    fr    it    nl    pt    sv
da   NaN  34.1  27.1  34.6  33.2  21.9  31.9  25.1  26.9  26.4  29.7
de  34.8   NaN  2

In [24]:
import numpy as np
from scipy.stats import pearsonr

# Deliberately NOT named `parts`: that name is bound to the dict of data
# managers used by the triplet-creation cell, and rebinding it to a list
# would break that cell on a re-run.
datasets = ['ep', 'flores', 'opus']

# Correlate the post-alignment GPT and DeepL score matrices of each dataset.
for part in datasets:
    gpt = f'{part}-gpt'
    deepl = f'{part}-deepl'
    gpt_df = post_res2df[gpt]['df']
    deepl_df = post_res2df[deepl]['df']

    gpt_flat = gpt_df.values.flatten()
    deepl_flat = deepl_df.values.flatten()

    # Keep only positions where both matrices hold a score (the diagonal is NaN).
    mask = ~np.isnan(gpt_flat) & ~np.isnan(deepl_flat)

    # Compute Pearson correlation
    corr, pval = pearsonr(gpt_flat[mask], deepl_flat[mask])
    print(part)
    print(f"Pearson correlation: {corr:.2f}")
    print(f"p-value: {pval:.3f}")
    print()

ep
Pearson correlation: 0.93
p-value: 0.000

flores
Pearson correlation: 0.96
p-value: 0.000

opus
Pearson correlation: 0.96
p-value: 0.000



In [28]:
import numpy as np

# Per dataset/translator: post-alignment score minus direct score, entry by
# entry, summarised by NaN-ignoring mean / max / min.
for key, content in post_res2df.items():
    out = content['df'] - res2df[key]['df']
    delta = out.values
    print(key)
    for label, reducer in (('mean', np.nanmean), ('max', np.nanmax), ('min', np.nanmin)):
        print(label, f'{reducer(delta):.2f}')
    print()

ep-deepl
mean 0.32
max 2.43
min -0.91

ep-gpt
mean 2.15
max 34.19
min -0.56

flores-deepl
mean 0.09
max 0.74
min -0.23

flores-gpt
mean 1.74
max 32.75
min -0.29

opus-deepl
mean 1.52
max 3.40
min 0.23

opus-gpt
mean 4.45
max 38.80
min 0.23



* We can check whether the scores of the misaligned pairs improved after the alignment

In [42]:
from os.path import join
import json

# Translation logs, keyed by "<dataset>-<translator>-<src>-<tgt>".
with open(join('translations', 'info.json'), 'r') as f:
    prefix2file = json.load(f)

# "<dataset>-<translator>" -> list of ("<src>-<tgt>", direct score,
# post-alignment score) for every translation whose output did not contain
# exactly 400 lines.
mismatches = {
    'ep-gpt': [],
    'flores-gpt': [],
    'opus-gpt': [],
}

for prefix, info in prefix2file.items():
    dataset, translator, s, t = prefix.split('-')
    key = f'{dataset}-{translator}'
    outlines = info['log']['out_lines']
    if outlines != 400:
        score = res2df[key]['df'].loc[s, t]
        improved = post_res2df[key]['df'].loc[s, t]
        # setdefault instead of plain indexing: a mismatch from a
        # dataset/translator combination that is not pre-seeded above
        # (e.g. a DeepL run) is then reported instead of raising KeyError.
        mismatches.setdefault(key, []).append((f'{s}-{t}', score, improved))

# Report each group with the highest direct score first.
for key in mismatches:
    print(key)
    for item in sorted(mismatches[key], key=lambda x: x[1], reverse=True):
        label, score, improved = item
        print(f'{label}: {score:.2f} -> {improved:.2f}')
    print()

ep-gpt
da-it: 19.91 -> 25.13
es-el: 15.40 -> 30.55
es-nl: 15.05 -> 27.40
it-fi: 13.18 -> 15.49
el-nl: 12.86 -> 25.28
el-it: 11.54 -> 26.05
de-en: 7.48 -> 32.61
fi-it: 7.24 -> 22.30
fi-nl: 6.73 -> 22.44
sv-fi: 6.63 -> 19.18
en-fi: 5.88 -> 19.21
pt-fi: 5.85 -> 18.76
de-fi: 4.77 -> 20.43
es-en: 2.41 -> 36.59

flores-gpt
fi-es: 0.65 -> 23.09
it-fr: 0.62 -> 33.37
fi-nl: 0.60 -> 25.39
fi-pt: 0.58 -> 32.68
fi-da: 0.29 -> 30.45
es-el: 0.23 -> 19.87
fi-el: 0.22 -> 21.36

opus-gpt
de-en: 10.65 -> 36.60
pt-en: 0.11 -> 38.91

