## Evaluation
* We just evaluate everything without worrying about alignment
* Make sure to run `Preparation.ipynb` before running this notebook.

In [1]:
# NOTE(review): shutil is not referenced in the cells visible here — confirm it is still needed.
import shutil
import os
# Scratch directory that the later cells write their result folders and CSV files into;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs('tmp_results', exist_ok=True)

In [2]:
from scripts.data_management import EuroParlManager, FloresPlusManager

# Dataset managers, keyed by the short dataset name that also appears in the
# '<dataset>-<translator>' result folder names.
dms = {}
dms['ep'] = EuroParlManager()
dms['flores'] = FloresPlusManager()

# Translator labels; each one is combined with every dataset above.
tls = ['gpt', 'deepl']

In [3]:
# Language pairs to evaluate; the alignment loop unpacks each one as (source, target).
all_pairs = EuroParlManager.get_pairs()


In [None]:
from scripts.post_process import direct_triplet_align, load_mt_sents
from os.path import join

# For every language pair, dataset and translator: take the MT output together
# with the source and reference sentences as-is (no alignment check) and hand
# them to direct_triplet_align with tmp_results/<dataset>-<translator> as the
# output folder.
for pair in all_pairs:
    s, t = pair
    for dataset, manager in dms.items():
        # NOTE(review): ('it', 'el') is excluded for the 'ep' dataset only; the
        # reason is not visible in this notebook — confirm and document it.
        skip_pair = dataset == 'ep' and pair == ('it', 'el')
        if skip_pair:
            continue
        for translator in tls:
            out_dir = join('tmp_results', f'{dataset}-{translator}')
            mt_sents = load_mt_sents(dataset, translator, s, t)
            # num_of_sents=400 presumably caps the number of sentence pairs — confirm.
            src_sents, tgt_sents = manager.get_sentence_pairs(s, t, num_of_sents=400)
            direct_triplet_align(
                src_sents=src_sents,
                ref_sents=tgt_sents,
                mt_sents=mt_sents,
                src_lang=s,
                ref_lang=t,
                folder_path=out_dir,
            )

In [7]:
from scripts.scoring import ResultProducer
# Imported here as well so this cell runs on a fresh kernel even when the
# (expensive) alignment cell above is skipped.
from os.path import join

# Score every '<dataset>-<translator>' folder under tmp_results and store the
# scores as tmp_results/<dataset>-<translator>.csv.
#
# Only directories are processed: this cell writes its CSV files into
# tmp_results itself, so on a re-run the listing also contains those files and
# calling os.listdir on one of them would raise NotADirectoryError.
aligned = [
    entry for entry in os.listdir('tmp_results')
    if os.path.isdir(join('tmp_results', entry))
]
for res in aligned:
    fp = join('tmp_results', res)
    files = os.listdir(fp)
    # Label = file name with '.jsonl' removed; value = path of that file.
    l2f = {f.replace('.jsonl', ''): join(fp, f) for f in files}
    rp = ResultProducer(label2files=l2f)
    rp.compute_results()
    rp.store_results(join('tmp_results', f'{res}.csv'))



In [5]:
from scripts.scoring import create_matrix_from_csv
from os.path import join

# Load the first-pass score tables of the four dataset/translator combinations
# as matrices (one variable per combination).
ep_gpt, ep_deepl, flores_gpt, flores_deepl = (
    create_matrix_from_csv(join('tmp_results', csv_name))
    for csv_name in (
        'ep-gpt.csv',
        'ep-deepl.csv',
        'flores-gpt.csv',
        'flores-deepl.csv',
    )
)

### Matrices
* Read each matrix from source to target (left to right): the row is the source language and the column is the target language.

In [6]:
ep_gpt

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.052207,27.403131,34.265758,33.450159,21.559752,31.853186,19.909345,26.493696,26.156095,29.329501
de,34.496495,,23.401336,7.483574,34.40574,4.767343,31.288819,24.302457,25.314173,27.390053,27.251319
el,32.528914,27.909468,,34.108317,37.581394,19.816198,36.058423,11.543177,12.862619,29.987619,27.174453
en,34.598924,27.088722,28.540416,,36.721217,5.877664,33.207951,27.096256,27.868022,23.898661,27.720709
es,36.416038,32.385556,15.403091,2.409827,,19.211941,35.952666,28.261688,15.050712,32.150601,28.047307
fi,28.984256,26.40335,22.939204,31.805305,29.777864,,30.513101,7.239586,6.729594,24.917295,24.41138
fr,33.007701,28.890542,28.057027,33.892693,37.496366,18.709931,,27.078451,27.220359,27.383797,27.204276
it,26.915048,22.830478,,29.144336,31.736969,13.175652,28.628564,,23.233386,25.929821,21.734194
nl,28.484463,22.562986,21.464066,29.693938,27.44581,16.068469,26.936332,22.018028,,22.855778,22.578141
pt,32.535884,28.399012,28.751309,33.078499,35.746317,5.846544,33.797273,25.771903,25.961316,,24.433565


In [7]:
ep_deepl

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.060691,28.320274,40.89269,37.117352,25.503815,35.695082,27.5673,30.864241,31.047434,31.234542
de,34.974381,,26.427654,37.683698,36.148667,24.58625,37.804653,27.186651,28.339887,30.970712,29.996975
el,34.401319,29.99263,,39.020138,38.186257,23.645183,37.261932,28.691005,28.528897,34.262699,29.280422
en,37.49733,32.547469,30.79929,,41.137497,24.369398,38.496851,29.162393,32.202903,34.195964,32.251568
es,37.065578,32.860527,30.20693,43.274294,,24.895837,39.912205,30.341695,30.271553,35.809157,30.512634
fi,32.623281,28.696167,25.109554,35.4883,32.0302,,32.506162,24.320832,26.955631,27.897197,26.504494
fr,32.871805,30.348471,27.817182,38.308544,37.703949,22.997572,,28.78898,29.003367,33.227476,28.527832
it,29.072912,26.528497,,33.204503,34.749799,19.833691,33.320046,,26.338769,29.982963,24.20684
nl,28.717135,25.253817,22.129751,31.239803,29.104525,19.464109,29.80866,23.729883,,26.387243,23.574642
pt,32.717717,30.738889,28.661167,37.206175,39.076059,23.158269,39.056324,28.814821,28.687616,,27.93815


In [8]:
flores_gpt

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,38.863723,25.144143,51.678229,26.76896,25.403951,43.613682,29.552197,28.913497,42.015165,38.96864
de,38.691221,,24.628724,48.811945,26.089909,25.572269,40.467246,29.130906,28.881781,38.349227,36.044331
el,34.773403,32.386091,,43.565306,25.183134,22.467562,39.093314,25.621634,26.434385,35.889013,32.381023
en,49.252663,43.614308,28.990484,,29.179662,29.36792,51.947871,32.661039,30.930165,51.370512,46.351554
es,28.640257,28.275379,0.22881,35.985613,,19.249954,32.255087,21.654174,23.112735,24.278693,27.263133
fi,0.293491,29.987002,0.217186,39.660204,0.650015,,36.203299,25.802701,0.599517,0.583201,27.622318
fr,36.655553,34.863355,24.748256,49.696842,26.63817,24.417397,,26.510702,26.660309,39.963644,35.290419
it,29.439101,31.180642,21.067091,39.206555,22.630972,20.852346,0.622156,,23.426438,29.244247,28.651399
nl,28.519843,28.786491,18.973567,36.991055,23.369889,18.381937,33.125362,24.152701,,30.540325,25.760948
pt,39.132374,36.787979,24.158032,55.545055,25.336856,24.432898,44.780088,27.240783,27.576622,,36.77419


In [9]:
flores_deepl

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,37.903019,27.065014,54.79953,26.358653,26.373987,44.505666,30.947992,29.349099,34.960044,39.809583
de,41.110841,,25.357988,49.230138,24.867585,25.864145,41.499288,30.679027,28.574291,33.325661,37.334801
el,34.583404,31.615394,,41.849369,24.529222,21.6342,38.631227,27.965881,24.619949,29.797411,31.002461
en,50.610947,44.393833,30.589381,,28.737538,29.775069,52.424001,34.797073,32.372457,42.252638,47.076802
es,30.376231,27.618031,20.542459,35.793684,,19.905015,35.569537,27.183703,24.40872,27.168136,27.571089
fi,33.174106,30.691418,21.968231,38.213989,21.449547,,36.97758,26.401138,24.790704,27.879875,30.11665
fr,38.416856,34.55113,25.433119,49.22722,26.064141,24.930215,,30.600824,27.320816,33.634741,35.322091
it,31.286225,29.542371,22.12738,37.348334,24.232551,20.625066,36.913005,,23.889913,27.824569,29.05266
nl,31.393276,29.895214,20.475395,36.490951,23.124973,21.546739,35.004903,26.1512,,27.540381,28.197053
pt,39.723654,36.616311,26.257181,53.701485,25.878094,24.627075,44.189374,31.001435,27.102937,,37.075581


## Evaluation After Alignment

In [10]:
from scripts.scoring import ResultProducer
from os.path import join
import os

# For each '<dataset>-<translator>' folder in tmp_results, map every label
# (file name with '.jsonl' removed) to the path of its file. The CSV score
# tables stored next to those folders are skipped.
aligned = [entry for entry in os.listdir('tmp_results') if not entry.endswith('.csv')]
dm2l2f = {
    res: {
        name.replace('.jsonl', ''): join('tmp_results', res, name)
        for name in os.listdir(join('tmp_results', res))
    }
    for res in aligned
}


In [11]:
# Overlay the files found in triplets/<dataset>-<translator> onto the mapping:
# a label that already exists now points at the triplets/ file, and labels that
# only exist there are added.
for key, label2file in dm2l2f.items():
    dm, tl = key.split('-')
    folder_path = join('triplets', f'{dm}-{tl}')
    files = os.listdir(folder_path)
    print(len(files))  # number of files found for this combination
    label2file.update(
        (f.replace('.jsonl', ''), join(folder_path, f)) for f in files
    )

110
109
110
110


In [12]:
# Score each combination again from the merged label -> file mapping and store
# the result as tmp_results/<dataset>-<translator>-new.csv, next to the
# first-pass CSV files.
for key, label2files in dm2l2f.items():
    producer = ResultProducer(label2files=label2files)
    producer.compute_results()
    producer.store_results(join('tmp_results', f'{key}-new.csv'))

In [13]:
from scripts.scoring import create_matrix_from_csv

# Load the '-new' score tables written by the previous cell as matrices; the
# `_a` suffix distinguishes them from the first-pass matrices loaded earlier.
ep_gpt_a, flores_gpt_a, ep_deepl_a, flores_deepl_a = (
    create_matrix_from_csv(join('tmp_results', csv_name))
    for csv_name in (
        'ep-gpt-new.csv',
        'flores-gpt-new.csv',
        'ep-deepl-new.csv',
        'flores-deepl-new.csv',
    )
)

In [14]:
ep_gpt_a

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.370946,27.226428,34.769287,33.360553,22.103023,32.065179,25.182416,27.095604,26.384279,29.764738
de,34.923509,,23.498233,32.922238,35.042658,20.678602,31.271454,24.960307,25.802126,27.208893,28.182273
el,32.673476,28.348979,,34.431766,37.856485,20.006031,36.533975,26.296305,25.520287,30.196619,27.444643
en,34.791041,27.711227,28.89436,,37.077636,19.360128,33.598053,26.785969,28.40758,24.308233,28.484771
es,36.53351,32.717835,30.587555,36.749479,,19.564808,35.973488,28.559097,27.624149,32.345052,28.335849
fi,29.136995,26.714491,22.760685,31.593031,29.93274,,30.381735,22.433048,22.691877,24.684268,24.671074
fr,33.08689,28.849903,27.856996,34.09553,37.663445,18.445387,,26.906858,27.174163,27.389646,27.324933
it,27.187949,23.597261,,29.513785,32.465649,15.750369,29.235352,,23.679009,26.652081,22.332657
nl,30.984333,24.054102,23.026105,32.032896,30.069953,17.905032,29.111434,23.610502,,24.346593,25.0503
pt,32.879224,28.582109,28.723902,33.341236,35.602202,19.012961,34.095221,26.337313,26.594406,,24.71303


In [15]:
flores_gpt_a

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,38.723192,25.380582,51.612687,26.931556,25.479791,43.836579,29.616165,28.882634,42.124911,38.970147
de,38.936773,,24.967368,48.963555,26.409032,26.008855,41.030546,29.324952,29.177627,38.222157,36.382156
el,34.650895,32.342131,,43.519187,25.424947,22.464683,39.091221,25.638073,26.520976,35.7517,32.487665
en,49.187001,43.616338,28.944465,,29.203995,29.35585,51.769676,32.590283,30.851282,51.19318,46.292829
es,28.791149,28.489824,19.872813,36.168866,,19.340911,32.433983,21.702213,23.289135,24.359257,27.349275
fi,30.449397,30.080374,21.36115,40.222993,23.086827,,36.602554,25.805762,25.389318,32.681139,27.77927
fr,36.805764,35.072333,24.959154,49.781756,26.747322,24.455008,,26.487884,26.670731,40.099356,35.354419
it,29.421292,30.98788,21.065624,39.551184,22.665812,20.858643,33.369091,,23.531327,29.237774,28.481378
nl,28.3024,28.497524,19.155999,37.252077,23.534409,18.833439,33.370438,23.940471,,30.863328,25.530058
pt,39.339296,37.002434,24.283239,55.84542,25.456757,24.562808,45.060996,27.357773,27.757053,,36.912072


In [16]:
ep_deepl_a

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,34.385631,28.314379,41.2681,36.482526,26.042656,36.324905,27.991856,31.391773,31.458661,31.727313
de,35.585945,,26.912516,38.105844,36.685753,24.916505,38.480937,27.553335,28.802054,31.424861,31.063943
el,34.456317,30.346813,,39.348581,38.50428,23.818339,37.842123,28.818518,29.003258,34.737197,29.862416
en,37.738315,33.252523,31.293972,,41.240245,24.928956,39.115435,29.105844,32.6988,34.755117,33.038183
es,36.922592,33.205005,30.42012,43.328981,,25.207217,39.905221,30.379758,30.444467,36.00094,30.917954
fi,32.832669,29.069888,25.447235,35.930704,31.843459,,32.840629,24.855503,27.54621,28.274695,27.194317
fr,32.911754,30.340119,27.644802,38.625616,37.737605,22.769264,,28.542673,29.138755,33.646352,28.640318
it,29.588321,27.548499,26.210405,34.031305,35.645623,20.254806,34.32476,,26.648269,31.018331,24.952409
nl,31.435696,27.251185,23.438733,33.574663,31.266511,22.191759,31.979191,25.254464,,28.62628,25.957736
pt,32.886646,30.805776,28.666556,37.426591,39.090143,23.784557,39.430663,29.213892,29.362042,,28.219705


In [17]:
flores_deepl_a

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,37.672796,27.218862,54.7125,26.535082,26.346475,44.845832,31.007058,29.498587,35.056749,39.712876
de,41.238043,,25.507188,49.683546,24.971794,26.252606,42.101425,30.991275,28.79298,33.279544,37.767803
el,34.524259,31.604821,,41.833046,24.65254,21.787515,38.718568,27.954014,24.730214,29.847014,31.218151
en,50.469445,44.282054,30.547784,,28.829424,29.584911,52.293859,34.78297,32.307941,42.181492,47.023633
es,30.453018,27.728002,20.595654,35.938867,,20.055799,35.655291,27.239731,24.674447,27.288681,27.600085
fi,33.223503,30.836166,22.213639,38.951042,21.501887,,37.221518,26.619688,25.090463,28.389285,30.194499
fr,38.627784,34.659331,25.401132,49.325426,26.238782,25.02737,,30.564119,27.449797,33.755092,35.391611
it,31.313418,29.332941,22.191447,37.550398,24.164235,20.6957,36.780228,,23.995671,27.986739,28.903252
nl,31.264938,29.69164,20.400856,36.5445,23.025289,21.955547,35.10101,26.212417,,27.941179,28.18743
pt,39.889873,36.704264,26.423464,54.01387,25.982424,24.695195,44.31388,31.181137,27.339607,,37.336295


## Alignment Differences

In [28]:
# Element-wise difference for Flores/DeepL: first-pass scores minus the '-new'
# scores from the "Evaluation After Alignment" section, so a negative entry
# means the score is higher in the '-new' evaluation.
diff_flores_deepl = flores_deepl - flores_deepl_a
diff_flores_deepl

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,0.230223,-0.153848,0.08703,-0.17643,0.027512,-0.340165,-0.059067,-0.149488,-0.096705,0.096707
de,-0.127202,,-0.1492,-0.453408,-0.104209,-0.388461,-0.602138,-0.312248,-0.218689,0.046117,-0.433002
el,0.059145,0.010573,,0.016323,-0.123319,-0.153315,-0.087341,0.011867,-0.110266,-0.049603,-0.21569
en,0.141502,0.111778,0.041597,,-0.091886,0.190157,0.130142,0.014103,0.064516,0.071145,0.053168
es,-0.076787,-0.109971,-0.053195,-0.145184,,-0.150784,-0.085755,-0.056028,-0.265728,-0.120545,-0.028996
fi,-0.049397,-0.144748,-0.245408,-0.737053,-0.05234,,-0.243938,-0.218549,-0.299759,-0.50941,-0.077849
fr,-0.210928,-0.108201,0.031987,-0.098206,-0.174641,-0.097155,,0.036706,-0.128981,-0.12035,-0.06952
it,-0.027193,0.20943,-0.064067,-0.202064,0.068316,-0.070634,0.132777,,-0.105758,-0.16217,0.149408
nl,0.128337,0.203574,0.074539,-0.053549,0.099684,-0.408807,-0.096106,-0.061218,,-0.400799,0.009623
pt,-0.166219,-0.087953,-0.166283,-0.312384,-0.10433,-0.06812,-0.124507,-0.179702,-0.236669,,-0.260714


In [29]:
# Element-wise difference for EuroParl/DeepL: first-pass scores minus the '-new'
# scores from the "Evaluation After Alignment" section, so a negative entry
# means the score is higher in the '-new' evaluation.
diff_ep_deepl = ep_deepl - ep_deepl_a
diff_ep_deepl

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,-0.324939,0.005895,-0.37541,0.634825,-0.538841,-0.629824,-0.424557,-0.527532,-0.411227,-0.492771
de,-0.611563,,-0.484862,-0.422146,-0.537086,-0.330256,-0.676283,-0.366684,-0.462167,-0.454149,-1.066968
el,-0.054998,-0.354183,,-0.328443,-0.318023,-0.173156,-0.580191,-0.127513,-0.474362,-0.474498,-0.581994
en,-0.240986,-0.705054,-0.494683,,-0.102749,-0.559558,-0.618584,0.056548,-0.495897,-0.559153,-0.786615
es,0.142985,-0.344478,-0.21319,-0.054688,,-0.31138,0.006985,-0.038063,-0.172914,-0.191783,-0.40532
fi,-0.209388,-0.373721,-0.337681,-0.442405,0.186741,,-0.334467,-0.534671,-0.590578,-0.377499,-0.689824
fr,-0.039949,0.008352,0.17238,-0.317072,-0.033656,0.228308,,0.246307,-0.135388,-0.418876,-0.112487
it,-0.515409,-1.020002,,-0.826802,-0.895824,-0.421115,-1.004714,,-0.3095,-1.035368,-0.74557
nl,-2.718561,-1.997368,-1.308983,-2.33486,-2.161987,-2.72765,-2.170531,-1.524581,,-2.239036,-2.383094
pt,-0.168929,-0.066887,-0.005389,-0.220416,-0.014084,-0.626287,-0.374339,-0.399072,-0.674426,,-0.281555


In [31]:
# Element-wise difference for EuroParl/GPT: first-pass scores minus the '-new'
# scores from the "Evaluation After Alignment" section, so a negative entry
# means the score is higher in the '-new' evaluation.
diff_ep_gpt = ep_gpt - ep_gpt_a
diff_ep_gpt

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,-0.318739,0.176703,-0.503529,0.089606,-0.543271,-0.211994,-5.27307,-0.601907,-0.228184,-0.435237
de,-0.427013,,-0.096897,-25.438664,-0.636917,-15.911259,0.017364,-0.65785,-0.487954,0.181161,-0.930954
el,-0.144562,-0.43951,,-0.323449,-0.275091,-0.189833,-0.475552,-14.753128,-12.657668,-0.209,-0.27019
en,-0.192118,-0.622505,-0.353944,,-0.356419,-13.482465,-0.390102,0.310287,-0.539558,-0.409572,-0.764062
es,-0.117473,-0.332279,-15.184464,-34.339652,,-0.352868,-0.020823,-0.297409,-12.573437,-0.194451,-0.288541
fi,-0.152739,-0.31114,0.178518,0.212274,-0.154876,,0.131366,-15.193462,-15.962283,0.233026,-0.259694
fr,-0.079189,0.04064,0.200031,-0.202837,-0.167079,0.264544,,0.171593,0.046196,-0.005849,-0.120657
it,-0.272901,-0.766783,,-0.369449,-0.728679,-2.574717,-0.606789,,-0.445623,-0.72226,-0.598463
nl,-2.49987,-1.491116,-1.562039,-2.338958,-2.624143,-1.836563,-2.175102,-1.592474,,-1.490815,-2.472158
pt,-0.34334,-0.183096,0.027407,-0.262737,0.144115,-13.166417,-0.297948,-0.56541,-0.63309,,-0.279465


In [32]:
# Element-wise difference for Flores/GPT: first-pass scores minus the '-new'
# scores from the "Evaluation After Alignment" section, so a negative entry
# means the score is higher in the '-new' evaluation.
diff_flores_gpt = flores_gpt - flores_gpt_a
diff_flores_gpt

Unnamed: 0,da,de,el,en,es,fi,fr,it,nl,pt,sv
da,,0.140531,-0.236439,0.065542,-0.162596,-0.07584,-0.222897,-0.063968,0.030863,-0.109746,-0.001507
de,-0.245552,,-0.338644,-0.15161,-0.319123,-0.436586,-0.5633,-0.194047,-0.295846,0.12707,-0.337825
el,0.122508,0.04396,,0.046119,-0.241813,0.002879,0.002092,-0.016439,-0.086591,0.137313,-0.106641
en,0.065663,-0.002031,0.046019,,-0.024333,0.012071,0.178196,0.070756,0.078883,0.177332,0.058725
es,-0.150892,-0.214445,-19.644003,-0.183253,,-0.090957,-0.178896,-0.048038,-0.1764,-0.080563,-0.086143
fi,-30.155907,-0.093372,-21.143964,-0.562789,-22.436813,,-0.399255,-0.003061,-24.789801,-32.097938,-0.156951
fr,-0.150211,-0.208978,-0.210899,-0.084914,-0.109152,-0.037611,,0.022818,-0.010421,-0.135712,-0.064
it,0.017809,0.192763,0.001467,-0.344629,-0.034839,-0.006297,-32.746935,,-0.104889,0.006473,0.17002
nl,0.217443,0.288967,-0.182432,-0.261022,-0.16452,-0.451502,-0.245076,0.21223,,-0.323003,0.23089
pt,-0.206922,-0.214456,-0.125207,-0.300365,-0.119901,-0.12991,-0.280908,-0.11699,-0.180431,,-0.137881


In [41]:
import numpy as np

# Summarise each difference matrix with NaN-aware mean / max / min; the NaN
# entries (e.g. the diagonal) are ignored by the nan* reductions.
diffs_by_label = {
    'deepl flores': diff_flores_deepl,
    'deepl ep': diff_ep_deepl,
    'gpt flores': diff_flores_gpt,
    'gpt ep': diff_ep_gpt,
}
summary_stats = (('mean', np.nanmean), ('max', np.nanmax), ('min', np.nanmin))
for label, diff in diffs_by_label.items():
    scores = diff.values
    print(label)
    for stat_name, reduce_fn in summary_stats:
        print(stat_name, reduce_fn(scores))
    print()

deepl flores
mean -0.0873563640619304
max 0.23022281107075315
min -0.7370528587885303

deepl ep
mean -0.5140479007117399
max 0.63482537045941
min -2.7276499527703955

gpt flores
mean -1.7426001696407432
max 0.2889666961300641
min -32.746934942929435

gpt ep
mean -2.3029207767512103
max 0.5002829725664384
min -34.3396518063101



* Alignment, as expected, has a strong impact on the GPT4.1 scores, since GPT4.1 output was more likely to be misaligned.
* Less impact on DeepL scores but still notable.