In [50]:
import pandas as pd
from bayes_opt import BayesianOptimization
from scipy import stats


In [63]:
from prepare_scores import run_sentence_bleu, run_sentence_chrf

In [65]:
# Load the evaluation data in this cell: the scoring calls below need `df`,
# which is otherwise only created by a cell further down the notebook, so a
# fresh "Restart Kernel -> Run All" would stop here with a NameError.
# Same file and separator as the later load cell.
df = pd.read_csv("output/wmt-mqm-human-evaluation.csv", sep="|")

# Score every MT output against its reference (presumably one sentence-level
# score per row -- confirm against prepare_scores).
bleu = run_sentence_bleu(df['mt'], df['ref'])
chrf = run_sentence_chrf(df['mt'], df['ref'])

Running BLEU...: 100%|██████████| 150343/150343 [00:22<00:00, 6642.71it/s] 
Running chrF...: 100%|██████████| 150343/150343 [00:35<00:00, 4245.62it/s] 


In [79]:
# Persist each metric in its own CSV, keyed by the row id, so it can be
# loaded later alongside the other metric score files.
df_bleu = pd.DataFrame(dict(id=df['id'], bleu=bleu))
df_bleu.to_csv('output/wmt-mqm-human-evaluation_with_bleu.csv', index=False)

df_chrf = pd.DataFrame(dict(id=df['id'], chrf=chrf))
df_chrf.to_csv('output/wmt-mqm-human-evaluation_with_chrf.csv', index=False)

In [80]:
# Human-annotated evaluation set (pipe-separated).
file_orig = "output/wmt-mqm-human-evaluation.csv"

# One entry per metric score file: (filename, separator, lower_bound, upper_bound).
# The bounds give the nominal range of the metric; they are used to rescale
# every score column with (score - lower_bound) / (upper_bound - lower_bound).
scores = [
    ("output/wmt-mqm-human-evaluation_with_yisi.csv", ",", 0, 1),
    ("output/wmt-mqm-human-evaluation-gemba.csv", "|", -100, 0),
    ("output/wmt-mqm-human-evaluation_with_bleu.csv", ",", 0, 100),
    ("output/wmt-mqm-human-evaluation_with_chrf.csv", ",", 0, 100),
]



In [81]:
# Read the human evaluation set (pipe-delimited) and preview the first rows.
df = pd.read_csv(file_orig, sep="|")
df.head(n=5)

Unnamed: 0,lp,src,mt,ref,score,system,annotators,domain,year,id
0,en-de,Michael Jackson wore tape on his nose to get f...,"Michael Jackson trug Klebeband auf der Nase, u...",Ehemaliger Bodyguard berichtet: Michael Jackso...,-2.333333,eTranslation.737,3,news,2020,0
1,en-de,Michael Jackson's former bodyguard has claimed...,Michael Jacksons ehemaliger Bodyguard behaupte...,Der ehemalige Bodyguard von Michael Jackson be...,-3.333333,eTranslation.737,3,news,2020,1
2,en-de,"Matt Fiddes, now a property developer and owne...","Matt Fiddes, heute Immobilienentwickler und Be...","Matt Fiddes, jetzt ein Bauträger und Inhaber e...",-2.666667,eTranslation.737,3,news,2020,2
3,en-de,"To get front pages, he would reportedly don su...","Um Titelseiten zu bekommen, soll er chirurgisc...","Um auf Titelseiten zu gelangen, trug er einen ...",-4.333333,eTranslation.737,3,news,2020,3
4,en-de,We'll tell you what's true. You can form your ...,"Wir sagen Ihnen, was wahr ist. Sie können sich...",Wir sagen Ihnen die Fakten. Sie können sich da...,-0.0,eTranslation.737,3,news,2020,4


In [82]:
# Summary statistics of the human scores (count, mean, spread, quartiles).
df["score"].describe()

count    150343.000000
mean          5.930588
std          28.831939
min        -400.000000
25%          -4.833333
50%          -0.666667
75%          -0.000000
max         100.000000
Name: score, dtype: float64

In [83]:
# Human judgements: the reference the combined metric is correlated against.
human_scores = df["score"]

In [84]:
# Collect every metric column from the configured score files, rescaled with
# the file's declared bounds: (score - lower_bound) / (upper_bound - lower_bound).
metric_scores = {}
for scorefile, sep, lower_bound, upper_bound in scores:
    assert upper_bound > lower_bound
    value_range = upper_bound - lower_bound
    df_metric = pd.read_csv(scorefile, sep=sep)
    for col in df_metric.columns:
        # 'id' is the row key, not a metric; every other column is a metric.
        if col != 'id':
            metric_scores[col] = (df_metric[col] - lower_bound) / value_range

In [85]:
# Metric names in the order they were loaded (dict insertion order).
metrics_names = list(metric_scores)

In [90]:

print("metrics names:", metrics_names)

# Bounded region of parameter space: one weight per metric, each in [0, 1].
pbounds = {metric_name: (0, 1) for metric_name in metrics_names}

def black_box_function(**metric_weights):
    """Objective for the weight search: rank agreement of a weighted metric mix.

    Builds the weighted sum of the normalised metric columns in the
    module-level ``metric_scores`` and measures its Kendall tau correlation
    with the module-level ``human_scores``. A quadratic penalty
    ``(1 - sum(weights)) ** 2`` is subtracted, so weights whose sum is far
    from 1 score worse.

    Args:
        **metric_weights: one float weight per metric, keyed by the metric
            name used in ``metric_scores``.

    Returns:
        ``0`` when the weights sum to exactly zero; otherwise the Kendall tau
        correlation minus the penalty. The correlation may be NaN if SciPy
        cannot compute it (e.g. constant input).

    Raises:
        KeyError: if a metric in ``metric_scores`` has no weight.
    """
    print("metric_weights:", metric_weights)

    weight_total = sum(metric_weights.values())
    # All-zero weights give a constant combined score, for which the rank
    # correlation is undefined; bail out before doing any work.
    if weight_total == 0:
        return 0

    # Vectorised weighted sum. The previous per-row Python loop did one pandas
    # scalar lookup per (row, metric) pair, which dominated the runtime of
    # every optimiser step. Accumulation order matches the old loop
    # (0 + m1*w1 + m2*w2 + ...), so the floating-point result is unchanged.
    # Columns are taken positionally; with the default RangeIndex produced by
    # read_csv this is the same as the former label lookup by row number.
    n_rows = len(human_scores)
    final_metric_scores = 0
    for metric_name, metric_column in metric_scores.items():
        final_metric_scores = (
            final_metric_scores
            + metric_column.to_numpy()[:n_rows] * metric_weights[metric_name]
        )

    # calculate kendall
    kendall_score = stats.kendalltau(human_scores, final_metric_scores)
    return kendall_score.correlation - (1 - weight_total) ** 2

# Search the weight space defined by `pbounds` for the weights that maximise
# the objective; the fixed random_state is there to make runs repeatable.
optimizer = BayesianOptimization(black_box_function, pbounds, random_state=1)

# 2 initial points followed by 40 optimisation iterations.
optimizer.maximize(init_points=2, n_iter=40)

# Best result recorded by the optimiser.
print(optimizer.max)

metrics names: ['yisi', 'GEMBA_score', 'bleu', 'chrf']
|   iter    |  target   | GEMBA_... |   bleu    |   chrf    |   yisi    |
-------------------------------------------------------------------------
metric_weights: {'GEMBA_score': 0.417022004702574, 'bleu': 0.7203244934421581, 'chrf': 0.00011437481734488664, 'yisi': 0.30233257263183977}
| [39m1        [39m | [39m-0.0811  [39m | [39m0.417    [39m | [39m0.7203   [39m | [39m0.0001144[39m | [39m0.3023   [39m |
metric_weights: {'GEMBA_score': 0.14675589081711304, 'bleu': 0.0923385947687978, 'chrf': 0.1862602113776709, 'yisi': 0.34556072704304774}
| [35m2        [39m | [35m0.06576  [39m | [35m0.1468   [39m | [35m0.09234  [39m | [35m0.1863   [39m | [35m0.3456   [39m |
metric_weights: {'GEMBA_score': 0.135223176230175, 'bleu': 0.08755754162706758, 'chrf': 0.18999880685857423, 'yisi': 0.35636775324726094}
| [39m3        [39m | [39m0.06351  [39m | [39m0.1352   [39m | [39m0.08756  [39m | [39m0.19     [39m | 