# Required imports

In [1]:
import os
import json
import itertools
import matplotlib.pyplot as plt
import pandas as pd

from helper_functions import *
from opus_eng_fra_features import *

# Plot settings

In [2]:
# Single font size used for tick labels, axis labels and general text.
SIZE = 10

# Tick labels (both axes) and axis labels all share the same label size.
for rc_group in ('xtick', 'ytick', 'axes'):
    plt.rc(rc_group, labelsize=SIZE)
plt.rc('font', size=SIZE)
plt.rc('figure', figsize=(15, 11))


# Colour assigned to each evaluation metric name.
metrics_palette = {
    'comet22': "lightsteelblue",
    'comet22-qe': "royalblue",
    'chrf': "khaki",
    'sacrebleu': "goldenrod",
}

# Colour assigned to each "<scaler>, <predictor>" key
# (presumably a feature-scaling scheme paired with a predictor id -- confirm against the plotting code).
scalers_palette = {
    # no scaling
    'none, lin': "#11324D",
    'none, rf': "#6B7AA1",
    'none, xgb': "#A6DCEF",
    # standard scaling
    'standard, lin': "#3A4D39",
    'standard, rf': "#79AC78",
    'standard, xgb': "#D0E7D2",
    # min-max scaling
    'min-max, lin': "#7F669D",
    'min-max, rf': "#BA94D1",
    'min-max, xgb': "#DEBACE",
}

# Use the interactive widget (ipympl) matplotlib backend for figures in this notebook.
%matplotlib widget

# Load data

In [3]:
# Base directory: two levels above the current working directory.
BASE_DIR = os.getcwd() + "/../../"
# Directory with the model-evaluation CSV files.
MODEL_EVAL_DIR = f"{BASE_DIR}model_eval/"

In [4]:
# Name of the FID CSV inside MODEL_EVAL_DIR; the same name is reused (with a prefix) for the general FID.
fid_file = "fid-finetune_data-dataset_opus_eng_fra-timeInterval_10000-timeIntervalType_sentence-finetuneType_base.csv"
fid = pd.read_csv(filepath_or_buffer=f"{MODEL_EVAL_DIR}{fid_file}")

In [None]:
# Display the column labels of the loaded FID frame.
fid.columns

## Add delta target features

In [6]:
# One "<test_set>_<metric>" suffix per (test set, metric) pair, test sets varying slowest.
score_keys = [
    f"{test_set}_{metric}"
    for test_set in OPUS_TEST_SETS
    for metric in METRICS
]

# delta-target = target score minus current score, stored as a new column per suffix.
for key in score_keys:
    target_col = f"target_{key}"
    current_col = f"curr_{key}"
    fid[f"delta-target_{key}"] = fid[target_col] - fid[current_col]

## View data

In [None]:
# Show the rows whose prev_finetune is 10000 and whose curr_finetune is 20000.
is_prev_10000 = fid.prev_finetune == 10000
is_curr_20000 = fid.curr_finetune == 20000
fid.loc[is_prev_10000 & is_curr_20000]

### Look for NaN

In [None]:
# Print the number of missing values per column, one "<column>\t\t<count>" line each.
print("Feature name")
print("-" * 50)
nan_counts = fid.isna().sum()
for col, n_missing in nan_counts.items():
    print(f"{col}\t\t{n_missing}")

# Load/Create general FID

In [9]:
# The general FID is cached as a CSV next to the original FID, under the
# same file name prefixed with "general_".
general_fid_path = MODEL_EVAL_DIR + f"general_{fid_file}"

try:
    # Fast path: reuse the cached general FID when it exists.
    general_fid = pd.read_csv(general_fid_path)

except FileNotFoundError as e:
    # Only a missing cache file triggers regeneration. Other I/O errors
    # (permissions, corrupt path, ...) propagate instead of being mistaken
    # for "file does not exist".
    print(f"[W] {e}\n[E] General FID does not exist in dir {MODEL_EVAL_DIR}. Creating...")
    general_fid = create_general_fid(
        df=fid,
        dataset_name='opus',
    )

    # Persist the freshly built frame so later runs take the fast path.
    general_fid.to_csv(
        path_or_buf=general_fid_path,
        index=False,
    )

# Experimental settings

## Predictors

In [None]:
# Identifiers of the predictors passed to the evaluation calls.
predictors = ["lin", "rf", "xgb"]
"predictors: " + str(predictors)

## Features dict

In [None]:
# System-performance features whose name contains '-qe'; used by the "*-kiwi" feature sets.
qe_sys_perf_features = [name for name in GENERIC_SYS_PERF_FEATURES if '-qe' in name]

# Named feature sets: each key labels a combination of the feature groups.
features_dict = {
    'All': GENERIC_CONTENT_AWARE_FEATURES + BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES,
    'Basic': BASIC_FEATURES,
    'ContAware': GENERIC_CONTENT_AWARE_FEATURES,
    "MTQual": GENERIC_SYS_PERF_FEATURES,
    "Basic-MTQual": BASIC_FEATURES + GENERIC_SYS_PERF_FEATURES,
    "Basic-ContAware": BASIC_FEATURES + GENERIC_CONTENT_AWARE_FEATURES,
    # content-aware features minus every feature with "gram" in its name
    "ContAware-no-ngrams": [name for name in GENERIC_CONTENT_AWARE_FEATURES if "gram" not in name],
    "ContAware-MTQual": GENERIC_CONTENT_AWARE_FEATURES + GENERIC_SYS_PERF_FEATURES,
    'All-kiwi': GENERIC_CONTENT_AWARE_FEATURES + BASIC_FEATURES + qe_sys_perf_features,
    'Basic-kiwi': BASIC_FEATURES + qe_sys_perf_features,
    'ContAware-kiwi': GENERIC_CONTENT_AWARE_FEATURES + qe_sys_perf_features,
}

print("features_dict:")
print(json.dumps(features_dict, indent=4))

# Evaluate FIPs

In [12]:
# Two independent, initially empty containers handed to the evaluation calls.
res_dict, feature_imp_dict = {}, {}

In [None]:
# Arguments that are identical for every metric; only the target column changes per call.
# res_dict / feature_imp_dict are passed in and presumably filled by eval_FIPs_offline.
shared_eval_kwargs = dict(
    features_dict=features_dict,
    predictors=predictors,
    res_dict=res_dict,
    feature_imp_dict=feature_imp_dict,
    dataset=general_fid,
    dataset_name='opus',
    fip_type='generic',
    fid_type='normal',
)

for metric in ['comet22', 'chrf', 'sacrebleu', 'comet22-qe']:
    eval_FIPs_offline(
        target=f'delta-target_test_set_{metric}',
        **shared_eval_kwargs,
    )

## Results

In [14]:
# One row per entry of res_dict, one column per recorded field.
results = pd.DataFrame(res_dict).transpose()

# Transposing records that mix strings and numbers leaves every column with
# object dtype, and DataFrame.round silently skips non-numeric columns.
# Casting to a numeric dtype first makes the rounding below take effect.
results['PCC'] = pd.to_numeric(results['test-PCC']) * 100  # PCC scaled by 100
results['MAE'] = pd.to_numeric(results['test-mae'])

results = results.round({'PCC': 2, 'MAE': 4})

### COMET22

In [None]:
# Result tables restricted to the rows whose metric is 'comet22'.
comet22_results = results.loc[results.metric == 'comet22']
get_results_table(
    results=comet22_results,
    fid_type='normal',
    average=False,
    single=False,   # False: a table for each predictor rather than a single table
    to_latex=True,  # also print the table in latex
)

### chrF

In [None]:
# Result tables restricted to the rows whose metric is 'chrf'.
chrf_results = results.loc[results.metric == 'chrf']
get_results_table(
    results=chrf_results,
    fid_type='normal',
    average=False,
    single=False,   # False: a table for each predictor rather than a single table
    to_latex=True,  # also print the table in latex
)

### sacreBLEU

In [None]:
# Result tables restricted to the rows whose metric is 'sacrebleu'.
sacrebleu_results = results.loc[results.metric == 'sacrebleu']
get_results_table(
    results=sacrebleu_results,
    fid_type='normal',
    average=False,
    single=False,   # False: a table for each predictor rather than a single table
    to_latex=True,  # also print the table in latex
)

### comet22-qe

In [None]:
# Result tables restricted to the rows whose metric is 'comet22-qe'.
comet22_qe_results = results.loc[results.metric == 'comet22-qe']
get_results_table(
    results=comet22_qe_results,
    fid_type='normal',
    average=False,
    single=False,   # False: a table for each predictor rather than a single table
    to_latex=True,  # also print the table in latex
)

# Test Leave-N-Out

In [12]:
# Separate, initially empty containers for the leave-N-out evaluation runs.
lno_res_dict, lno_feature_imp_dict = {}, {}

In [None]:
# Every combination of `subset_size` test sets, each stored as a list.
subset_size = 1
test_set_combinations = [
    list(subset)
    for subset in itertools.combinations(OPUS_TEST_SETS, subset_size)
]

print(test_set_combinations)
print(len(test_set_combinations))

# Arguments shared by every leave-N-out evaluation call.
lno_eval_kwargs = dict(
    features_dict=features_dict,
    predictors=predictors,
    res_dict=lno_res_dict,
    feature_imp_dict=lno_feature_imp_dict,
    dataset=general_fid,
    dataset_name='opus',
    fip_type='generic',
    fid_type='normal',
)

# NOTE: only 'comet22-qe' is evaluated here; the full metric list would be
# ['comet22', 'chrf', 'sacrebleu', 'comet22-qe'].
for metric in ['comet22-qe']:
    for test_set_comb in test_set_combinations:
        eval_FIPs_offline(
            target=f'delta-target_test_set_{metric}',
            l1o_test_set=test_set_comb,
            **lno_eval_kwargs,
        )

## Results

In [14]:
# One row per entry of lno_res_dict, one column per recorded field.
lno_results = pd.DataFrame(lno_res_dict).transpose()

# Transposing records that mix strings and numbers leaves every column with
# object dtype, and DataFrame.round silently skips non-numeric columns.
# Casting to a numeric dtype first makes the rounding below take effect.
lno_results['PCC'] = pd.to_numeric(lno_results['test-PCC']) * 100  # PCC scaled by 100
lno_results['MAE'] = pd.to_numeric(lno_results['test-mae'])

lno_results = lno_results.round({'PCC': 2, 'MAE': 4})

### COMET22

In [None]:
FID_TYPE = 'normal'

# One styled table per predictor for the 'comet22' rows, shown and then printed as LaTeX.
for predictor in lno_results.predictor.unique():
    row_mask = (
        (lno_results.fid_type == FID_TYPE)
        & (lno_results.predictor == predictor)
        & (lno_results.metric == 'comet22')
    )
    selected = lno_results.loc[row_mask][[
        'l1o-test_set',
        'predictor',
        'metric',
        'features',
        'MAE',
        'PCC',
    ]]

    # Wide layout: one row per left-out test set, MAE/PCC per (predictor, features).
    pivoted = selected.pivot(
        index='l1o-test_set', columns=['predictor', 'features'], values=['MAE', 'PCC']
    )
    # Move the MAE/PCC level into the index, then flip so test sets become columns.
    reshaped = pivoted.stack(level=0).transpose()
    res_table = reshaped.style.apply(highlight_opt, axis=0)

    display(res_table)

    # LaTeX rendering of the styled table.
    print(res_table.to_latex(convert_css=True))

### chrF

In [None]:
FID_TYPE = 'normal'

# One styled table per predictor for the 'chrf' rows, shown and then printed as LaTeX.
for predictor in lno_results.predictor.unique():
    row_mask = (
        (lno_results.fid_type == FID_TYPE)
        & (lno_results.predictor == predictor)
        & (lno_results.metric == 'chrf')
    )
    selected = lno_results.loc[row_mask][[
        'l1o-test_set',
        'predictor',
        'metric',
        'features',
        'MAE',
        'PCC',
    ]]

    # Wide layout: one row per left-out test set, MAE/PCC per (predictor, features).
    pivoted = selected.pivot(
        index='l1o-test_set', columns=['predictor', 'features'], values=['MAE', 'PCC']
    )
    # Move the MAE/PCC level into the index, then flip so test sets become columns.
    reshaped = pivoted.stack(level=0).transpose()
    res_table = reshaped.style.apply(highlight_opt, axis=0)

    display(res_table)

    # LaTeX rendering of the styled table.
    print(res_table.to_latex(convert_css=True))

### sacreBLEU

In [None]:
FID_TYPE = 'normal'

# One styled table per predictor for the 'sacrebleu' rows, shown and then printed as LaTeX.
for predictor in lno_results.predictor.unique():
    row_mask = (
        (lno_results.fid_type == FID_TYPE)
        & (lno_results.predictor == predictor)
        & (lno_results.metric == 'sacrebleu')
    )
    selected = lno_results.loc[row_mask][[
        'l1o-test_set',
        'predictor',
        'metric',
        'features',
        'MAE',
        'PCC',
    ]]

    # Wide layout: one row per left-out test set, MAE/PCC per (predictor, features).
    pivoted = selected.pivot(
        index='l1o-test_set', columns=['predictor', 'features'], values=['MAE', 'PCC']
    )
    # Move the MAE/PCC level into the index, then flip so test sets become columns.
    reshaped = pivoted.stack(level=0).transpose()
    res_table = reshaped.style.apply(highlight_opt, axis=0)

    display(res_table)

    # LaTeX rendering of the styled table.
    print(res_table.to_latex(convert_css=True))

### comet22-qe

In [None]:
FID_TYPE = 'normal'

# One styled table per predictor for the 'comet22-qe' rows, shown and then printed as LaTeX.
for predictor in lno_results.predictor.unique():
    row_mask = (
        (lno_results.fid_type == FID_TYPE)
        & (lno_results.predictor == predictor)
        & (lno_results.metric == 'comet22-qe')
    )
    selected = lno_results.loc[row_mask][[
        'l1o-test_set',
        'predictor',
        'metric',
        'features',
        'MAE',
        'PCC',
    ]]

    # Wide layout: one row per left-out test set, MAE/PCC per (predictor, features).
    pivoted = selected.pivot(
        index='l1o-test_set', columns=['predictor', 'features'], values=['MAE', 'PCC']
    )
    # Move the MAE/PCC level into the index, then flip so test sets become columns.
    reshaped = pivoted.stack(level=0).transpose()
    res_table = reshaped.style.apply(highlight_opt, axis=0)

    display(res_table)

    # LaTeX rendering of the styled table.
    print(res_table.to_latex(convert_css=True))