In [728]:
import os
import json
import pandas as pd
from pathlib import Path

output_folder = "./tmp/"

def load_json(fname):
    with open(fname, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def mkdir(filename):
    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    return filename

# monkeypatching round
def round(value, *args, **kwargs):
    return value

In [729]:
%cd ../mimir_results

/Users/javierr/git/mimir-evaluation-suite/mimir_results


In [730]:
zero = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "nortruthfulqa_gen_nb",
    "norquad_nb",
    # "schibsted_vg_nb",
    "ask_gec_nb",
    "norsumm_nb",
    "norsumm_nn",
    "tatoeba_eng_nno_nn",
    "tatoeba_nno_eng_nn",
    "tatoeba_eng_nob_nb",
    "tatoeba_nob_eng_nb",
    "tatoeba_nob_nno_nb",
    "tatoeba_nno_nob_nn",
]

In [731]:
prompts = {
    "norquad_nb": {
        "prompt-0": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-1": 'Tittel: {title}\n\nTekst: {passage}\n\nGitt teksten over, hva er svaret på følgende spørsmål? "{question}"\n\nSvar:',
        "prompt-2": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-3": 'Tittel: {title}\n\nTekst: {passage}\n\nHvordan kan man svare på spørsmålet "{question}", gitt teksten over?\n\nSvar:',
        "prompt-4": 'Tittel: {title}\n\nTekst:{passage}\n\nGitt teksten over, besvar følgende spørsmål: "{question}"\n\nSvar:',
    }
}

task2metric = {
    "mimir_bias": ["pct_stereotype", "likelihood_diff"],
    "ncb": ["acc"],
    "norec_sentence_nb": ["acc", "f1"],
    "norec_document_nb": ["acc", "f1"],
    "tapaco_no_detection_nb": ["acc"],
    "norbelebele_nb": ["acc"],
    "nrk_nb": ["acc"],
    "nrk_nn": ["acc"],
    "noropenbookqa_nb": ["acc"],
    "noropenbookqa_nn": ["acc"],
    "noropenbookqa_nb_use_fact": ["acc"],
    "noropenbookqa_nn_use_fact": ["acc"],
    "norcommonsenseqa_nn": ["acc"],
    "norcommonsenseqa_nb": ["acc"],
    "nortruthfulqa_mc_nb": ["acc"],
    "nortruthfulqa_mc_nn": ["acc"],
    "norquad_nb": ["exact_match", "f1"],
    "noridiom_nb": ["em", "fscore"],
    "noridiom_nn": ["em", "fscore"],
    "norsumm_nb": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "norsumm_nn": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "nortruthfulqa_gen_nb": ["bleu_max", "rougeL_max"],
    # "schibsted_vg_nb": ["bleu", "chrf"],
    "ask_gec_nb": ["errant"],
    "tatoeba_eng_nno_nn": ["bleu", "chrf"],
    "tatoeba_nno_eng_nn": ["bleu", "chrf"],
    "tatoeba_eng_nob_nb": ["bleu", "chrf"],
    "tatoeba_nob_eng_nb": ["bleu", "chrf"],
    "tatoeba_nob_nno_nb": ["bleu", "chrf"],
    "tatoeba_nno_nob_nn": ["bleu", "chrf"],
}


def pretty_metric(
    task,
    metric_name,
    score,
    metric_list=[
        "f1",
        "acc",
        "pct_stereotype",
        "acc_norm",
        "em",
        "fscore",
        "bertscore_f1_avg",
        "bertscore_f1_max",
    ],
):
    pretty_metric_name = metric_name.replace(",none", "")
    pretty_metric_score = (
        round(score * 100, 3) if pretty_metric_name in metric_list else round(score, 3)
    )
    if task == "norquad_nb":
        pretty_metric_score = round(score, 3)
    return pretty_metric_name, pretty_metric_score


def collect_task_ranking_results(
    task,
    k=0,
    ignore_models=["gpt-sw3-6.7b", "mimir-mistral-7b-extended-instruct", "mimir-mistral-7b-extended-scratch-instruct"],
    ignore_metrics=[
        "alias",
        "bleu_acc",
        "bleu_diff",
        "rouge1_acc",
        "rouge1_diff",
        "rouge2_max",
        "rouge2_acc",
        "rouge2_diff",
        "rougeL_acc",
        "rougeL_diff",
    ],
    verbose=True,
    columns=["task", "model", "k-shot"],
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = model_res["results"][task]
            curr_configuration_res = [task, model, k]
            for metric_name, score in model_res_scores.items():
                if "stderr" in metric_name or metric_name in ignore_metrics:
                    continue
                pretty_metric_name, pretty_metric_score = pretty_metric(
                    task=task, metric_name=metric_name, score=score
                )
                if pretty_metric_name not in res_columns:
                    res_columns.append(pretty_metric_name)
                curr_configuration_res.append(pretty_metric_score)
            res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)


def collect_task_prompt_results(
    task,
    k,
    ignore_models=["gpt-sw3-6.7b", "mimir-mistral-7b-extended-instruct", "mimir-mistral-7b-extended-scratch-instruct"],
    verbose=True,
    columns=["task", "model", "prompt", "k-shot"],
    prompts=prompts,
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if not os.path.exists(model_res_fpath):
                continue
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = {
                prompt_name: prompt_res
                for prompt_name, prompt_res in model_res["results"].items()
                if prompt_name != task
            }
            for configuration_name, configuration_res in model_res_scores.items():
                prompt = (
                    prompts[task][configuration_name]
                    if task in prompts
                    else model_res["configs"][configuration_name]["doc_to_text"]
                )
                curr_configuration_res = [task, model, prompt, k]
                for metric_name, score in configuration_res.items():
                    if "stderr" in metric_name or metric_name == "alias":
                        continue
                    pretty_metric_name, pretty_metric_score = pretty_metric(
                        task=task, metric_name=metric_name, score=score
                    )
                    if pretty_metric_name not in res_columns:
                        res_columns.append(pretty_metric_name)
                    curr_configuration_res.append(pretty_metric_score)
                res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)

In [732]:
collect_task_prompt_results(task="tatoeba_nob_nno_nb", k=4)

tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-fiction/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-rightholders/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-mistral-7b-base/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-factual/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-mistral-7b-extended-scratch/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-translated/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-untranslated/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-books/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-untranslated-withnewspapers/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-nonfiction/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-mistral-7b-base-scratch/results.json
tatoeba_nob_nno_nb/4-shot/mimir-project/mimir-7b-newspapers/results.json


Unnamed: 0,task,model,prompt,k-shot,bleu,chrf
0,tatoeba_nob_nno_nb,mimir-7b-fiction,Bokmål: {{sourceString}}\nNynorsk:,4,75.510648,86.641011
1,tatoeba_nob_nno_nb,mimir-7b-fiction,Oversett følgende setning til nynorsk: {{sourc...,4,75.273164,86.809881
2,tatoeba_nob_nno_nb,mimir-7b-fiction,Gi en nynorsk oversettelse av denne setningen:...,4,75.024818,86.836795
3,tatoeba_nob_nno_nb,mimir-7b-fiction,"Hva blir ""{{sourceString}}"" på nynorsk?\nNynorsk:",4,74.567599,86.880112
4,tatoeba_nob_nno_nb,mimir-7b-rightholders,Bokmål: {{sourceString}}\nNynorsk:,4,77.255057,88.64305
5,tatoeba_nob_nno_nb,mimir-7b-rightholders,Oversett følgende setning til nynorsk: {{sourc...,4,76.795031,88.574907
6,tatoeba_nob_nno_nb,mimir-7b-rightholders,Gi en nynorsk oversettelse av denne setningen:...,4,77.082867,88.422037
7,tatoeba_nob_nno_nb,mimir-7b-rightholders,"Hva blir ""{{sourceString}}"" på nynorsk?\nNynorsk:",4,76.691199,88.342955
8,tatoeba_nob_nno_nb,mimir-mistral-7b-base,Bokmål: {{sourceString}}\nNynorsk:,4,71.63683,87.157549
9,tatoeba_nob_nno_nb,mimir-mistral-7b-base,Oversett følgende setning til nynorsk: {{sourc...,4,70.188621,86.395335


In [733]:
zero_shot = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
#    "tapaco_no_detection_nb",
    "norsumm_nb",
    "norsumm_nn",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "norquad_nb",
    "nortruthfulqa_gen_nb",
]

In [734]:
from functools import reduce


overall = {
    task: (
        collect_task_prompt_results(task, k=0, verbose=False)
        if task not in ["mimir_bias", "ncb"]
        else collect_task_ranking_results(task, k=0, verbose=False)
    )
    for task in zero_shot
}

In [735]:
def merge_ranking_results(tasks, overall=overall, task2metric=task2metric, on="model"):
    res = {
        task_name: overall[task_name].rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }
    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any([name in col for name in ["task_", "shot_"]])
        ]
    ]
    return df


def aggregate_df(df, task, task2metric, select_best):
    task_res = []
    task_columns = ["model"] + task2metric[task]
    for model, subset in df.groupby("model"):
        model_res = [model]
        for metric in task2metric[task]:

            agg_res = dict(subset[metric].describe())
            if select_best:
                model_res.append(round(agg_res["max"], 2))
            else:
                model_res.append(
                    f"{round(agg_res['mean'], 2)} ± {round(agg_res['std'], 1)} [{round(agg_res['max'], 2)}]"
                )
        task_res.append(model_res)
    return pd.DataFrame(task_res, columns=task_columns)


def merge_task_prompt_results(
    tasks, overall=overall, task2metric=task2metric, select_best=True, on="model"
):
    res = {
        task_name: aggregate_df(
            overall[task_name], task_name, task2metric, select_best
        ).rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }

    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any(
                [
                    name in col
                    for name in [
                        "task_",
                        "shot_",
                        "sentence_nb (acc",
                        "document_nb (acc",
                        "acc_norm",
                    ]
                ]
            )
        ]
    ]
    return df

In [736]:
canonical_order = [
    "mimir-mistral-7b-extended-scratch",
    "mimir-mistral-7b-base-scratch",
    "mimir-7b-books",
    "mimir-7b-newspapers",
    "mimir-7b-rightholders",
    "mimir-7b-fiction",
    "mimir-7b-nonfiction",
    "mimir-7b-factual",
    "mimir-7b-untranslated",
    "mimir-7b-untranslated-withnewspapers",
    "mimir-7b-translated",
    "mimir-mistral-7b-extended",
    "mimir-mistral-7b-base",
]

skill = {
    "Sentiment Analysis": ["norec_sentence_nb", "norec_document_nb"],
    "Fairness & Truthfulness": [
        "mimir_bias",
        "nortruthfulqa_mc_nb",
        "nortruthfulqa_mc_nn",
        "nortruthfulqa_gen_nb",
    ],
    "Reading Comprehension": ["norbelebele_nb", "norquad_nb"],
    "World Knowledge": [
        "nrk_nb",
        "nrk_nn",
        "noropenbookqa_nb",
        "noropenbookqa_nn",
        "noropenbookqa_nb_use_fact",
        "noropenbookqa_nn_use_fact",
    ],
    "Commonsense Reasoning": ["norcommonsenseqa_nb", "norcommonsenseqa_nn"],
    "Norwegian Syntax": [
        "ncb",
        "ask_gec_nb",
        "noridiom_nb",
        "noridiom_nn",
    ],
    "Summarization": ["norsumm_nb", "norsumm_nn"],
    "Translation": [
        "tatoeba_eng_nno_nn",
        "tatoeba_nno_eng_nn",
        "tatoeba_eng_nob_nb",
        "tatoeba_nob_eng_nb",
        "tatoeba_nob_nno_nb",
        "tatoeba_nno_nob_nn",
    ],
    "Linguistic Analysis": [
        "inverse_compression",
        "min_max_lix",
        "normalized_inverse_sb",
    ]
    # "Headline generation": ["schibsted_vg_nb"],
}

In [737]:
beautify_columns = {
    "model": "Model",
    # "norec_sentence_nb (f1)": "NoReC",
    # "norec_document_nb (f1)": "NoReC",
}


def pretty_model(model_name):
    model_d = {
        "mimir-mistral-7b-base": "base (warm)",
        "mimir-mistral-7b-extended": "extended (warm)",
        "mimir-7b-fiction": "base + fiction books",
        "mimir-7b-nonfiction": "base + nonfiction books",
        "mimir-7b-factual": "base + nonfiction books + newspapers",
        "mimir-7b-newspapers": "base + newspapers",
        "mimir-7b-books": "base + books",
        "mimir-7b-rightholders": "base + books + newspapers",
        "mimir-7b-untranslated-withnewspapers": "base + original books + newspapers",
        "mimir-7b-untranslated": "base + original books",
        "mimir-7b-translated": "base + translated books",
        "mimir-mistral-7b-base-scratch": "base",
        "mimir-mistral-7b-extended-scratch": "extended",
    }
    # mimir, conf = model_name.replace("-mistral-", "").split("7b")
    # mimir = mimir.replace("mimir", "\textsc{mimir}")
    # pretty_name = f"{mimir}$_\text" + "{" + f"{conf.strip(' -')}".replace("\text", "\text{") + "}$"
    return model_d[model_name]
    # return pretty_name

all_dfs = []
all_tasks = []

def aggregate_by_skill(
    task,
    model_order=canonical_order,
    select_best=True,
    skill=skill,
    add_baselines=[],
    add_k=0,
    overall=overall,
    task2metric=task2metric,
    base_model="mimir-mistral-7b-base-scratch",
    target_metric="f1",
    beautify_columns=beautify_columns,
    append_to_all_dfs=True,
):
    df = merge_task_prompt_results(
        [task], select_best=select_best, overall=overall, task2metric=task2metric
    )
    df = df[df["model"].isin(model_order)]
    # print(f"Task: {task}; Num rows: {df.shape[0]}")
    reference_score = {
        task: score.item()
        for task, score in dict(df[df["model"] == base_model]).items()
        if task != "model"
    }
    if add_baselines:
        df = pd.concat([df, pd.DataFrame(add_baselines, columns=df.columns.tolist())])

    model_order = model_order.copy() + [
        baseline_name for baseline_name, _ in add_baselines
    ]
    ascending = False if "mimir_bias" not in task else True
    model_rank = {
        model: i + 1
        for i, model in enumerate(
            df.sort_values(
                f"{task} ({target_metric})", ascending=ascending
            ).model.tolist()
        )
    }
    df["Rank"] = df["model"].apply(lambda x: model_rank[x])
    df = df.set_index("model").loc[model_order]
    agg, agg_cols = [], [
        "Rank",
        "Model",
        f"{task} ({target_metric})",
        f"delta ({target_metric})",
    ]
    for model_name, row in df.iterrows():
        row_res = [int(row["Rank"])]
        if model_name.startswith("mimir"):
            row_res.append(pretty_model(model_name))
            for task_name, ref in reference_score.items():
                if task_name != f"{task} ({target_metric})":
                    continue
                if model_name == base_model:
                    row_res.append(ref)
                    row_res.append("xmark")
                elif (
                    model_name.endswith("scratch")
                    or ("base" in model_name and model_name != base_model)
                    or "extended" in model_name
                ):
                    row_res.append(row[task_name])
                    row_res.append("xmark")
                else:
                    model_conf_score = row[task_name]
                    row_res.append(model_conf_score)
                    delta = round(model_conf_score - ref, 1)
                    if delta > 0:
                        row_res.append(f"+{delta}")
                    else:
                        row_res.append(f"-{delta}")
        else:
            row_res.extend([model_name, row[f"{task} ({target_metric})"], "xmark"])
        agg.append(row_res)
    agg_df = pd.DataFrame(agg, columns=agg_cols)
    if add_k is not None:
        agg_df["k"] = add_k
        agg_df = agg_df[
            [
                "Rank",
                "Model",
                "k",
                f"{task} ({target_metric})",
                f"delta ({target_metric})",
            ]
        ]
    agg_df.rename(columns=beautify_columns, inplace=True)
    agg_df = agg_df.set_index("Rank")
    if append_to_all_dfs:
        global all_dfs, all_tasks
        all_dfs.append(agg_df)
        all_tasks.append(task)
    return agg_df


def print_latex_df(df):
    print(
        df.to_latex()
        .replace("@", "\\")
        .replace("xmark", "\\xmark")  # .replace("$delta$", "delta")
    )

### Single-shot tasks

In [738]:
task = "norbelebele_nb"
add_baselines = [
    ["Random", 25.00],
]
norbelebele_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norbelebele_nb.to_csv(mkdir(f"{output_folder}/qa/norbelebele_nb.tsv"), sep="\t", index=False)
# print_latex_df(norbelebele_nb)

In [739]:
task = "nrk_nb"
add_baselines = [["Random", 27.91], ["Constant", 30.97]]
nrk_nb = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
nrk_nb.to_csv(mkdir(f"{output_folder}/qa/nrk_nb.tsv"), sep="\t", index=False)
# print_latex_df(nrk_nb)

In [740]:
task = "nrk_nn"
add_baselines = [["Random", 26.76], ["Constant", 30.45]]
nkr_nn = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
nkr_nn.to_csv(mkdir(f"{output_folder}/qa/nrk_nn.tsv"), sep="\t", index=False)
# print_latex_df(nrk_nn)

In [741]:
task = "norcommonsenseqa_nb"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norcommonsenseqa_nb.to_csv(mkdir(
    f"{output_folder}/qa/norcommonsenseqa_nb.tsv"), sep="\t", index=False
)
# print_latex_df(norcommonsenseqa_nb)

task = "norcommonsenseqa_nn"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nn = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norcommonsenseqa_nn.to_csv(mkdir(
    f"{output_folder}/qa/norcommonsenseqa_nn.tsv"), sep="\t", index=False
)
# print_latex_df(aggregate_by_skill(task, add_baselines=[], target_metric="acc"))

In [742]:
task = "ncb"
add_baselines = [
    ["Random", 50.00],
]
ncb = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
ncb.to_csv(mkdir(f"{output_folder}/ranking/ncb.tsv"), sep="\t", index=False)
# print_latex_df(ncb)

In [743]:
task = "noridiom_nb"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=[], target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=[], target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nb = (
    em[["Rank", "Model", "noridiom_nb (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nb.to_csv(mkdir(f"{output_folder}/generation/noridiom_nb.tsv"), sep="\t", index=False)
# print_latex_df(noridiom_nb)

In [744]:
noridiom_nb

Unnamed: 0_level_0,Model,noridiom_nb (em),delta (em),k,noridiom_nb (fscore),delta (fscore)
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9,extended,25.124378,xmark,0,36.78422,xmark
13,base,12.686567,xmark,0,24.146826,xmark
6,base + books,48.756219,+36.069651741293534,0,57.445196,+33.29837020684245
1,base + newspapers,69.402985,+56.71641791044777,0,74.910771,+50.76394449858004
4,base + books + newspapers,51.492537,+38.80597014925374,0,59.057058,+34.910231949301235
12,base + fiction books,16.169154,+3.48258706467662,0,26.930504,+2.7836777244152273
7,base + nonfiction books,48.00995,+35.32338308457712,0,57.685313,+33.538486640988715
2,base + nonfiction books + newspapers,56.965174,+44.278606965174134,0,64.04641,+39.89958408417572
5,base + original books,49.751244,+37.06467661691543,0,59.148034,+35.00120786117262
3,base + original books + newspapers,54.477612,+41.791044776119406,0,61.874001,+37.72717474539235


In [745]:
task = "noridiom_nn"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=[], target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=[], target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nn = (
    em[["Rank", "Model", "noridiom_nn (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nn.to_csv(mkdir(f"{output_folder}/generation/noridiom_nn.tsv"), sep="\t", index=False)
# print_latex_df(noridiom_nn)

In [746]:
task = "nortruthfulqa_mc_nb"
add_baselines = [["Random", 27.27]]
nortruthfulqa_mc_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
nortruthfulqa_mc_nb.to_csv(mkdir(
    f"{output_folder}/qa/nortruthfulqa_mc_nb.tsv"), sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nb)

In [747]:
task = "nortruthfulqa_mc_nn"
add_baselines = [["Random", 24.56]]
nortruthfulqa_mc_nn = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
nortruthfulqa_mc_nn.to_csv(mkdir(
    f"{output_folder}/qa/nortruthfulqa_mc_nn.tsv"), sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nn)

In [748]:
import numpy as np
from operator import itemgetter


def build_ranks(df):
    ranks = {}
    for i, row in df.iterrows():
        rank = np.mean([row[c] for c in dict(row) if "Rank" in c])
        ranks[row["Model"]] = round(rank, 3)
    inverse = {}
    counter = 0
    for model, rank in sorted(ranks.items(), key=itemgetter(1)):
        counter += 1
        if rank not in inverse:
            inverse[rank] = counter
        else:
            continue
    return {model: inverse[rank] for model, rank in ranks.items()}


task = "nortruthfulqa_gen_nb"

bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

trthflqa_rank = build_ranks(bleu_max.merge(rougeL_max))
trhtflqa = []
cols = []

for i, row in bleu_max.merge(rougeL_max).iterrows():

    model_res = []

    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("nortruthfulqa_gen_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    trhtflqa.append(model_res)

df = pd.DataFrame(trhtflqa, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: trthflqa_rank[x])
df = df.set_index("Rank")
df.to_csv(mkdir(f"{output_folder}/generation/nortruthfulqa_gen_nb.tsv"), sep="\t", index=False)
# print_latex_df(df)

In [749]:
task = "mimir_bias"
mimir_bias = aggregate_by_skill(task, add_baselines=[], target_metric="pct_stereotype")
mimir_bias.to_csv(mkdir(f"{output_folder}/ranking/mimir_bias.tsv"), sep="\t", index=False)
# print_latex_df(mimir_bias)

In [750]:
task = "norsumm_nb"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df
# df.to_csv(mkdir(f"{output_folder}/generation/norsumm_nb.tsv"), sep="\t", index=False)
# print_latex_df(df)

Unnamed: 0_level_0,Model,k,bleu,delta bleu,rougeL,delta rougeL,bertscore_f1,delta bertscore_f1
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,extended,0,26.014963,xmark,47.390284,xmark,73.762794,xmark
4,base,0,19.230192,xmark,37.624689,xmark,71.399814,xmark
8,base + books,0,14.750003,--4.480189090543133,31.870236,--5.754452750153831,66.903302,--4.496512413024902
10,base + newspapers,0,12.5153,--6.71489166423733,29.838179,--7.786510230948213,68.843894,--2.555920282999665
5,base + books + newspapers,0,17.859253,--1.370938231442583,36.127881,--1.496807684616364,68.574436,--2.8253776828447883
11,base + fiction books,0,14.18932,--5.04087206254378,30.855552,--6.7691371094723785,65.949312,--5.450502435366303
8,base + nonfiction books,0,15.151113,--4.079078914193223,32.633379,--4.991309806325951,65.876243,--5.523570875326783
7,base + nonfiction books + newspapers,0,15.219038,--4.011153669864921,32.813865,--4.81082335495843,66.931845,--4.467969536781311
11,base + original books,0,14.351724,--4.87846777175554,31.549131,--6.075557610495949,65.770862,--5.628951688607529
6,base + original books + newspapers,0,15.789491,--3.4407010278343932,34.340631,--3.2840573657549257,68.342594,--3.0572197834650723


In [751]:
task = "norsumm_nn"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nn (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df.to_csv(mkdir(f"{output_folder}/generation/norsumm_nn.tsv"), sep="\t", index=False)
# print_latex_df(df)

### Multi-shot tasks

In [752]:
task = "norec_document_nb"

add_baselines = [["Random", 48.43], ["Constant", 40.12]]

ks = [0, 1]

norec_document_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_document_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], overall=norec_document_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [753]:
from collections import deque


def reogranize(
    df, pretty_order=[pretty_model(m) for m in canonical_order], add_baselines=[]
):
    res_df = df.copy()
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for model in pretty_order:
        subset = res_df[res_df["Model"] == model]
        if model in ["Random", "Constant"]:
            subset["k"] = "xmark"
            res.append(pd.DataFrame([subset.iloc[0]]))
        else:
            res.append(subset)
    return pd.concat(res)


def reogranize_by_k(
    df,
    change_rank=True,
    change_cols=False,
    pretty_order=[pretty_model(m) for m in canonical_order],
    add_baselines=[],
):
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for i, subset in df.groupby("k"):
        if change_rank:
            subset["Rank"] = subset.index.tolist()
        for model in pretty_order:
            k_subset = subset[subset["Model"] == model]
            if model != "@factual":
                k_subset["k"] = ""
            res.append(k_subset)
    res_df = pd.concat(res)  # .drop_duplicates(subset=["k", "Model", "Rank"])
    res_df = res_df.set_index("k")  # .drop_duplicates()
    res_cols = res_df.columns.tolist()
    if change_cols and res_cols[0] != "Rank":
        new_columns = deque(res_cols)
        new_columns.rotate(1)
        return res_df[list(new_columns)].reset_index()
    return res_df

In [754]:
# reogranize_by_k(norec_document_nb_df, add_baselines=["Random", "Constant"]).to_csv(mkdir(
reogranize_by_k(norec_document_nb_df, add_baselines=[]).to_csv(mkdir(
    f"{output_folder}/clf/norec_document_nb.tsv"), sep="\t", index=False
)
# print_latex_df()

In [755]:
task = "norec_sentence_nb"

add_baselines = [["Random", 48.52], ["Constant", 40.75]]

ks = [0, 1, 4, 16]

norec_sentence_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_sentence_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], overall=norec_sentence_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [756]:
p1 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([0, 1])],
    add_baselines=[],  #["Random", "Constant"],
)
p2 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([4, 16])],
    add_baselines=[],  # ["Random", "Constant"],
)

In [757]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/clf/norec_sentence_nb.tsv"), sep="\t", index=False
)

In [758]:
# print_latex_df(p1)

In [759]:
# print_latex_df(p2)

In [760]:
task = "norquad_nb"

ks = [0, 1]

norquad_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norquad_em_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="exact_match",
            overall=norquad_nb[k],
            add_k=k,
        )
        for k in ks
    ]
)

norquad_f1_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], target_metric="f1", overall=norquad_nb[k], add_k=k
        )
        for k in ks
    ]
)

norquad_em_df["Rank"] = norquad_em_df.index.tolist()

norquad_nb_df = (
    norquad_em_df[
        ["Rank", "Model", "k", "norquad_nb (exact_match)", "delta (exact_match)"]
    ]
    .merge(norquad_f1_df, on=["Model", "k"])
    .set_index("Rank")
)

# print_latex_df(reogranize_by_k(norquad_nb_df, add_baselines=[]))

In [761]:
reogranize_by_k(norquad_nb_df, add_baselines=[]).to_csv(mkdir(
    f"{output_folder}/qa/norquad_nb.tsv"), sep="\t", index=False
)

In [762]:
task = "noropenbookqa_nb"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nb[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [763]:
p1 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([0, 1])],
    add_baselines=[],  # ["Random", "Constant"],
)
# print_latex_df(p1)

In [764]:
p2 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([4, 16])],
    add_baselines=[],  # ["Random", "Constant"],
)
# print_latex_df(p2)

In [765]:
pd.concat([p1, p2]).to_csv(mkdir(f"{output_folder}/qa/noropenbookqa_nb.tsv"), sep="\t", index=False)

In [766]:
task = "noropenbookqa_nn"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nn[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [767]:
p1 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([0, 1])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p1)

In [768]:
p2 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([4, 16])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p2)

In [769]:
pd.concat([p1, p2]).to_csv(mkdir(f"{output_folder}/qa/noropenbookqa_nn.tsv"), sep="\t", index=False)

In [770]:
task = "noropenbookqa_nb_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nb_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nb_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [771]:
import warnings

warnings.filterwarnings("ignore")

In [772]:
p1 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [773]:
p2 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [774]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/qa/noropenbookqa_nb_use_fact.tsv"), sep="\t", index=False
)

In [775]:
task = "noropenbookqa_nn_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nn_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [776]:
p1 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [777]:
p2 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [778]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/qa/noropenbookqa_nn_use_fact.tsv"), sep="\t", index=False
)

In [779]:
# 260
task = "ask_gec_nb"
ask_gec = pd.read_csv("ask_gec_nb/ask_gec_k_shot.tsv", sep="\t").rename(
    columns={"k": "k-shot"}
)
ask_gec["task"] = task

ask_gec = ask_gec[["task", "model", "prompt", "k-shot", "errant"]]

ask_gec_overall = {k: {task: subset} for k, subset in ask_gec.groupby("k-shot")}

ask_gec_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=ask_gec_overall[k],
            add_k=k,
            target_metric="errant",
        )
        for k in ask_gec_overall
    ]
)

In [780]:
p1 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([0, 1])], add_baselines=[], change_cols=True
)
# print_latex_df(p1)

In [781]:
p2 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([1, 16])], add_baselines=[], change_cols=True
)
# print_latex_df()

In [782]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/generation/ask_gec_nb.tsv"), sep="\t", index=False
)

In [783]:
# bertscore_vg = pd.read_csv("bertscore_vg_k_shot.tsv", sep="\t")

In [784]:
# bertscore_vg.sample(5)

In [785]:
# task = "schibsted_vg_nb"

# ks = [0, 1]

# schibsted_vg_nb = {
#     k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
# }

# schibsted_vg_nb_bs = {k: {task: subset} for k, subset in bertscore_vg.groupby("k")}

In [786]:
# schibsted_vg_nb[0]["schibsted_vg_nb"].sample(2)

In [787]:
# schibsted_vg_nb_bs[0]["schibsted_vg_nb"].sample(2)

In [788]:
task2metric_bs = {
#    "schibsted_vg_nb": ["bertscore_f1"],
    "tatoeba_eng_nno_nn": ["bertscore_f1"],
    "tatoeba_nno_eng_nn": ["bertscore_f1"],
    "tatoeba_eng_nob_nb": ["bertscore_f1"],
    "tatoeba_nob_eng_nb": ["bertscore_f1"],
    "tatoeba_nob_nno_nb": ["bertscore_f1"],
    "tatoeba_nno_nob_nn": ["bertscore_f1"],
}

In [789]:
# schibsted_res = {}
# target_cols = [
#     "Rank",
#     "Model",
#     "bleu",
#     "delta bleu",
#     "chrf",
#     "delta chrf",
#     "bertscore_f1",
#     "delta bertscore_f1",
#     "k",
# ]

# for k in ks:
#     bleu = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="bleu",
#         overall=schibsted_vg_nb[k],
#         add_k=k,
#     )
#     bleu["Rank_bleu"] = bleu.index.tolist()

#     chrf = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="chrf",
#         overall=schibsted_vg_nb[k],
#         add_k=k,
#     )
#     chrf["Rank_chrf"] = chrf.index.tolist()

#     bertscore = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="bertscore_f1",
#         overall=schibsted_vg_nb_bs[k],
#         add_k=k,
#         task2metric=task2metric_bs,
#     )
#     bertscore["Rank_bertscore"] = bertscore.index.tolist()

#     merged = bleu.merge(chrf).merge(bertscore)
#     ranks = build_ranks(merged)

#     k_res, cols = [], []
#     for i, row in merged.iterrows():
#         model_res = []
#         for key, v in dict(row).items():
#             key = (
#                 key.replace("_max)", "")
#                 .replace("schibsted_vg_nb (", "")
#                 .replace("delta (", "delta ")
#                 .rstrip(")")
#             )
#             if "Rank" in key:
#                 continue
#             if key not in cols:
#                 cols.append(key)
#             model_res.append(v)
#         k_res.append(model_res)
#     k_df = pd.DataFrame(k_res, columns=cols)
#     k_df["Rank"] = df["Model"].apply(lambda x: ranks[x])
#     schibsted_res[k] = k_df[target_cols]

In [790]:
# print_latex_df(reogranize_by_k(schibsted_res[0]))

In [791]:
# print_latex_df(reogranize_by_k(schibsted_res[1]))

In [792]:
bertscore_mt = pd.read_csv("bertscore_mt_k_shot.tsv", sep="\t")

In [793]:
bertscore_mt.head(2)

Unnamed: 0,model,bertscore_f1,k,prompt,task
0,mimir-mistral-7b-base-scratch,86.471,0,prompt_0,tatoeba_eng_nno_nn
1,mimir-mistral-7b-base-scratch,79.985,0,prompt_2,tatoeba_eng_nno_nn


In [794]:
def get_mt_results(
    task,
    ks,
    bertscore_mt=bertscore_mt,
    task2metric_bs=task2metric_bs,
    change_cols=True,
    task2metric=task2metric,
):
    mt = {k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks}

    mt_bs = {
        k: {task: subset}
        for k, subset in bertscore_mt[bertscore_mt["task"] == task].groupby("k")
    }

    mt_res = {}

    target_cols = [
        "Rank",
        "Model",
        "bleu",
        "delta bleu",
        "chrf",
        "delta chrf",
        # "bertscore_f1",
        # "delta bertscore_f1",
        "k",
    ]

    for k in ks:
        bleu = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="bleu",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        bleu["Rank_bleu"] = bleu.index.tolist()

        chrf = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="chrf",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        chrf["Rank_chrf"] = chrf.index.tolist()

        # bertscore = aggregate_by_skill(
        #     task,
        #     add_baselines=[],
        #     target_metric="bertscore_f1",
        #     overall=mt_bs[k],
        #     add_k=k,
        #     task2metric=task2metric_bs,
        # )
        # bertscore["Rank_bertscore"] = bertscore.index.tolist()

        merged = bleu.merge(chrf)  # .merge(bertscore)
        ranks = build_ranks(merged)
        k_res, cols = [], []
        for i, row in merged.iterrows():
            model_res = []
            for key, v in dict(row).items():
                key = (
                    key.replace("_max)", "")
                    .replace(f"{task} (", "")
                    .replace("delta (", "delta ")
                    .rstrip(")")
                )
                if "Rank" in key:
                    continue
                if key not in cols:
                    cols.append(key)
                model_res.append(v)
            k_res.append(model_res)
        k_df = pd.DataFrame(k_res, columns=cols)
        k_df["Rank"] = k_df["Model"].apply(lambda x: ranks[x])
        mt_res[k] = k_df[target_cols]
    return mt_res

In [795]:
ks = [0, 1, 4, 16]

tatoeba_eng_nob_nb = get_mt_results(task="tatoeba_eng_nob_nb", ks=ks)

In [796]:
p1 = reogranize_by_k(tatoeba_eng_nob_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nob_nb[4], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nob_nb[1], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nob_nb[16], change_rank=False, add_baselines=[])

In [797]:
pd.concat([p1, p3, p2, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_eng_nob_nb.tsv"), sep="\t", index=False
)

In [798]:
tatoeba_nob_eng_nb = get_mt_results(task="tatoeba_nob_eng_nb", ks=ks)

In [799]:
p1 = reogranize_by_k(tatoeba_nob_eng_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nob_eng_nb[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nob_eng_nb[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nob_eng_nb[16], change_rank=False, add_baselines=[])

In [800]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_nob_eng_nb.tsv"), sep="\t", index=False
)

In [801]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [802]:
tatoeba_eng_nno_nn = get_mt_results(task="tatoeba_eng_nno_nn", ks=ks)

In [803]:
p1 = reogranize_by_k(tatoeba_eng_nno_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nno_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nno_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nno_nn[16], change_rank=False, add_baselines=[])

In [804]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_eng_nno_nn.tsv"), sep="\t", index=False
)

In [805]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [806]:
tatoeba_nno_eng_nn = get_mt_results(task="tatoeba_nno_eng_nn", ks=ks)

In [807]:
p1 = reogranize_by_k(tatoeba_nno_eng_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nno_eng_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nno_eng_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nno_eng_nn[16], change_rank=False, add_baselines=[])

In [808]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_nno_eng_nn.tsv"), sep="\t", index=False
)

In [809]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [810]:
tatoeba_nob_nno_nb = get_mt_results(task="tatoeba_nob_nno_nb", ks=[0])
del tatoeba_nob_nno_nb[0]["k"]
p1 = reogranize(tatoeba_nob_nno_nb[0].set_index("Rank"), add_baselines=[])
p1.to_csv(mkdir(f"{output_folder}/generation/tatoeba_nob_nno_nb.tsv"), sep="\t", index=False)
# print_latex_df(p1)


# ks = [0, 1, 4]
# tatoeba_nob_nno_nb = get_mt_results(task="tatoeba_nob_nno_nb", ks=ks)
# p1 = reogranize_by_k(tatoeba_nob_nno_nb[0], change_rank=False, add_baselines=[])
# p2 = reogranize_by_k(tatoeba_nob_nno_nb[1], change_rank=False, add_baselines=[])
# p3 = reogranize_by_k(tatoeba_nob_nno_nb[4], change_rank=False, add_baselines=[])
# pd.concat([p1, p2, p3]).to_csv(mkdir(
#     f"{output_folder}/generation/tatoeba_nob_nno_nb.tsv"), sep="\t", index=False
# )

In [811]:
import shutil

shutil.make_archive(f"{output_folder}", "zip", f"{output_folder}")

'/Users/javierr/git/mimir-evaluation-suite/mimir_results/tmp/.zip'

In [812]:
tatoeba_nno_nob_nn = get_mt_results(task="tatoeba_nno_nob_nn", ks=[0])
del tatoeba_nno_nob_nn[0]["k"]
p1 = reogranize(tatoeba_nno_nob_nn[0].set_index("Rank"), add_baselines=[])
p1.to_csv(mkdir(f"{output_folder}/generation/tatoeba_nno_nob_nn.tsv"), sep="\t", index=False)
print_latex_df(p1)

\begin{tabular}{llrlrl}
\toprule
 & Model & bleu & delta bleu & chrf & delta chrf \\
Rank &  &  &  &  &  \\
\midrule
5 & extended & 12.501190 & \xmark & 47.179663 & \xmark \\
2 & base & 24.359707 & \xmark & 64.144959 & \xmark \\
13 & base + books & 8.479467 & --15.880239453223185 & 37.372424 & --26.772534319516154 \\
1 & base + newspapers & 45.307222 & +20.947515274587882 & 77.562702 & +13.41774278651225 \\
6 & base + books + newspapers & 11.799298 & --12.56040924963037 & 45.518905 & --18.626054151587304 \\
10 & base + fiction books & 8.925170 & --15.43453721576794 & 38.740423 & --25.40453535310106 \\
9 & base + nonfiction books & 9.789933 & --14.569774361049223 & 40.595627 & --23.54933218441913 \\
3 & base + nonfiction books + newspapers & 17.056241 & --7.303465626014528 & 55.192414 & --8.952544582216426 \\
11 & base + original books & 8.766193 & --15.593513570074098 & 37.968024 & --26.17693497179053 \\
4 & base + original books + newspapers & 16.526474 & --7.8332326200378475 & 53.496

In [813]:
lang_df = pd.read_csv("./linguistic_evaluation.tsv", sep="\t", index_col=False)[["model", "inverse_compression", "min_max_lix", "normalized_inverse_sb"]].set_index("model")
lang_df = lang_df.reindex(canonical_order)
lang_df = lang_df.reset_index()
lang_df.model = lang_df.model.apply(pretty_model)
lang_df = lang_df.set_index("model")
lang_df.index.name = "Model"
lang_df

Unnamed: 0_level_0,inverse_compression,min_max_lix,normalized_inverse_sb
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
extended,40.482552,65.047945,70.0733
base,40.309578,61.109302,70.7573
base + books,39.607098,67.60619,69.0641
base + newspapers,40.721587,65.938873,69.4485
base + books + newspapers,40.451438,67.054955,69.4711
base + fiction books,40.133242,78.824808,66.9688
base + nonfiction books,39.585148,62.928665,70.3146
base + nonfiction books + newspapers,40.351868,66.425485,69.6265
base + original books,39.625931,65.38095,70.1595
base + original books + newspapers,40.241449,66.374098,69.4104


In [828]:
# highest k-shot available (lower than 16)
ks = {
    "ask_gec_nb": 4,
    "mimir_bias": 0,
    "ncb": 0,
    "norbelebele_nb": 0,
    "norcommonsenseqa_nb": 0,
    "norcommonsenseqa_nn": 0,
    "norec_document_nb": 1,
    "norec_sentence_nb": 4,
    "noridiom_nb": 0,
    "noridiom_nn": 0,
    "noropenbookqa_nb_use_fact": 4,
    "noropenbookqa_nb": 4,
    "noropenbookqa_nn_use_fact": 4,
    "noropenbookqa_nn": 4,
    "norquad_nb": 1,
    "norsumm_nb": 0,
    "norsumm_nn": 0,
    "nortruthfulqa_gen_nb": 0,
    "nortruthfulqa_mc_nb": 0,
    "nortruthfulqa_mc_nn": 0,
    "nrk_nb": 0,
    "nrk_nn": 0,
    "tatoeba_eng_nno_nn": 4,
    "tatoeba_eng_nob_nb": 4,
    "tatoeba_nno_eng_nn": 4,
    "tatoeba_nno_nob_nn": 0,  # should be 4, but there's an error in the bertscore processing
    "tatoeba_nob_eng_nb": 4,
    "tatoeba_nob_nno_nb": 0,  # should be 4, but there's an error in the bertscore processing
    # "schibsted_vg_nb": 0,
}
report_metrics_dict = {
    "mimir_bias": ["pct_stereotype"],
    "ncb": ["acc"],
    "norec_sentence_nb": ["f1"],
    "norec_document_nb": ["f1"],
    "tapaco_no_detection_nb": ["acc"],
    "norbelebele_nb": ["acc"],
    "nrk_nb": ["acc"],
    "nrk_nn": ["acc"],
    "noropenbookqa_nb": ["acc"],
    "noropenbookqa_nn": ["acc"],
    "noropenbookqa_nb_use_fact": ["acc"],
    "noropenbookqa_nn_use_fact": ["acc"],
    "norcommonsenseqa_nn": ["acc"],
    "norcommonsenseqa_nb": ["acc"],
    "nortruthfulqa_mc_nb": ["acc"],
    "nortruthfulqa_mc_nn": ["acc"],
    "norquad_nb": ["f1"],
    "noridiom_nb": ["fscore"],
    "noridiom_nn": ["fscore"],
    "norsumm_nb": [
        #"bleu_max",
        # "bleu_avg",
        "rougeL_max",
        # "rougeL_avg",
        # "bertscore_f1_max",
        # "bertscore_f1_avg",
    ],
    "norsumm_nn": [
        # "bleu_max",
        # "bleu_avg",
        "rougeL_max",
        # "rougeL_avg",
        # "bertscore_f1_max",
        # "bertscore_f1_avg",
    ],
    "nortruthfulqa_gen_nb": ["rougeL_max"],  # ["bleu_max", "rougeL_max"],
    # "schibsted_vg_nb": ["bleu", "chrf"],
    "ask_gec_nb": ["errant"],
    "tatoeba_eng_nno_nn": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nno_eng_nn": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_eng_nob_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nob_eng_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nob_nno_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nno_nob_nn": ["bleu"],  # ["bleu", "chrf"],
}
report_metrics = []
for report_metric_key, report_metric_values in report_metrics_dict.items():
    for report_metric_value in report_metric_values:
        report_metrics.append(f"{report_metric_key} ({report_metric_value})")
selected_dfs = []
for task, df in zip(*[all_tasks, all_dfs]):
    if df["k"].unique()[0] == ks[task]:
        selected_dfs.append(df.loc[:,~df.columns.str.contains('^(delta|rank|k)', case=False)].set_index("Model"))

report_df = pd.concat(selected_dfs, axis=1,  sort=False)
report_df = report_df.loc[:,report_df.columns.isin(report_metrics)]
print(report_df.columns)
report_df.columns = [col.split(" (")[0] for col in report_df.columns]
report_df = report_df.join(lang_df)
report_df

Index(['norbelebele_nb (acc)', 'nrk_nb (acc)', 'nrk_nn (acc)',
       'norcommonsenseqa_nb (acc)', 'norcommonsenseqa_nn (acc)', 'ncb (acc)',
       'noridiom_nb (fscore)', 'noridiom_nn (fscore)',
       'nortruthfulqa_mc_nb (acc)', 'nortruthfulqa_mc_nn (acc)',
       'nortruthfulqa_gen_nb (rougeL_max)', 'mimir_bias (pct_stereotype)',
       'norsumm_nb (rougeL_max)', 'norsumm_nn (rougeL_max)',
       'norec_document_nb (f1)', 'norec_sentence_nb (f1)', 'norquad_nb (f1)',
       'noropenbookqa_nb (acc)', 'noropenbookqa_nn (acc)',
       'noropenbookqa_nb_use_fact (acc)', 'noropenbookqa_nn_use_fact (acc)',
       'ask_gec_nb (errant)', 'tatoeba_eng_nob_nb (bleu)',
       'tatoeba_nob_eng_nb (bleu)', 'tatoeba_eng_nno_nn (bleu)',
       'tatoeba_nno_eng_nn (bleu)', 'tatoeba_nob_nno_nb (bleu)',
       'tatoeba_nno_nob_nn (bleu)'],
      dtype='object')


Unnamed: 0_level_0,norbelebele_nb,nrk_nb,nrk_nn,norcommonsenseqa_nb,norcommonsenseqa_nn,ncb,noridiom_nb,noridiom_nn,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,...,ask_gec_nb,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression,min_max_lix,normalized_inverse_sb
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
extended,30.555556,48.972222,59.924812,43.578644,38.947368,79.52381,36.78422,32.585235,52.631579,50.877193,...,37.07,56.667414,56.392314,44.590171,55.428263,11.658696,12.50119,40.482552,65.047945,70.0733
base,28.777778,45.805556,54.285714,42.857143,36.842105,77.261905,24.146826,14.221166,58.373206,54.385965,...,34.25,54.185678,54.655333,42.729146,53.642484,15.992702,24.359707,40.309578,61.109302,70.7573
base + books,27.666667,44.527778,53.007519,44.444444,34.736842,83.809524,57.445196,27.679681,59.330144,56.140351,...,35.52,55.262219,54.746218,42.131737,53.692677,12.757572,8.479467,39.607098,67.60619,69.0641
base + newspapers,30.222222,45.833333,55.18797,40.836941,34.736842,81.071429,74.910771,42.637639,47.84689,43.859649,...,32.65,53.875333,53.509675,44.126743,53.046373,64.203105,45.307222,40.721587,65.938873,69.4485
base + books + newspapers,27.777778,44.916667,54.511278,44.588745,38.947368,84.047619,59.057058,28.478496,58.851675,50.877193,...,35.19,55.304716,54.672177,42.716894,53.644498,16.639855,11.799298,40.451438,67.054955,69.4711
base + fiction books,27.666667,42.972222,49.473684,44.877345,37.894737,83.928571,26.930504,19.526365,45.454545,43.859649,...,31.94,53.794761,52.821867,41.767026,52.682172,9.186173,8.92517,40.133242,78.824808,66.9688
base + nonfiction books,29.0,43.833333,53.609023,43.434343,37.894737,84.166667,57.685313,31.497707,60.76555,63.157895,...,35.62,55.071566,54.325501,43.016892,53.771744,12.033895,9.789933,39.585148,62.928665,70.3146
base + nonfiction books + newspapers,29.0,45.0,55.18797,44.300144,41.052632,82.142857,64.04641,35.258885,56.45933,56.140351,...,35.2,54.982987,54.50096,42.870092,53.502854,14.401647,17.056241,40.351868,66.425485,69.6265
base + original books,28.222222,44.972222,54.586466,45.310245,36.842105,84.880952,59.148034,31.556056,61.722488,54.385965,...,35.3,54.900101,54.582642,42.598892,53.853974,13.039856,8.766193,39.625931,65.38095,70.1595
base + original books + newspapers,28.0,44.333333,55.037594,45.021645,37.894737,83.809524,61.874001,32.834751,55.980861,57.894737,...,35.24,54.962416,54.399651,43.038514,53.398338,16.442273,16.526474,40.241449,66.374098,69.4104


In [829]:
skills_df = {}
for skill_, metrics in skill.items():
    skills_df[skill_] = report_df[metrics]

In [830]:
report = pd.concat(skills_df.values(), keys=skills_df.keys(),axis=1)
report.to_csv("./report.tsv", sep="\t")
report

Unnamed: 0_level_0,Sentiment Analysis,Sentiment Analysis,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Reading Comprehension,Reading Comprehension,World Knowledge,World Knowledge,...,Summarization,Translation,Translation,Translation,Translation,Translation,Translation,Linguistic Analysis,Linguistic Analysis,Linguistic Analysis
Unnamed: 0_level_1,norec_sentence_nb,norec_document_nb,mimir_bias,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,nortruthfulqa_gen_nb,norbelebele_nb,norquad_nb,nrk_nb,nrk_nn,...,norsumm_nn,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression,min_max_lix,normalized_inverse_sb
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
extended,82.090207,65.395523,66.343434,52.631579,50.877193,25.181798,30.555556,48.199335,48.972222,59.924812,...,34.610032,44.590171,55.428263,56.667414,56.392314,11.658696,12.50119,40.482552,65.047945,70.0733
base,65.601004,67.026313,63.959596,58.373206,54.385965,23.122931,28.777778,47.299865,45.805556,54.285714,...,27.266934,42.729146,53.642484,54.185678,54.655333,15.992702,24.359707,40.309578,61.109302,70.7573
base + books,74.786108,70.287647,64.0,59.330144,56.140351,27.017284,27.666667,39.100198,44.527778,53.007519,...,27.748225,42.131737,53.692677,55.262219,54.746218,12.757572,8.479467,39.607098,67.60619,69.0641
base + newspapers,74.913273,69.293372,64.444444,47.84689,43.859649,22.856973,30.222222,40.364153,45.833333,55.18797,...,28.687373,44.126743,53.046373,53.875333,53.509675,64.203105,45.307222,40.721587,65.938873,69.4485
base + books + newspapers,75.353774,73.324913,65.131313,58.851675,50.877193,25.753459,27.777778,42.405521,44.916667,54.511278,...,27.597571,42.716894,53.644498,55.304716,54.672177,16.639855,11.799298,40.451438,67.054955,69.4711
base + fiction books,73.07337,68.971084,64.808081,45.454545,43.859649,25.486099,27.666667,39.7605,42.972222,49.473684,...,25.970056,41.767026,52.682172,53.794761,52.821867,9.186173,8.92517,40.133242,78.824808,66.9688
base + nonfiction books,76.786017,66.805905,64.323232,60.76555,63.157895,24.387466,29.0,34.218932,43.833333,53.609023,...,26.410719,43.016892,53.771744,55.071566,54.325501,12.033895,9.789933,39.585148,62.928665,70.3146
base + nonfiction books + newspapers,79.451572,71.493354,63.353535,56.45933,56.140351,24.114753,29.0,44.319326,45.0,55.18797,...,28.75063,42.870092,53.502854,54.982987,54.50096,14.401647,17.056241,40.351868,66.425485,69.6265
base + original books,76.417902,67.044712,62.10101,61.722488,54.385965,25.115272,28.222222,37.514894,44.972222,54.586466,...,25.923147,42.598892,53.853974,54.900101,54.582642,13.039856,8.766193,39.625931,65.38095,70.1595
base + original books + newspapers,75.493905,72.286656,63.959596,55.980861,57.894737,25.670059,28.0,42.838237,44.333333,55.037594,...,26.999041,43.038514,53.398338,54.962416,54.399651,16.442273,16.526474,40.241449,66.374098,69.4104


In [831]:
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in skill.keys()}, axis=1).reindex(report.index)
report_agg.to_csv("./report_skill.tsv", sep="\t")
report_agg

Unnamed: 0_level_0,Sentiment Analysis,Fairness & Truthfulness,Reading Comprehension,World Knowledge,Commonsense Reasoning,Norwegian Syntax,Summarization,Translation,Linguistic Analysis
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
extended,73.742865,48.758501,39.377445,42.157904,41.263006,46.490816,41.000158,39.539675,58.534599
base,66.313658,49.960425,38.038821,40.246693,39.849624,37.469974,32.445811,40.927508,57.39206
base + books,72.536878,51.621945,33.383433,39.595676,39.590643,51.1136,29.80923,37.844982,58.759129
base + newspapers,72.103322,44.751989,35.293187,40.676246,37.786891,57.81746,29.262776,52.344742,58.702986
base + books + newspapers,74.339343,50.15341,35.091649,39.580583,41.768057,51.693293,31.862726,39.129573,58.992498
base + fiction books,71.022227,44.902094,33.713583,39.180261,41.386041,40.58136,28.412804,36.529528,61.975617
base + nonfiction books,71.795961,53.158536,31.609466,39.302408,40.66454,52.242422,29.522049,38.001588,57.609471
base + nonfiction books + newspapers,75.472463,50.016992,36.659663,40.176677,42.676388,54.162038,30.782248,39.552463,58.801284
base + original books,71.731307,50.831184,32.868558,40.356035,41.076175,52.721261,28.736139,37.956943,58.388794
base + original books + newspapers,73.89028,50.876313,35.419119,39.845628,41.458191,53.439569,30.669836,39.794611,58.675315


In [832]:
%pwd

'/Users/javierr/git/mimir-evaluation-suite/mimir_results'

In [821]:
# zero-shot
ks = {
    "ask_gec_nb": 0,
    "mimir_bias": 0,
    "ncb": 0,
    "norbelebele_nb": 0,
    "norcommonsenseqa_nb": 0,
    "norcommonsenseqa_nn": 0,
    "norec_document_nb": 0,
    "norec_sentence_nb": 0,
    "noridiom_nb": 0,
    "noridiom_nn": 0,
    "noropenbookqa_nb_use_fact": 0,
    "noropenbookqa_nb": 0,
    "noropenbookqa_nn_use_fact": 0,
    "noropenbookqa_nn": 0,
    "norquad_nb": 0,
    "norsumm_nb": 0,
    "norsumm_nn": 0,
    "nortruthfulqa_gen_nb": 0,
    "nortruthfulqa_mc_nb": 0,
    "nortruthfulqa_mc_nn": 0,
    "nrk_nb": 0,
    "nrk_nn": 0,
    "tatoeba_eng_nno_nn": 0,
    "tatoeba_eng_nob_nb": 0,
    "tatoeba_nno_eng_nn": 0,
    "tatoeba_nno_nob_nn": 0,  # should be 4, but there's an error in the bertscore processing
    "tatoeba_nob_eng_nb": 0,
    "tatoeba_nob_nno_nb": 0,  # should be 4, but there's an error in the bertscore processing
    # "schibsted_vg_nb": 0,
}
report_metrics = []
for report_metric_key, report_metric_values in report_metrics_dict.items():
    for report_metric_value in report_metric_values:
        report_metrics.append(f"{report_metric_key} ({report_metric_value})")
selected_dfs = []
for task, df in zip(*[all_tasks, all_dfs]):
    if df["k"].unique()[0] == ks[task]:
        selected_dfs.append(df.loc[:,~df.columns.str.contains('^(delta|rank|k)', case=False)].set_index("Model"))

report_df = pd.concat(selected_dfs, axis=1,  sort=False)
report_df = report_df.loc[:,report_df.columns.isin(report_metrics)]
print(report_df.columns)
report_df.columns = [col.split(" (")[0] for col in report_df.columns]
report_df = report_df.join(lang_df)
report_df

Index(['norbelebele_nb (acc)', 'nrk_nb (acc)', 'nrk_nn (acc)',
       'norcommonsenseqa_nb (acc)', 'norcommonsenseqa_nn (acc)', 'ncb (acc)',
       'noridiom_nb (fscore)', 'noridiom_nn (fscore)',
       'nortruthfulqa_mc_nb (acc)', 'nortruthfulqa_mc_nn (acc)',
       'nortruthfulqa_gen_nb (rougeL_max)', 'mimir_bias (pct_stereotype)',
       'norsumm_nb (rougeL_max)', 'norsumm_nn (rougeL_max)',
       'norec_document_nb (f1)', 'norec_sentence_nb (f1)', 'norquad_nb (f1)',
       'noropenbookqa_nb (acc)', 'noropenbookqa_nn (acc)',
       'noropenbookqa_nb_use_fact (acc)', 'noropenbookqa_nn_use_fact (acc)',
       'ask_gec_nb (errant)', 'tatoeba_eng_nob_nb (bleu)',
       'tatoeba_nob_eng_nb (bleu)', 'tatoeba_eng_nno_nn (bleu)',
       'tatoeba_nno_eng_nn (bleu)', 'tatoeba_nob_nno_nb (bleu)',
       'tatoeba_nno_nob_nn (bleu)'],
      dtype='object')


Unnamed: 0_level_0,norbelebele_nb,nrk_nb,nrk_nn,norcommonsenseqa_nb,norcommonsenseqa_nn,ncb,noridiom_nb,noridiom_nn,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,...,ask_gec_nb,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression,min_max_lix,normalized_inverse_sb
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
extended,30.555556,48.972222,59.924812,43.578644,38.947368,79.52381,36.78422,32.585235,52.631579,50.877193,...,7.94,19.718004,10.897911,14.94887,10.379039,11.658696,12.50119,40.482552,65.047945,70.0733
base,28.777778,45.805556,54.285714,42.857143,36.842105,77.261905,24.146826,14.221166,58.373206,54.385965,...,4.75,13.648272,10.071595,12.301419,12.860826,15.992702,24.359707,40.309578,61.109302,70.7573
base + books,27.666667,44.527778,53.007519,44.444444,34.736842,83.809524,57.445196,27.679681,59.330144,56.140351,...,10.68,10.632341,10.271388,4.723513,8.785296,12.757572,8.479467,39.607098,67.60619,69.0641
base + newspapers,30.222222,45.833333,55.18797,40.836941,34.736842,81.071429,74.910771,42.637639,47.84689,43.859649,...,5.32,34.42363,36.890106,28.175257,35.063173,64.203105,45.307222,40.721587,65.938873,69.4485
base + books + newspapers,27.777778,44.916667,54.511278,44.588745,38.947368,84.047619,59.057058,28.478496,58.851675,50.877193,...,7.84,10.229949,10.484609,4.881404,7.650597,16.639855,11.799298,40.451438,67.054955,69.4711
base + fiction books,27.666667,42.972222,49.473684,44.877345,37.894737,83.928571,26.930504,19.526365,45.454545,43.859649,...,8.61,8.069368,7.531059,4.704739,5.634942,9.186173,8.92517,40.133242,78.824808,66.9688
base + nonfiction books,29.0,43.833333,53.609023,43.434343,37.894737,84.166667,57.685313,31.497707,60.76555,63.157895,...,13.26,9.876487,10.542187,5.581007,9.631253,12.033895,9.789933,39.585148,62.928665,70.3146
base + nonfiction books + newspapers,29.0,45.0,55.18797,44.300144,41.052632,82.142857,64.04641,35.258885,56.45933,56.140351,...,7.91,9.214916,12.578001,5.053153,9.968868,14.401647,17.056241,40.351868,66.425485,69.6265
base + original books,28.222222,44.972222,54.586466,45.310245,36.842105,84.880952,59.148034,31.556056,61.722488,54.385965,...,9.94,11.89692,9.744685,4.879612,8.694367,13.039856,8.766193,39.625931,65.38095,70.1595
base + original books + newspapers,28.0,44.333333,55.037594,45.021645,37.894737,83.809524,61.874001,32.834751,55.980861,57.894737,...,7.93,9.335651,12.646572,4.867005,9.916842,16.442273,16.526474,40.241449,66.374098,69.4104


In [822]:
skills_df = {}
for skill_, metrics in skill.items():
    skills_df[skill_] = report_df[metrics]

report = pd.concat(skills_df.values(), keys=skills_df.keys(),axis=1)
# report.to_csv("./report.tsv", sep="\t")
report

Unnamed: 0_level_0,Sentiment Analysis,Sentiment Analysis,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Reading Comprehension,Reading Comprehension,World Knowledge,World Knowledge,...,Summarization,Translation,Translation,Translation,Translation,Translation,Translation,Linguistic Analysis,Linguistic Analysis,Linguistic Analysis
Unnamed: 0_level_1,norec_sentence_nb,norec_document_nb,mimir_bias,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,nortruthfulqa_gen_nb,norbelebele_nb,norquad_nb,nrk_nb,nrk_nn,...,norsumm_nn,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression,min_max_lix,normalized_inverse_sb
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
extended,48.406816,55.283439,66.343434,52.631579,50.877193,25.181798,30.555556,45.554279,48.972222,59.924812,...,34.610032,14.94887,10.379039,19.718004,10.897911,11.658696,12.50119,40.482552,65.047945,70.0733
base,70.091134,68.98817,63.959596,58.373206,54.385965,23.122931,28.777778,40.589246,45.805556,54.285714,...,27.266934,12.301419,12.860826,13.648272,10.071595,15.992702,24.359707,40.309578,61.109302,70.7573
base + books,70.712175,62.962352,64.0,59.330144,56.140351,27.017284,27.666667,35.58859,44.527778,53.007519,...,27.748225,4.723513,8.785296,10.632341,10.271388,12.757572,8.479467,39.607098,67.60619,69.0641
base + newspapers,57.10778,68.806231,64.444444,47.84689,43.859649,22.856973,30.222222,36.993047,45.833333,55.18797,...,28.687373,28.175257,35.063173,34.42363,36.890106,64.203105,45.307222,40.721587,65.938873,69.4485
base + books + newspapers,65.069465,68.350602,65.131313,58.851675,50.877193,25.753459,27.777778,37.655642,44.916667,54.511278,...,27.597571,4.881404,7.650597,10.229949,10.484609,16.639855,11.799298,40.451438,67.054955,69.4711
base + fiction books,66.134942,63.457872,64.808081,45.454545,43.859649,25.486099,27.666667,38.240878,42.972222,49.473684,...,25.970056,4.704739,5.634942,8.069368,7.531059,9.186173,8.92517,40.133242,78.824808,66.9688
base + nonfiction books,70.098783,62.490437,64.323232,60.76555,63.157895,24.387466,29.0,34.677829,43.833333,53.609023,...,26.410719,5.581007,9.631253,9.876487,10.542187,12.033895,9.789933,39.585148,62.928665,70.3146
base + nonfiction books + newspapers,64.600599,72.135835,63.353535,56.45933,56.140351,24.114753,29.0,37.377047,45.0,55.18797,...,28.75063,5.053153,9.968868,9.214916,12.578001,14.401647,17.056241,40.351868,66.425485,69.6265
base + original books,70.877957,63.503725,62.10101,61.722488,54.385965,25.115272,28.222222,33.495788,44.972222,54.586466,...,25.923147,4.879612,8.694367,11.89692,9.744685,13.039856,8.766193,39.625931,65.38095,70.1595
base + original books + newspapers,62.887731,65.116473,63.959596,55.980861,57.894737,25.670059,28.0,36.178001,44.333333,55.037594,...,26.999041,4.867005,9.916842,9.335651,12.646572,16.442273,16.526474,40.241449,66.374098,69.4104


In [823]:
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in skill.keys()}, axis=1).reindex(report.index)
report_agg['Average'] = report_agg.mean(numeric_only=True, axis=1)
# report_agg.to_csv("./report_skill.tsv", sep="\t")
report_agg

Unnamed: 0_level_0,Sentiment Analysis,Fairness & Truthfulness,Reading Comprehension,World Knowledge,Commonsense Reasoning,Norwegian Syntax,Summarization,Translation,Linguistic Analysis,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
extended,51.845127,48.758501,38.054917,42.166517,41.263006,39.208316,41.000158,13.350618,58.534599,41.575751
base,69.539652,49.960425,34.683512,39.535022,39.849624,30.094974,32.445811,14.87242,57.39206,40.930389
base + books,66.837264,51.621945,31.627628,38.804332,39.590643,44.9036,29.80923,9.274929,58.759129,41.247633
base + newspapers,62.957006,44.751989,33.607635,39.120476,37.786891,50.98496,29.262776,40.677082,58.702986,44.205756
base + books + newspapers,66.710034,50.15341,32.71671,39.348025,41.768057,44.855793,31.862726,10.280952,58.992498,41.854245
base + fiction books,64.796407,44.902094,32.953772,37.502828,41.386041,34.74886,28.412804,7.341908,61.975617,39.335592
base + nonfiction books,66.29461,53.158536,31.838915,38.428162,40.66454,46.652422,29.522049,9.575794,57.609471,41.527166
base + nonfiction books + newspapers,68.368217,50.016992,33.188524,39.107556,42.676388,47.339538,30.782248,11.378804,58.801284,42.406617
base + original books,67.190841,50.831184,30.859005,38.962839,41.076175,46.381261,28.736139,9.503606,58.388794,41.325538
base + original books + newspapers,64.002102,50.876313,32.089,38.829263,41.458191,46.612069,30.669836,11.62247,58.675315,41.648284
