In [1]:
import os
import json
import pandas as pd
from pathlib import Path

output_folder = "./tmp/"

def load_json(fname):
    with open(fname, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

def mkdir(filename):
    path = Path(filename)
    path.parent.mkdir(parents=True, exist_ok=True)
    return filename

# monkeypatching round
def round(value, *args, **kwargs):
    return value

In [2]:
%cd ../mimir_results

/Users/javierr/git/mimir-evaluation-suite/mimir_results


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
zero = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "nortruthfulqa_gen_nb",
    "norquad_nb",
    # "schibsted_vg_nb",
    "ask_gec_nb",
    "norsumm_nb",
    "norsumm_nn",
    "tatoeba_eng_nno_nn",
    "tatoeba_nno_eng_nn",
    "tatoeba_eng_nob_nb",
    "tatoeba_nob_eng_nb",
    "tatoeba_nob_nno_nb",
    "tatoeba_nno_nob_nn",
]

In [4]:
prompts = {
    "norquad_nb": {
        "prompt-0": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-1": 'Tittel: {title}\n\nTekst: {passage}\n\nGitt teksten over, hva er svaret på følgende spørsmål? "{question}"\n\nSvar:',
        "prompt-2": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-3": 'Tittel: {title}\n\nTekst: {passage}\n\nHvordan kan man svare på spørsmålet "{question}", gitt teksten over?\n\nSvar:',
        "prompt-4": 'Tittel: {title}\n\nTekst:{passage}\n\nGitt teksten over, besvar følgende spørsmål: "{question}"\n\nSvar:',
    }
}

task2metric = {
    "mimir_bias": ["pct_stereotype", "likelihood_diff"],
    "ncb": ["acc"],
    "norec_sentence_nb": ["acc", "f1"],
    "norec_document_nb": ["acc", "f1"],
    "tapaco_no_detection_nb": ["acc"],
    "norbelebele_nb": ["acc"],
    "nrk_nb": ["acc"],
    "nrk_nn": ["acc"],
    "noropenbookqa_nb": ["acc"],
    "noropenbookqa_nn": ["acc"],
    "noropenbookqa_nb_use_fact": ["acc"],
    "noropenbookqa_nn_use_fact": ["acc"],
    "norcommonsenseqa_nn": ["acc"],
    "norcommonsenseqa_nb": ["acc"],
    "nortruthfulqa_mc_nb": ["acc"],
    "nortruthfulqa_mc_nn": ["acc"],
    "norquad_nb": ["exact_match", "f1"],
    "noridiom_nb": ["em", "fscore"],
    "noridiom_nn": ["em", "fscore"],
    "norsumm_nb": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "norsumm_nn": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "nortruthfulqa_gen_nb": ["bleu_max", "rougeL_max"],
    # "schibsted_vg_nb": ["bleu", "chrf"],
    "ask_gec_nb": ["errant"],
    "tatoeba_eng_nno_nn": ["bleu", "chrf"],
    "tatoeba_nno_eng_nn": ["bleu", "chrf"],
    "tatoeba_eng_nob_nb": ["bleu", "chrf"],
    "tatoeba_nob_eng_nb": ["bleu", "chrf"],
    "tatoeba_nob_nno_nb": ["bleu", "chrf"],
    "tatoeba_nno_nob_nn": ["bleu", "chrf"],
}


def pretty_metric(
    task,
    metric_name,
    score,
    metric_list=[
        "f1",
        "acc",
        "pct_stereotype",
        "acc_norm",
        "em",
        "fscore",
        "bertscore_f1_avg",
        "bertscore_f1_max",
    ],
):
    pretty_metric_name = metric_name.replace(",none", "")
    pretty_metric_score = (
        round(score * 100, 3) if pretty_metric_name in metric_list else round(score, 3)
    )
    if task == "norquad_nb":
        pretty_metric_score = round(score, 3)
    return pretty_metric_name, pretty_metric_score


def collect_task_ranking_results(
    task,
    k=0,
    ignore_models=["gpt-sw3-6.7b"],
    ignore_metrics=[
        "alias",
        "bleu_acc",
        "bleu_diff",
        "rouge1_acc",
        "rouge1_diff",
        "rouge2_max",
        "rouge2_acc",
        "rouge2_diff",
        "rougeL_acc",
        "rougeL_diff",
    ],
    verbose=True,
    columns=["task", "model", "k-shot"],
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = model_res["results"][task]
            model_name = model_res["config"]["model_args"].split(",")[0].split("/", 1)[-1]
            curr_configuration_res = [task, model_name, k]
            for metric_name, score in model_res_scores.items():
                if "stderr" in metric_name or metric_name in ignore_metrics:
                    continue
                pretty_metric_name, pretty_metric_score = pretty_metric(
                    task=task, metric_name=metric_name, score=score
                )
                if pretty_metric_name not in res_columns:
                    res_columns.append(pretty_metric_name)
                curr_configuration_res.append(pretty_metric_score)
            res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)


def collect_task_prompt_results(
    task,
    k,
    ignore_models=["gpt-sw3-6.7b"],
    verbose=True,
    columns=["task", "model", "prompt", "k-shot"],
    prompts=prompts,
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if not os.path.exists(model_res_fpath):
                continue
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = {
                prompt_name: prompt_res
                for prompt_name, prompt_res in model_res["results"].items()
                if prompt_name != task
            }
            for configuration_name, configuration_res in model_res_scores.items():
                try:
                    prompt = (
                        prompts[task][configuration_name]
                        if task in prompts
                        else model_res["configs"][configuration_name]["doc_to_text"]
                    )
                except Exception as exc:
                    print(model_res["configs"].keys(), configuration_name, task, model_res_fpath)
                    raise exc
                model_name = model_res["config"]["model_args"].split(",")[0].split("/", 1)[-1]
                curr_configuration_res = [task, model_name, prompt, k]
                for metric_name, score in configuration_res.items():
                    if "stderr" in metric_name or metric_name == "alias":
                        continue
                    pretty_metric_name, pretty_metric_score = pretty_metric(
                        task=task, metric_name=metric_name, score=score
                    )
                    if pretty_metric_name not in res_columns:
                        res_columns.append(pretty_metric_name)
                    curr_configuration_res.append(pretty_metric_score)
                res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)

In [5]:
collect_task_prompt_results(task="norbelebele_nb", k=0).model.unique()

norbelebele_nb/0-shot/mimir-project/mimir-7b-fiction/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-rightholders/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-extended-scratch-instruct/results.json
norbelebele_nb/0-shot/mimir-project/mimir-mistral-7b-base/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-factual/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-extended-instruct/results.json
norbelebele_nb/0-shot/mimir-project/mimir-mistral-7b-extended-scratch/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-translated/results.json
norbelebele_nb/0-shot/mimir-project/mimir-mistral-7b-extended/results.json
norbelebele_nb/0-shot/mimir-project/mimir-mistral-7b-base-instruct/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-untranslated/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-books/results.json
norbelebele_nb/0-shot/mimir-project/mimir-7b-untranslated-withnewspapers/results.json
norbelebele_nb/0-shot/mimir-project/Mi

array(['mimir-7b-fiction', 'mimir-7b-rightholders',
       'mimir-mistral-7b-extended-scratch-instruct',
       'mimir-mistral-7b-base', 'mimir-7b-factual',
       'mimir-mistral-7b-extended-instruct',
       'mimir-mistral-7b-extended-scratch', 'mimir-7b-translated',
       'mimir-mistral-7b-extended', 'mimir-mistral-7b-base-instruct',
       'mimir-7b-untranslated', 'mimir-7b-books',
       'mimir-7b-untranslated-withnewspapers', 'Mistral-7B-v0.1',
       'mimir-7b-nonfiction', 'mimir-mistral-7b-base-scratch',
       'mimir-7b-newspapers', 'mimir-mistral-7b-base-scratch-instruct'],
      dtype=object)

In [6]:
zero_shot = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
#    "tapaco_no_detection_nb",
    "norsumm_nb",
    "norsumm_nn",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "norquad_nb",
    "nortruthfulqa_gen_nb",
]

In [7]:
from functools import reduce


overall = {
    task: (
        collect_task_prompt_results(task, k=0, verbose=False)
        if task not in ["mimir_bias", "ncb"]
        else collect_task_ranking_results(task, k=0, verbose=False)
    )
    for task in zero_shot
}

In [8]:
def merge_ranking_results(tasks, overall=overall, task2metric=task2metric, on="model"):
    res = {
        task_name: overall[task_name].rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }
    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any([name in col for name in ["task_", "shot_"]])
        ]
    ]
    return df


def aggregate_df(df, task, task2metric, select_best):
    task_res = []
    task_columns = ["model"] + task2metric[task]
    for model, subset in df.groupby("model"):
        model_res = [model]
        for metric in task2metric[task]:

            agg_res = dict(subset[metric].describe())
            if select_best:
                model_res.append(round(agg_res["max"], 2))
            else:
                model_res.append(
                    f"{round(agg_res['mean'], 2)} ± {round(agg_res['std'], 1)} [{round(agg_res['max'], 2)}]"
                )
        task_res.append(model_res)
    return pd.DataFrame(task_res, columns=task_columns)


def merge_task_prompt_results(
    tasks, overall=overall, task2metric=task2metric, select_best=True, on="model"
):
    res = {
        task_name: aggregate_df(
            overall[task_name], task_name, task2metric, select_best
        ).rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }

    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any(
                [
                    name in col
                    for name in [
                        "task_",
                        "shot_",
                        "sentence_nb (acc",
                        "document_nb (acc",
                        "acc_norm",
                    ]
                ]
            )
        ]
    ]
    return df

In [9]:
canonical_order = [
    "mimir-mistral-7b-extended-scratch",
    "mimir-mistral-7b-base-scratch",
    "mimir-7b-books",
    "mimir-7b-newspapers",
    "mimir-7b-rightholders",
    "mimir-7b-fiction",
    "mimir-7b-nonfiction",
    "mimir-7b-factual",
    "mimir-7b-untranslated",
    "mimir-7b-untranslated-withnewspapers",
    "mimir-7b-translated",
    "mimir-mistral-7b-extended",
    "mimir-mistral-7b-base",
    "Mistral-7B-v0.1",
    "mimir-mistral-7b-extended-scratch-instruct",
    "mimir-mistral-7b-base-scratch-instruct",
    "mimir-mistral-7b-extended-instruct",
    "mimir-mistral-7b-base-instruct",
]

skill = {
    "Sentiment Analysis": ["norec_sentence_nb", "norec_document_nb"],
    "Fairness & Truthfulness": [
        "mimir_bias",
        "nortruthfulqa_mc_nb",
        "nortruthfulqa_mc_nn",
        "nortruthfulqa_gen_nb",
    ],
    "Reading Comprehension": ["norbelebele_nb", "norquad_nb"],
    "World Knowledge": [
        "nrk_nb",
        "nrk_nn",
        "noropenbookqa_nb",
        "noropenbookqa_nn",
        "noropenbookqa_nb_use_fact",
        "noropenbookqa_nn_use_fact",
    ],
    "Commonsense Reasoning": ["norcommonsenseqa_nb", "norcommonsenseqa_nn"],
    "Norwegian Language": [
        "ncb",
        "ask_gec_nb",
        "noridiom_nb",
        "noridiom_nn",
    ],
    "Summarization": ["norsumm_nb", "norsumm_nn"],
    "Translation": [
        "tatoeba_eng_nno_nn",
        "tatoeba_nno_eng_nn",
        "tatoeba_eng_nob_nb",
        "tatoeba_nob_eng_nb",
        "tatoeba_nob_nno_nb",
        "tatoeba_nno_nob_nn",
    ],
    "Variation & Readability": [
        "inverse_compression_nob", "min_max_lix_nob", "inverse_sb_nob",
        "inverse_compression_nno", "min_max_lix_nno", "inverse_sb_nno",
    ]
    # "Headline generation": ["schibsted_vg_nb"],
}
language = {
    "Bokmål": [
        # "norec_sentence_nb",
        # "norec_document_nb",
        # "nortruthfulqa_gen_nb",
        "nortruthfulqa_mc_nb",
        # "norbelebele_nb",
        # "norquad_nb"
        "nrk_nb",
        "noropenbookqa_nb",
        "noropenbookqa_nb_use_fact",
        "norcommonsenseqa_nb",
        # "ask_gec_nb",
        "noridiom_nb",
        "norsumm_nb",
        "tatoeba_eng_nob_nb",
        "tatoeba_nob_eng_nb",
        "tatoeba_nob_nno_nb",
        "inverse_compression_nob",
        "min_max_lix_nob",
        "inverse_sb_nob",
    ],
    "Nynorsk": [
        "nortruthfulqa_mc_nn",
        "nrk_nn",
        "noropenbookqa_nn",
        "noropenbookqa_nn_use_fact",
        "norcommonsenseqa_nn",
        # "ncb",
        "noridiom_nn",
        "norsumm_nn",
        "tatoeba_eng_nno_nn",
        "tatoeba_nno_eng_nn",
        "tatoeba_nno_nob_nn",
        "inverse_sb_nno",
        "inverse_compression_nno",
        "min_max_lix_nno",
    ]
    # "Headline generation": ["schibsted_vg_nb"],
}
behaviours = {
    "Generation": [
        "ask_gec_nb",
        "noridiom_nb",
        "noridiom_nn",
        "norsumm_nb",
        "norsumm_nn",
        "nortruthfulqa_gen_nb",
        "tatoeba_eng_nno_nn",
        "tatoeba_nno_eng_nn",
        "tatoeba_eng_nob_nb",
        "tatoeba_nob_eng_nb",
        "tatoeba_nob_nno_nb",
        "tatoeba_nno_nob_nn",
        "inverse_compression_nob",
        "min_max_lix_nob",
        "inverse_sb_nob",
        "inverse_compression_nno",
        "min_max_lix_nno",
        "inverse_sb_nno",
    ], "Classification": [
        "norec_sentence_nb",
        "norec_document_nb",
    ], "Question Answering": [
        "norbelebele_nb",
        "norcommonsenseqa_nb",
        "norcommonsenseqa_nn",
        "noropenbookqa_nb",
        "noropenbookqa_nn",
        "noropenbookqa_nb_use_fact",
        "noropenbookqa_nn_use_fact",
        "norquad_nb",
        "nortruthfulqa_mc_nb",
        "nortruthfulqa_mc_nn",
        "nrk_nb",
        "nrk_nn",
    ], "Ranking": [
        "mimir_bias",
        "ncb",
    ]
}

In [10]:
beautify_columns = {
    "model": "Model",
    # "norec_sentence_nb (f1)": "NoReC",
    # "norec_document_nb (f1)": "NoReC",
}


def pretty_model(model_name):
    model_d = {
        "mimir-mistral-7b-base": "base (warm)",
        "mimir-mistral-7b-extended": "extended (warm)",
        "mimir-7b-fiction": "base + fiction books",
        "mimir-7b-nonfiction": "base + nonfiction books",
        "mimir-7b-factual": "base + nonfiction books + newspapers",
        "mimir-7b-newspapers": "base + newspapers",
        "mimir-7b-books": "base + books",
        "mimir-7b-rightholders": "base + books + newspapers",
        "mimir-7b-untranslated-withnewspapers": "base + original books + newspapers",
        "mimir-7b-untranslated": "base + original books",
        "mimir-7b-translated": "base + translated books",
        "mimir-mistral-7b-base-scratch": "base",
        "mimir-mistral-7b-extended-scratch": "extended",
        "mimir-mistral-7b-extended-scratch-instruct": "extended instruct",
        "mimir-mistral-7b-base-scratch-instruct": "base instruct",
        "mimir-mistral-7b-extended-instruct": "extended (warm) instruct",
        "mimir-mistral-7b-base-instruct": "base (warm) instruct",
        "Mistral-7B-v0.1": "Mistral 7B v0.1",
    }
    # mimir, conf = model_name.replace("-mistral-", "").split("7b")
    # mimir = mimir.replace("mimir", "\textsc{mimir}")
    # pretty_name = f"{mimir}$_\text" + "{" + f"{conf.strip(' -')}".replace("\text", "\text{") + "}$"
    return model_d.get(model_name, model_name)
    # return pretty_name

all_dfs = []
all_tasks = []

def aggregate_by_skill(
    task,
    model_order=canonical_order,
    select_best=True,
    skill=skill,
    add_baselines=[],
    add_k=0,
    overall=overall,
    task2metric=task2metric,
    base_model="mimir-mistral-7b-base-scratch",
    target_metric="f1",
    beautify_columns=beautify_columns,
    append_to_all_dfs=True,
):
    df = merge_task_prompt_results(
        [task], select_best=select_best, overall=overall, task2metric=task2metric
    )
    df = df[df["model"].isin(model_order)]
    # print(f"Task: {task}; Num rows: {df.shape[0]}")
    reference_score = {
        task: score.item()
        for task, score in dict(df[df["model"] == base_model]).items()
        if task != "model"
    }
    if add_baselines:
        df = pd.concat([df, pd.DataFrame(add_baselines, columns=df.columns.tolist())])

    model_order = model_order.copy() + [
        baseline_name for baseline_name, _ in add_baselines
    ]
    ascending = False if "mimir_bias" not in task else True
    model_rank = {
        model: i + 1
        for i, model in enumerate(
            df.sort_values(
                f"{task} ({target_metric})", ascending=ascending
            ).model.tolist()
        )
    }
    df["Rank"] = df["model"].apply(lambda x: model_rank[x])
    df = df.set_index("model").loc[model_order]
    agg, agg_cols = [], [
        "Rank",
        "Model",
        f"{task} ({target_metric})",
        f"delta ({target_metric})",
    ]
    for model_name, row in df.iterrows():
        row_res = [int(row["Rank"])]
        if model_name.startswith("mimir"):
            row_res.append(pretty_model(model_name))
            for task_name, ref in reference_score.items():
                if task_name != f"{task} ({target_metric})":
                    continue
                if model_name == base_model:
                    row_res.append(ref)
                    row_res.append("xmark")
                elif (
                    model_name.endswith("scratch")
                    or model_name.endswith("instruct")
                    or ("base" in model_name and model_name != base_model)
                    or "extended" in model_name
                ):
                    row_res.append(row[task_name])
                    row_res.append("xmark")
                else:
                    model_conf_score = row[task_name]
                    row_res.append(model_conf_score)
                    delta = round(model_conf_score - ref, 1)
                    if delta > 0:
                        row_res.append(f"+{delta}")
                    else:
                        row_res.append(f"-{delta}")
        else:
            row_res.extend([pretty_model(model_name), row[f"{task} ({target_metric})"], "xmark"])
        agg.append(row_res)
    agg_df = pd.DataFrame(agg, columns=agg_cols)
    if add_k is not None:
        agg_df["k"] = add_k
        agg_df = agg_df[
            [
                "Rank",
                "Model",
                "k",
                f"{task} ({target_metric})",
                f"delta ({target_metric})",
            ]
        ]
    agg_df.rename(columns=beautify_columns, inplace=True)
    agg_df = agg_df.set_index("Rank")
    if append_to_all_dfs:
        global all_dfs, all_tasks
        all_dfs.append(agg_df)
        all_tasks.append(task)
    return agg_df


def print_latex_df(df):
    print(
        df.to_latex()
        .replace("@", "\\")
        .replace("xmark", "\\xmark")  # .replace("$delta$", "delta")
    )

### Single-shot tasks

In [11]:
task = "norbelebele_nb"
add_baselines = [
    ["Random", 25.00],
]
norbelebele_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norbelebele_nb.to_csv(mkdir(f"{output_folder}/qa/norbelebele_nb.tsv"), sep="\t", index=False)
# print_latex_df(norbelebele_nb)

In [12]:
task = "nrk_nb"
add_baselines = [["Random", 27.91], ["Constant", 30.97]]
nrk_nb = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
nrk_nb.to_csv(mkdir(f"{output_folder}/qa/nrk_nb.tsv"), sep="\t", index=False)
# print_latex_df(nrk_nb)

In [13]:
task = "nrk_nn"
add_baselines = [["Random", 26.76], ["Constant", 30.45]]
nkr_nn = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
nkr_nn.to_csv(mkdir(f"{output_folder}/qa/nrk_nn.tsv"), sep="\t", index=False)
# print_latex_df(nrk_nn)

In [14]:
task = "norcommonsenseqa_nb"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norcommonsenseqa_nb.to_csv(mkdir(
    f"{output_folder}/qa/norcommonsenseqa_nb.tsv"), sep="\t", index=False
)
# print_latex_df(norcommonsenseqa_nb)

task = "norcommonsenseqa_nn"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nn = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
norcommonsenseqa_nn.to_csv(mkdir(
    f"{output_folder}/qa/norcommonsenseqa_nn.tsv"), sep="\t", index=False
)
# print_latex_df(aggregate_by_skill(task, add_baselines=[], target_metric="acc"))

In [15]:
canonical_order, len(canonical_order)

(['mimir-mistral-7b-extended-scratch',
  'mimir-mistral-7b-base-scratch',
  'mimir-7b-books',
  'mimir-7b-newspapers',
  'mimir-7b-rightholders',
  'mimir-7b-fiction',
  'mimir-7b-nonfiction',
  'mimir-7b-factual',
  'mimir-7b-untranslated',
  'mimir-7b-untranslated-withnewspapers',
  'mimir-7b-translated',
  'mimir-mistral-7b-extended',
  'mimir-mistral-7b-base',
  'Mistral-7B-v0.1',
  'mimir-mistral-7b-extended-scratch-instruct',
  'mimir-mistral-7b-base-scratch-instruct',
  'mimir-mistral-7b-extended-instruct',
  'mimir-mistral-7b-base-instruct'],
 18)

In [16]:
merge_task_prompt_results(
        ["ncb"], select_best=True, overall=overall, task2metric=task2metric
    )

Unnamed: 0,model,ncb (acc)
0,Mistral-7B-v0.1,72.261905
1,mimir-7b-books,83.809524
2,mimir-7b-factual,82.142857
3,mimir-7b-fiction,83.928571
4,mimir-7b-newspapers,81.071429
5,mimir-7b-nonfiction,84.166667
6,mimir-7b-rightholders,84.047619
7,mimir-7b-translated,83.809524
8,mimir-7b-untranslated,84.880952
9,mimir-7b-untranslated-withnewspapers,83.809524


In [17]:
task = "ncb"
add_baselines = [
    ["Random", 50.00],
]
ncb = aggregate_by_skill(task, add_baselines=[], target_metric="acc")
ncb.to_csv(mkdir(f"{output_folder}/ranking/ncb.tsv"), sep="\t", index=False)
# print_latex_df(ncb)

In [18]:
task = "noridiom_nb"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=[], target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=[], target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nb = (
    em[["Rank", "Model", "noridiom_nb (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nb.to_csv(mkdir(f"{output_folder}/generation/noridiom_nb.tsv"), sep="\t", index=False)
# print_latex_df(noridiom_nb)

In [19]:
noridiom_nb

Unnamed: 0_level_0,Model,noridiom_nb (em),delta (em),k,noridiom_nb (fscore),delta (fscore)
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
13,extended,25.124378,xmark,0,36.78422,xmark
17,base,12.686567,xmark,0,24.146826,xmark
8,base + books,48.756219,+36.069651741293534,0,57.445196,+33.29837020684245
1,base + newspapers,69.402985,+56.71641791044777,0,74.910771,+50.76394449858004
5,base + books + newspapers,51.492537,+38.80597014925374,0,59.057058,+34.910231949301235
16,base + fiction books,16.169154,+3.48258706467662,0,26.930504,+2.7836777244152273
9,base + nonfiction books,48.00995,+35.32338308457712,0,57.685313,+33.538486640988715
3,base + nonfiction books + newspapers,56.965174,+44.278606965174134,0,64.04641,+39.89958408417572
7,base + original books,49.751244,+37.06467661691543,0,59.148034,+35.00120786117262
4,base + original books + newspapers,54.477612,+41.791044776119406,0,61.874001,+37.72717474539235


In [20]:
task = "noridiom_nn"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=[], target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=[], target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nn = (
    em[["Rank", "Model", "noridiom_nn (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nn.to_csv(mkdir(f"{output_folder}/generation/noridiom_nn.tsv"), sep="\t", index=False)
# print_latex_df(noridiom_nn)

In [21]:
task = "nortruthfulqa_mc_nb"
add_baselines = [["Random", 27.27]]
nortruthfulqa_mc_nb = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
nortruthfulqa_mc_nb.to_csv(mkdir(
    f"{output_folder}/qa/nortruthfulqa_mc_nb.tsv"), sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nb)

In [22]:
task = "nortruthfulqa_mc_nn"
add_baselines = [["Random", 24.56]]
nortruthfulqa_mc_nn = aggregate_by_skill(
    task, add_baselines=[], target_metric="acc"
)
nortruthfulqa_mc_nn.to_csv(mkdir(
    f"{output_folder}/qa/nortruthfulqa_mc_nn.tsv"), sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nn)

In [23]:
import numpy as np
from operator import itemgetter


def build_ranks(df):
    ranks = {}
    for i, row in df.iterrows():
        rank = np.mean([row[c] for c in dict(row) if "Rank" in c])
        ranks[row["Model"]] = round(rank, 3)
    inverse = {}
    counter = 0
    for model, rank in sorted(ranks.items(), key=itemgetter(1)):
        counter += 1
        if rank not in inverse:
            inverse[rank] = counter
        else:
            continue
    return {model: inverse[rank] for model, rank in ranks.items()}


task = "nortruthfulqa_gen_nb"

bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

trthflqa_rank = build_ranks(bleu_max.merge(rougeL_max))
trhtflqa = []
cols = []

for i, row in bleu_max.merge(rougeL_max).iterrows():

    model_res = []

    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("nortruthfulqa_gen_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    trhtflqa.append(model_res)

df = pd.DataFrame(trhtflqa, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: trthflqa_rank[x])
df = df.set_index("Rank")
df.to_csv(mkdir(f"{output_folder}/generation/nortruthfulqa_gen_nb.tsv"), sep="\t", index=False)
# print_latex_df(df)

In [24]:
task = "mimir_bias"
target_metric = "pct_stereotype"
mimir_bias = aggregate_by_skill(task, add_baselines=[], target_metric=target_metric)
mimir_bias.to_csv(mkdir(f"{output_folder}/ranking/mimir_bias.tsv"), sep="\t", index=False)
# The ideal bias score is 50%
mimir_bias[f"{task} ({target_metric})"] = mimir_bias[f"{task} ({target_metric})"].apply(lambda x: 100 * 50 / x)
# print_latex_df(mimir_bias)

In [25]:
task = "norsumm_nb"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df
# df.to_csv(mkdir(f"{output_folder}/generation/norsumm_nb.tsv"), sep="\t", index=False)
# print_latex_df(df)

Unnamed: 0_level_0,Model,k,bleu,delta bleu,rougeL,delta rougeL,bertscore_f1,delta bertscore_f1
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,extended,0,26.014963,xmark,47.390284,xmark,73.762794,xmark
8,base,0,19.230192,xmark,37.624689,xmark,71.399814,xmark
12,base + books,0,14.750003,--4.480189090543133,31.870236,--5.754452750153831,66.903302,--4.496512413024902
14,base + newspapers,0,12.5153,--6.71489166423733,29.838179,--7.786510230948213,68.843894,--2.555920282999665
9,base + books + newspapers,0,17.859253,--1.370938231442583,36.127881,--1.496807684616364,68.574436,--2.8253776828447883
15,base + fiction books,0,14.18932,--5.04087206254378,30.855552,--6.7691371094723785,65.949312,--5.450502435366303
12,base + nonfiction books,0,15.151113,--4.079078914193223,32.633379,--4.991309806325951,65.876243,--5.523570875326783
11,base + nonfiction books + newspapers,0,15.219038,--4.011153669864921,32.813865,--4.81082335495843,66.931845,--4.467969536781311
15,base + original books,0,14.351724,--4.87846777175554,31.549131,--6.075557610495949,65.770862,--5.628951688607529
10,base + original books + newspapers,0,15.789491,--3.4407010278343932,34.340631,--3.2840573657549257,68.342594,--3.0572197834650723


In [26]:
task = "norsumm_nn"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nn (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df.to_csv(mkdir(f"{output_folder}/generation/norsumm_nn.tsv"), sep="\t", index=False)
# print_latex_df(df)

### Multi-shot tasks

In [27]:
task = "norec_document_nb"

add_baselines = [["Random", 48.43], ["Constant", 40.12]]

ks = [0, 1]

norec_document_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_document_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], overall=norec_document_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [28]:
from collections import deque


def reogranize(
    df, pretty_order=[pretty_model(m) for m in canonical_order], add_baselines=[]
):
    res_df = df.copy()
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for model in pretty_order:
        subset = res_df[res_df["Model"] == model]
        if model in ["Random", "Constant"]:
            subset["k"] = "xmark"
            res.append(pd.DataFrame([subset.iloc[0]]))
        else:
            res.append(subset)
    return pd.concat(res)


def reogranize_by_k(
    df,
    change_rank=True,
    change_cols=False,
    pretty_order=[pretty_model(m) for m in canonical_order],
    add_baselines=[],
):
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for i, subset in df.groupby("k"):
        if change_rank:
            subset["Rank"] = subset.index.tolist()
        for model in pretty_order:
            k_subset = subset[subset["Model"] == model]
            if model != "@factual":
                k_subset["k"] = ""
            res.append(k_subset)
    res_df = pd.concat(res)  # .drop_duplicates(subset=["k", "Model", "Rank"])
    res_df = res_df.set_index("k")  # .drop_duplicates()
    res_cols = res_df.columns.tolist()
    if change_cols and res_cols[0] != "Rank":
        new_columns = deque(res_cols)
        new_columns.rotate(1)
        return res_df[list(new_columns)].reset_index()
    return res_df

In [29]:
# reogranize_by_k(norec_document_nb_df, add_baselines=["Random", "Constant"]).to_csv(mkdir(
reogranize_by_k(norec_document_nb_df, add_baselines=[]).to_csv(mkdir(
    f"{output_folder}/clf/norec_document_nb.tsv"), sep="\t", index=False
)
# print_latex_df()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [30]:
task = "norec_sentence_nb"

add_baselines = [["Random", 48.52], ["Constant", 40.75]]

ks = [0, 1, 4, 16]

norec_sentence_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_sentence_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], overall=norec_sentence_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [31]:
p1 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([0, 1])],
    add_baselines=[],  #["Random", "Constant"],
)
p2 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([4, 16])],
    add_baselines=[],  # ["Random", "Constant"],
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [32]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/clf/norec_sentence_nb.tsv"), sep="\t", index=False
)

In [33]:
# print_latex_df(p1)

In [34]:
# print_latex_df(p2)

In [35]:
task = "norquad_nb"

ks = [0, 1]

norquad_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norquad_em_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="exact_match",
            overall=norquad_nb[k],
            add_k=k,
        )
        for k in ks
    ]
)

norquad_f1_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], target_metric="f1", overall=norquad_nb[k], add_k=k
        )
        for k in ks
    ]
)

norquad_em_df["Rank"] = norquad_em_df.index.tolist()

norquad_nb_df = (
    norquad_em_df[
        ["Rank", "Model", "k", "norquad_nb (exact_match)", "delta (exact_match)"]
    ]
    .merge(norquad_f1_df, on=["Model", "k"])
    .set_index("Rank")
)

# print_latex_df(reogranize_by_k(norquad_nb_df, add_baselines=[]))

In [36]:
reogranize_by_k(norquad_nb_df, add_baselines=[]).to_csv(mkdir(
    f"{output_folder}/qa/norquad_nb.tsv"), sep="\t", index=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [37]:
task = "noropenbookqa_nb"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nb[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [38]:
p1 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([0, 1])],
    add_baselines=[],  # ["Random", "Constant"],
)
# print_latex_df(p1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [39]:
p2 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([4, 16])],
    add_baselines=[],  # ["Random", "Constant"],
)
# print_latex_df(p2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [40]:
pd.concat([p1, p2]).to_csv(mkdir(f"{output_folder}/qa/noropenbookqa_nb.tsv"), sep="\t", index=False)

In [41]:
task = "noropenbookqa_nn"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nn[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [42]:
p1 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([0, 1])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [43]:
p2 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([4, 16])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  k_subset["k"] = ""


In [44]:
pd.concat([p1, p2]).to_csv(mkdir(f"{output_folder}/qa/noropenbookqa_nn.tsv"), sep="\t", index=False)

In [45]:
k = 16
ignore_models=["gpt-sw3-6.7b"]
verbose=False
columns=["task", "model", "prompt", "k-shot"]

res = []
res_fdir = f"{task}/{k}-shot"
res_columns = columns.copy()
for model_organization in os.listdir(res_fdir):
    model_fdir = os.path.join(res_fdir, model_organization)
    for model in os.listdir(model_fdir):
        if model in ignore_models:
            continue
        model_res_fpath = os.path.join(model_fdir, model, "results.json")
        if not os.path.exists(model_res_fpath):
            continue
        if verbose:
            print(model_res_fpath)
        model_res = load_json(model_res_fpath)
        model_res_scores = {
            prompt_name: prompt_res
            for prompt_name, prompt_res in model_res["results"].items()
            if prompt_name != task
        }
        for configuration_name, configuration_res in model_res_scores.items():
            try:
                prompt = (
                    prompts[task][configuration_name]
                    if task in prompts
                    else model_res["configs"][configuration_name]["doc_to_text"]
                )
            except Exception as exc:
                print(configuration_name, configuration_res)
                raise exc

            model_name = model_res["config"]["model_args"].split(",")[0].split("/", 1)[-1]
            curr_configuration_res = [task, model_name, prompt, k]
            for metric_name, score in configuration_res.items():
                if "stderr" in metric_name or metric_name == "alias":
                    continue
                pretty_metric_name, pretty_metric_score = pretty_metric(
                    task=task, metric_name=metric_name, score=score
                )
                if pretty_metric_name not in res_columns:
                    res_columns.append(pretty_metric_name)
                curr_configuration_res.append(pretty_metric_score)
            res.append(curr_configuration_res)
pd.DataFrame(res, columns=res_columns)

Unnamed: 0,task,model,prompt,k-shot,acc,acc_norm
0,noropenbookqa_nn,mimir-7b-fiction,question_stem,16,24.444444,33.333333
1,noropenbookqa_nn,mimir-7b-fiction,{{question_stem}}\n\nSvaralternativer:\n- {{ch...,16,20.000000,24.444444
2,noropenbookqa_nn,mimir-7b-fiction,{{question_stem}}\nA: {{choices.text[0]}}\nB: ...,16,26.666667,26.666667
3,noropenbookqa_nn,mimir-7b-fiction,Spørsmål: {{question_stem}}\nA: {{choices.text...,16,27.777778,27.777778
4,noropenbookqa_nn,mimir-7b-fiction,{{question_stem}}\nVel rett svar blant desse a...,16,17.777778,26.666667
...,...,...,...,...,...,...
85,noropenbookqa_nn,mimir-mistral-7b-base-scratch-instruct,question_stem,16,23.333333,37.777778
86,noropenbookqa_nn,mimir-mistral-7b-base-scratch-instruct,{{question_stem}}\n\nSvaralternativer:\n- {{ch...,16,22.222222,25.555556
87,noropenbookqa_nn,mimir-mistral-7b-base-scratch-instruct,{{question_stem}}\nA: {{choices.text[0]}}\nB: ...,16,17.777778,17.777778
88,noropenbookqa_nn,mimir-mistral-7b-base-scratch-instruct,Spørsmål: {{question_stem}}\nA: {{choices.text...,16,17.777778,17.777778


In [46]:
model_res["configs"].keys(), configuration_name, task, model_res_fpath

(dict_keys(['prompt-0', 'prompt-1', 'prompt-2', 'prompt-3', 'prompt-4']),
 'prompt-4',
 'noropenbookqa_nn',
 'noropenbookqa_nn/16-shot/mimir-project/mimir-mistral-7b-base-scratch-instruct/results.json')

In [47]:
task = "noropenbookqa_nb_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=True)} for k in ks
}

noropenbookqa_nb_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nb_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-7b-fiction/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-7b-rightholders/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-base/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-extended-scratch-instruct/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-7b-factual/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-extended-scratch/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-7b-translated/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-extended/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-base-instruct/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-7b-untranslated/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimir-mistral-7b-extended-instruct/results.json
noropenbookqa_nb_use_fact/0-shot/mimir-project/mimi

In [48]:
import warnings

warnings.filterwarnings("ignore")

In [49]:
p1 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [50]:
p2 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [51]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/qa/noropenbookqa_nb_use_fact.tsv"), sep="\t", index=False
)

In [52]:
task = "noropenbookqa_nn_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=noropenbookqa_nn_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [53]:
p1 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [54]:
p2 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [55]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/qa/noropenbookqa_nn_use_fact.tsv"), sep="\t", index=False
)

In [56]:
# 260
task = "ask_gec_nb"
ask_gec_res = pd.read_csv("ask_gec_nb/ask_gec_k_shot.tsv", sep="\t").rename(
    columns={"k": "k-shot"}
)
ask_gec_instruct = pd.read_csv("ask_gec_nb/ask_gec_nb_instruct.tsv", sep="\t").rename(
    columns={"k": "k-shot"}
)
ask_gec = pd.concat([ask_gec_res, ask_gec_instruct])
ask_gec["task"] = task

ask_gec = ask_gec[["task", "model", "prompt", "k-shot", "errant"]]

ask_gec_overall = {k: {task: subset} for k, subset in ask_gec.groupby("k-shot")}

ask_gec_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=ask_gec_overall[k],
            add_k=k,
            target_metric="errant",
        )
        for k in ask_gec_overall.keys()
    ]
)

In [57]:
p1 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([0, 1])], add_baselines=[], change_cols=True
)
# print_latex_df(p1)

In [58]:
p2 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([1, 16])], add_baselines=[], change_cols=True
)
# print_latex_df()

In [59]:
pd.concat([p1, p2]).to_csv(mkdir(
    f"{output_folder}/generation/ask_gec_nb.tsv"), sep="\t", index=False
)

In [60]:
# bertscore_vg = pd.read_csv("bertscore_vg_k_shot.tsv", sep="\t")

In [61]:
# bertscore_vg.sample(5)

In [62]:
# task = "schibsted_vg_nb"

# ks = [0, 1]

# schibsted_vg_nb = {
#     k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
# }

# schibsted_vg_nb_bs = {k: {task: subset} for k, subset in bertscore_vg.groupby("k")}

In [63]:
# schibsted_vg_nb[0]["schibsted_vg_nb"].sample(2)

In [64]:
# schibsted_vg_nb_bs[0]["schibsted_vg_nb"].sample(2)

In [65]:
task2metric_bs = {
#    "schibsted_vg_nb": ["bertscore_f1"],
    "tatoeba_eng_nno_nn": ["bertscore_f1"],
    "tatoeba_nno_eng_nn": ["bertscore_f1"],
    "tatoeba_eng_nob_nb": ["bertscore_f1"],
    "tatoeba_nob_eng_nb": ["bertscore_f1"],
    "tatoeba_nob_nno_nb": ["bertscore_f1"],
    "tatoeba_nno_nob_nn": ["bertscore_f1"],
}

In [66]:
# schibsted_res = {}
# target_cols = [
#     "Rank",
#     "Model",
#     "bleu",
#     "delta bleu",
#     "chrf",
#     "delta chrf",
#     "bertscore_f1",
#     "delta bertscore_f1",
#     "k",
# ]

# for k in ks:
#     bleu = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="bleu",
#         overall=schibsted_vg_nb[k],
#         add_k=k,
#     )
#     bleu["Rank_bleu"] = bleu.index.tolist()

#     chrf = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="chrf",
#         overall=schibsted_vg_nb[k],
#         add_k=k,
#     )
#     chrf["Rank_chrf"] = chrf.index.tolist()

#     bertscore = aggregate_by_skill(
#         task,
#         add_baselines=[],
#         target_metric="bertscore_f1",
#         overall=schibsted_vg_nb_bs[k],
#         add_k=k,
#         task2metric=task2metric_bs,
#     )
#     bertscore["Rank_bertscore"] = bertscore.index.tolist()

#     merged = bleu.merge(chrf).merge(bertscore)
#     ranks = build_ranks(merged)

#     k_res, cols = [], []
#     for i, row in merged.iterrows():
#         model_res = []
#         for key, v in dict(row).items():
#             key = (
#                 key.replace("_max)", "")
#                 .replace("schibsted_vg_nb (", "")
#                 .replace("delta (", "delta ")
#                 .rstrip(")")
#             )
#             if "Rank" in key:
#                 continue
#             if key not in cols:
#                 cols.append(key)
#             model_res.append(v)
#         k_res.append(model_res)
#     k_df = pd.DataFrame(k_res, columns=cols)
#     k_df["Rank"] = df["Model"].apply(lambda x: ranks[x])
#     schibsted_res[k] = k_df[target_cols]

In [67]:
# print_latex_df(reogranize_by_k(schibsted_res[0]))

In [68]:
# print_latex_df(reogranize_by_k(schibsted_res[1]))

In [69]:
bertscore_mt = pd.read_csv("bertscore_mt_k_shot.tsv", sep="\t")
bertscore_instruct = pd.read_csv("bertscore_instruct.tsv", sep="\t")
bertscore_mt = pd.concat([bertscore_mt, bertscore_instruct])

In [70]:
bertscore_mt.tail(2)

Unnamed: 0,model,bertscore_f1,k,prompt,task
486,mimir-mistral-7b-base-instruct,97.347,4,prompt-1,tatoeba_nob_nno_nb
487,mimir-mistral-7b-base-instruct,97.498,4,prompt-0,tatoeba_nob_nno_nb


In [71]:
def get_mt_results(
    task,
    ks,
    bertscore_mt=bertscore_mt,
    task2metric_bs=task2metric_bs,
    change_cols=True,
    task2metric=task2metric,
):
    mt = {k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks}

    mt_bs = {
        k: {task: subset}
        for k, subset in bertscore_mt[bertscore_mt["task"] == task].groupby("k")
    }

    mt_res = {}

    target_cols = [
        "Rank",
        "Model",
        "bleu",
        "delta bleu",
        "chrf",
        "delta chrf",
        # "bertscore_f1",
        # "delta bertscore_f1",
        "k",
    ]

    for k in ks:
        bleu = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="bleu",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        bleu["Rank_bleu"] = bleu.index.tolist()

        chrf = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="chrf",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        chrf["Rank_chrf"] = chrf.index.tolist()

        # bertscore = aggregate_by_skill(
        #     task,
        #     add_baselines=[],
        #     target_metric="bertscore_f1",
        #     overall=mt_bs[k],
        #     add_k=k,
        #     task2metric=task2metric_bs,
        # )
        # bertscore["Rank_bertscore"] = bertscore.index.tolist()

        merged = bleu.merge(chrf)  # .merge(bertscore)
        ranks = build_ranks(merged)
        k_res, cols = [], []
        for i, row in merged.iterrows():
            model_res = []
            for key, v in dict(row).items():
                key = (
                    key.replace("_max)", "")
                    .replace(f"{task} (", "")
                    .replace("delta (", "delta ")
                    .rstrip(")")
                )
                if "Rank" in key:
                    continue
                if key not in cols:
                    cols.append(key)
                model_res.append(v)
            k_res.append(model_res)
        k_df = pd.DataFrame(k_res, columns=cols)
        k_df["Rank"] = k_df["Model"].apply(lambda x: ranks[x])
        mt_res[k] = k_df[target_cols]
    return mt_res

In [72]:
ks = [0, 1, 4, 16]

tatoeba_eng_nob_nb = get_mt_results(task="tatoeba_eng_nob_nb", ks=ks)

In [73]:
p1 = reogranize_by_k(tatoeba_eng_nob_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nob_nb[4], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nob_nb[1], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nob_nb[16], change_rank=False, add_baselines=[])

In [74]:
pd.concat([p1, p3, p2, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_eng_nob_nb.tsv"), sep="\t", index=False
)

In [75]:
tatoeba_nob_eng_nb = get_mt_results(task="tatoeba_nob_eng_nb", ks=ks)

In [76]:
p1 = reogranize_by_k(tatoeba_nob_eng_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nob_eng_nb[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nob_eng_nb[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nob_eng_nb[16], change_rank=False, add_baselines=[])

In [77]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_nob_eng_nb.tsv"), sep="\t", index=False
)

In [78]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [79]:
tatoeba_eng_nno_nn = get_mt_results(task="tatoeba_eng_nno_nn", ks=ks)

In [80]:
p1 = reogranize_by_k(tatoeba_eng_nno_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nno_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nno_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nno_nn[16], change_rank=False, add_baselines=[])

In [81]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_eng_nno_nn.tsv"), sep="\t", index=False
)

In [82]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [83]:
tatoeba_nno_eng_nn = get_mt_results(task="tatoeba_nno_eng_nn", ks=ks)

In [84]:
p1 = reogranize_by_k(tatoeba_nno_eng_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nno_eng_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nno_eng_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nno_eng_nn[16], change_rank=False, add_baselines=[])

In [85]:
pd.concat([p1, p2, p3, p4]).to_csv(mkdir(
    f"{output_folder}/generation/tatoeba_nno_eng_nn.tsv"), sep="\t", index=False
)

In [86]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [87]:
tatoeba_nob_nno_nb = get_mt_results(task="tatoeba_nob_nno_nb", ks=[0])
del tatoeba_nob_nno_nb[0]["k"]
p1 = reogranize(tatoeba_nob_nno_nb[0].set_index("Rank"), add_baselines=[])
p1.to_csv(mkdir(f"{output_folder}/generation/tatoeba_nob_nno_nb.tsv"), sep="\t", index=False)
# print_latex_df(p1)


# ks = [0, 1, 4]
# tatoeba_nob_nno_nb = get_mt_results(task="tatoeba_nob_nno_nb", ks=ks)
# p1 = reogranize_by_k(tatoeba_nob_nno_nb[0], change_rank=False, add_baselines=[])
# p2 = reogranize_by_k(tatoeba_nob_nno_nb[1], change_rank=False, add_baselines=[])
# p3 = reogranize_by_k(tatoeba_nob_nno_nb[4], change_rank=False, add_baselines=[])
# pd.concat([p1, p2, p3]).to_csv(mkdir(
#     f"{output_folder}/generation/tatoeba_nob_nno_nb.tsv"), sep="\t", index=False
# )

In [88]:
import shutil

shutil.make_archive(f"{output_folder}", "zip", f"{output_folder}")

'/Users/javierr/git/mimir-evaluation-suite/mimir_results/tmp/.zip'

In [89]:
tatoeba_nno_nob_nn = get_mt_results(task="tatoeba_nno_nob_nn", ks=[0])
del tatoeba_nno_nob_nn[0]["k"]
p1 = reogranize(tatoeba_nno_nob_nn[0].set_index("Rank"), add_baselines=[])
p1.to_csv(mkdir(f"{output_folder}/generation/tatoeba_nno_nob_nn.tsv"), sep="\t", index=False)
# print_latex_df(p1)

In [90]:
la_df = pd.read_csv("./linguistic_evaluation.tsv", sep="\t", index_col=False)[[
    "model",
    "inverse_compression_nob", "min_max_lix_nob", "inverse_sb_nob",
    "inverse_compression_nno", "min_max_lix_nno", "inverse_sb_nno",
]]
# la_df = la_df.loc[:,~la_df.columns.str.contains('^instruct', case=False)]
la_df = la_df.set_index("model")
la_df = la_df.reindex(canonical_order)
la_df = la_df.reset_index()
la_df.model = la_df.model.apply(pretty_model)
la_df = la_df.set_index("model")
la_df.index.name = "Model"
la_df

Unnamed: 0_level_0,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
extended,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [91]:
report_metrics_dict = {
    "mimir_bias": ["pct_stereotype"],
    "ncb": ["acc"],
    "norec_sentence_nb": ["f1"],
    "norec_document_nb": ["f1"],
    "tapaco_no_detection_nb": ["acc"],
    "norbelebele_nb": ["acc"],
    "nrk_nb": ["acc"],
    "nrk_nn": ["acc"],
    "noropenbookqa_nb": ["acc"],
    "noropenbookqa_nn": ["acc"],
    "noropenbookqa_nb_use_fact": ["acc"],
    "noropenbookqa_nn_use_fact": ["acc"],
    "norcommonsenseqa_nn": ["acc"],
    "norcommonsenseqa_nb": ["acc"],
    "nortruthfulqa_mc_nb": ["acc"],
    "nortruthfulqa_mc_nn": ["acc"],
    "norquad_nb": ["f1"],
    "noridiom_nb": ["fscore"],
    "noridiom_nn": ["fscore"],
    "norsumm_nb": [
        #"bleu_max",
        # "bleu_avg",
        "rougeL_max",
        # "rougeL_avg",
        # "bertscore_f1_max",
        # "bertscore_f1_avg",
    ],
    "norsumm_nn": [
        # "bleu_max",
        # "bleu_avg",
        "rougeL_max",
        # "rougeL_avg",
        # "bertscore_f1_max",
        # "bertscore_f1_avg",
    ],
    "nortruthfulqa_gen_nb": ["rougeL_max"],  # ["bleu_max", "rougeL_max"],
    # "schibsted_vg_nb": ["bleu", "chrf"],
    "ask_gec_nb": ["errant"],
    "tatoeba_eng_nno_nn": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nno_eng_nn": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_eng_nob_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nob_eng_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nob_nno_nb": ["bleu"],  # ["bleu", "chrf"],
    "tatoeba_nno_nob_nn": ["bleu"],  # ["bleu", "chrf"],
    "inverse_compression_nob": None,
    "min_max_lix_nob": None,
    "inverse_sb_nob": None,
    "inverse_compression_nno": None,
    "min_max_lix_nno": None,
    "inverse_sb_nno": None,
}
report_metrics = []
for report_metric_key, report_metric_values in report_metrics_dict.items():
    if report_metric_values is not None:
        for report_metric_value in report_metric_values:
            report_metrics.append(f"{report_metric_key} ({report_metric_value})")
    else:
        report_metrics.append(f"{report_metric_key}")

In [92]:
# best k-shot
report_df = pd.concat([df.loc[:,~df.columns.str.contains('^(delta|rank|k)', case=False)].set_index("Model") for df in all_dfs], axis=1,  sort=False)
report_df = report_df.loc[:,report_df.columns.isin(report_metrics)]
report_df.columns = [col.split(" (")[0] for col in report_df.columns]
report_df = report_df.join(la_df)
report_df

Unnamed: 0_level_0,norbelebele_nb,nrk_nb,nrk_nn,norcommonsenseqa_nb,norcommonsenseqa_nn,ncb,noridiom_nb,noridiom_nn,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,...,tatoeba_nno_eng_nn,tatoeba_nno_eng_nn,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
extended,30.555556,48.972222,59.924812,43.578644,38.947368,79.52381,36.78422,32.585235,52.631579,50.877193,...,55.428263,57.158084,11.658696,12.50119,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,28.777778,45.805556,54.285714,42.857143,36.842105,77.261905,24.146826,14.221166,58.373206,54.385965,...,53.642484,55.100973,15.992702,24.359707,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,27.666667,44.527778,53.007519,44.444444,34.736842,83.809524,57.445196,27.679681,59.330144,56.140351,...,53.692677,55.416833,12.757572,8.479467,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,30.222222,45.833333,55.18797,40.836941,34.736842,81.071429,74.910771,42.637639,47.84689,43.859649,...,53.046373,54.404284,64.203105,45.307222,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,27.777778,44.916667,54.511278,44.588745,38.947368,84.047619,59.057058,28.478496,58.851675,50.877193,...,53.644498,55.146589,16.639855,11.799298,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,27.666667,42.972222,49.473684,44.877345,37.894737,83.928571,26.930504,19.526365,45.454545,43.859649,...,52.682172,54.46554,9.186173,8.92517,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,29.0,43.833333,53.609023,43.434343,37.894737,84.166667,57.685313,31.497707,60.76555,63.157895,...,53.771744,54.832202,12.033895,9.789933,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,29.0,45.0,55.18797,44.300144,41.052632,82.142857,64.04641,35.258885,56.45933,56.140351,...,53.502854,55.027108,14.401647,17.056241,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,28.222222,44.972222,54.586466,45.310245,36.842105,84.880952,59.148034,31.556056,61.722488,54.385965,...,53.853974,55.291793,13.039856,8.766193,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,28.0,44.333333,55.037594,45.021645,37.894737,83.809524,61.874001,32.834751,55.980861,57.894737,...,53.398338,55.598089,16.442273,16.526474,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [93]:
skills_df = {}
for skill_, metrics in skill.items():
    skills_df[skill_] = report_df[metrics]

report = pd.concat(skills_df.values(), keys=skills_df.keys(),axis=1)
report.to_csv("./report_skill_metrics.tsv", sep="\t")
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in skill.keys()}, axis=1).reindex(report.index)
report_agg.to_csv("./report_skill.tsv", sep="\t")
report_agg['Average'] = report_agg.mean(numeric_only=True, axis=1)
report_agg

Unnamed: 0_level_0,Sentiment Analysis,Fairness & Truthfulness,Reading Comprehension,World Knowledge,Commonsense Reasoning,Norwegian Language,Summarization,Translation,Variation & Readability,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
extended,69.258718,51.013994,41.43639,37.996433,41.263006,37.977609,41.000158,40.09764,60.850422,46.766041
base,67.256641,53.514114,38.888963,36.381757,39.849624,31.187128,32.445811,38.981407,59.664406,44.241095
base + books,70.04036,55.153195,34.118485,36.164393,39.590643,40.4792,29.80923,36.730046,61.244148,44.814411
base + newspapers,69.392916,48.03743,35.859807,36.215941,37.786891,42.278548,29.262776,47.46496,60.847855,45.238569
base + books + newspapers,71.730552,53.062579,35.946314,36.34024,41.768057,40.177596,31.862726,37.304509,61.266192,45.495418
base + fiction books,69.765543,47.987792,35.222681,35.617509,41.386041,32.389349,28.412804,35.335529,64.333906,43.383461
base + nonfiction books,69.479177,56.510831,32.632254,35.434997,40.66454,41.33567,29.522049,37.070004,60.139174,44.7543
base + nonfiction books + newspapers,72.886413,53.909157,36.898791,36.429837,42.676388,41.599736,30.782248,37.684953,61.100618,45.99646
base + original books,70.635859,55.434428,33.077635,36.253476,41.076175,41.076435,28.736139,37.005963,60.832709,44.903202
base + original books + newspapers,71.246724,54.430002,35.672079,36.103438,41.458191,41.161182,30.669836,37.661927,60.949924,45.4837


In [94]:
langs_df = {}
for lang, metrics in language.items():
    langs_df[lang] = report_df[metrics]

report = pd.concat(langs_df.values(), keys=langs_df.keys(),axis=1)
report.to_csv("./report_lang_metrics.tsv", sep="\t")
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in language.keys()}, axis=1).reindex(report.index)
report_agg.to_csv("./report_lang.tsv", sep="\t")
report_agg['Average'] = report_agg.mean(numeric_only=True, axis=1)
report_agg

Unnamed: 0_level_0,Bokmål,Nynorsk,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
extended,43.776151,40.551865,42.164008
base,41.778837,38.63082,40.204828
base + books,42.621341,38.137055,40.379198
base + newspapers,46.736442,41.851464,44.293953
base + books + newspapers,43.236917,38.272229,40.754573
base + fiction books,40.283953,37.257121,38.770537
base + nonfiction books,42.3664,38.409789,40.388094
base + nonfiction books + newspapers,42.961059,39.418628,41.189843
base + original books,42.576561,38.627572,40.602066
base + original books + newspapers,42.96237,38.890291,40.92633


In [95]:
behaviours_df = {}
for behaviour, metrics in behaviours.items():
    behaviours_df[behaviour] = report_df[metrics]

report = pd.concat(behaviours_df.values(), keys=behaviours_df.keys(),axis=1)
report.to_csv("./report_behaviour_metrics.tsv", sep="\t")
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in behaviours.keys()}, axis=1).reindex(report.index)
report_agg.to_csv("./report_behaviour.tsv", sep="\t")
report_agg['Average'] = report_agg.mean(numeric_only=True, axis=1)
report_agg

Unnamed: 0_level_0,Generation,Classification,Question Answering,Ranking,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
extended,41.82914,69.258718,39.77119,77.444609,57.075914
base,39.051948,67.256641,38.559878,77.718129,55.646649
base + books,39.84201,70.04036,37.918652,80.967262,57.192071
base + newspapers,45.930829,69.392916,37.069867,79.328818,57.930607
base + books + newspapers,40.174327,71.730552,38.20913,80.407805,57.630453
base + fiction books,37.792471,69.765543,36.754779,80.539722,56.213129
base + nonfiction books,39.900294,69.479177,37.63917,80.949539,56.992045
base + nonfiction books + newspapers,40.595982,72.886413,38.575436,80.532526,58.147589
base + original books,39.889249,70.635859,38.002251,82.69747,57.806207
base + original books + newspapers,40.452809,71.246724,38.146804,80.991938,57.709569


----
### Other k-shots approaches (discarded)
----

In [96]:
# highest k-shot available (lower than 16)
ks = {
    "ask_gec_nb": 4,
    "mimir_bias": 0,
    "ncb": 0,
    "norbelebele_nb": 0,
    "norcommonsenseqa_nb": 0,
    "norcommonsenseqa_nn": 0,
    "norec_document_nb": 1,
    "norec_sentence_nb": 4,
    "noridiom_nb": 0,
    "noridiom_nn": 0,
    "noropenbookqa_nb_use_fact": 4,
    "noropenbookqa_nb": 4,
    "noropenbookqa_nn_use_fact": 4,
    "noropenbookqa_nn": 4,
    "norquad_nb": 1,
    "norsumm_nb": 0,
    "norsumm_nn": 0,
    "nortruthfulqa_gen_nb": 0,
    "nortruthfulqa_mc_nb": 0,
    "nortruthfulqa_mc_nn": 0,
    "nrk_nb": 0,
    "nrk_nn": 0,
    "tatoeba_eng_nno_nn": 4,
    "tatoeba_eng_nob_nb": 4,
    "tatoeba_nno_eng_nn": 4,
    "tatoeba_nno_nob_nn": 0,  # should be 4, but there's an error in the bertscore processing
    "tatoeba_nob_eng_nb": 4,
    "tatoeba_nob_nno_nb": 0,  # should be 4, but there's an error in the bertscore processing
    # "schibsted_vg_nb": 0,
}

selected_dfs = []
for task, df in zip(*[all_tasks, all_dfs]):
    if df["k"].unique()[0] == ks[task]:
        selected_dfs.append(df.loc[:,~df.columns.str.contains('^(delta|rank|k)', case=False)].set_index("Model"))

report_df = pd.concat(selected_dfs, axis=1,  sort=False)
report_df = report_df.loc[:,report_df.columns.isin(report_metrics)]
report_df.columns = [col.split(" (")[0] for col in report_df.columns]
report_df = report_df.join(la_df)
report_df

Unnamed: 0_level_0,norbelebele_nb,nrk_nb,nrk_nn,norcommonsenseqa_nb,norcommonsenseqa_nn,ncb,noridiom_nb,noridiom_nn,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,...,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
extended,30.555556,48.972222,59.924812,43.578644,38.947368,79.52381,36.78422,32.585235,52.631579,50.877193,...,44.590171,55.428263,11.658696,12.50119,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,28.777778,45.805556,54.285714,42.857143,36.842105,77.261905,24.146826,14.221166,58.373206,54.385965,...,42.729146,53.642484,15.992702,24.359707,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,27.666667,44.527778,53.007519,44.444444,34.736842,83.809524,57.445196,27.679681,59.330144,56.140351,...,42.131737,53.692677,12.757572,8.479467,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,30.222222,45.833333,55.18797,40.836941,34.736842,81.071429,74.910771,42.637639,47.84689,43.859649,...,44.126743,53.046373,64.203105,45.307222,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,27.777778,44.916667,54.511278,44.588745,38.947368,84.047619,59.057058,28.478496,58.851675,50.877193,...,42.716894,53.644498,16.639855,11.799298,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,27.666667,42.972222,49.473684,44.877345,37.894737,83.928571,26.930504,19.526365,45.454545,43.859649,...,41.767026,52.682172,9.186173,8.92517,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,29.0,43.833333,53.609023,43.434343,37.894737,84.166667,57.685313,31.497707,60.76555,63.157895,...,43.016892,53.771744,12.033895,9.789933,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,29.0,45.0,55.18797,44.300144,41.052632,82.142857,64.04641,35.258885,56.45933,56.140351,...,42.870092,53.502854,14.401647,17.056241,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,28.222222,44.972222,54.586466,45.310245,36.842105,84.880952,59.148034,31.556056,61.722488,54.385965,...,42.598892,53.853974,13.039856,8.766193,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,28.0,44.333333,55.037594,45.021645,37.894737,83.809524,61.874001,32.834751,55.980861,57.894737,...,43.038514,53.398338,16.442273,16.526474,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [97]:
skills_df = {}
for skill_, metrics in skill.items():
    skills_df[skill_] = report_df[metrics]

In [98]:
report = pd.concat(skills_df.values(), keys=skills_df.keys(),axis=1)
# report.to_csv("./report.tsv", sep="\t")
report

Unnamed: 0_level_0,Sentiment Analysis,Sentiment Analysis,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Reading Comprehension,Reading Comprehension,World Knowledge,World Knowledge,...,Translation,Translation,Translation,Translation,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability
Unnamed: 0_level_1,norec_sentence_nb,norec_document_nb,mimir_bias,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,nortruthfulqa_gen_nb,norbelebele_nb,norquad_nb,nrk_nb,nrk_nn,...,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
extended,82.090207,65.395523,75.365408,52.631579,50.877193,25.181798,30.555556,48.199335,48.972222,59.924812,...,56.667414,56.392314,11.658696,12.50119,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,65.601004,67.026313,78.174352,58.373206,54.385965,23.122931,28.777778,47.299865,45.805556,54.285714,...,54.185678,54.655333,15.992702,24.359707,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,74.786108,70.287647,78.125,59.330144,56.140351,27.017284,27.666667,39.100198,44.527778,53.007519,...,55.262219,54.746218,12.757572,8.479467,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,74.913273,69.293372,77.586207,47.84689,43.859649,22.856973,30.222222,40.364153,45.833333,55.18797,...,53.875333,53.509675,64.203105,45.307222,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,75.353774,73.324913,76.76799,58.851675,50.877193,25.753459,27.777778,42.405521,44.916667,54.511278,...,55.304716,54.672177,16.639855,11.799298,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,73.07337,68.971084,77.150873,45.454545,43.859649,25.486099,27.666667,39.7605,42.972222,49.473684,...,53.794761,52.821867,9.186173,8.92517,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,76.786017,66.805905,77.732412,60.76555,63.157895,24.387466,29.0,34.218932,43.833333,53.609023,...,55.071566,54.325501,12.033895,9.789933,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,79.451572,71.493354,78.922194,56.45933,56.140351,24.114753,29.0,44.319326,45.0,55.18797,...,54.982987,54.50096,14.401647,17.056241,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,76.417902,67.044712,80.513988,61.722488,54.385965,25.115272,28.222222,37.514894,44.972222,54.586466,...,54.900101,54.582642,13.039856,8.766193,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,75.493905,72.286656,78.174352,55.980861,57.894737,25.670059,28.0,42.838237,44.333333,55.037594,...,54.962416,54.399651,16.442273,16.526474,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [99]:
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in skill.keys()}, axis=1).reindex(report.index)
# report_agg.to_csv("./report_skill.tsv", sep="\t")
report_agg

Unnamed: 0_level_0,Sentiment Analysis,Fairness & Truthfulness,Reading Comprehension,World Knowledge,Commonsense Reasoning,Norwegian Language,Summarization,Translation,Variation & Readability
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
extended,73.742865,51.013994,39.377445,42.157904,41.263006,46.490816,41.000158,39.539675,60.850422
base,66.313658,53.514114,38.038821,40.246693,39.849624,37.469974,32.445811,40.927508,59.664406
base + books,72.536878,55.153195,33.383433,39.595676,39.590643,51.1136,29.80923,37.844982,61.244148
base + newspapers,72.103322,48.03743,35.293187,40.676246,37.786891,57.81746,29.262776,52.344742,60.847855
base + books + newspapers,74.339343,53.062579,35.091649,39.580583,41.768057,51.693293,31.862726,39.129573,61.266192
base + fiction books,71.022227,47.987792,33.713583,39.180261,41.386041,40.58136,28.412804,36.529528,64.333906
base + nonfiction books,71.795961,56.510831,31.609466,39.302408,40.66454,52.242422,29.522049,38.001588,60.139174
base + nonfiction books + newspapers,75.472463,53.909157,36.659663,40.176677,42.676388,54.162038,30.782248,39.552463,61.100618
base + original books,71.731307,55.434428,32.868558,40.356035,41.076175,52.721261,28.736139,37.956943,60.832709
base + original books + newspapers,73.89028,54.430002,35.419119,39.845628,41.458191,53.439569,30.669836,39.794611,60.949924


In [100]:
# zero-shot
ks = {
    "ask_gec_nb": 0,
    "mimir_bias": 0,
    "ncb": 0,
    "norbelebele_nb": 0,
    "norcommonsenseqa_nb": 0,
    "norcommonsenseqa_nn": 0,
    "norec_document_nb": 0,
    "norec_sentence_nb": 0,
    "noridiom_nb": 0,
    "noridiom_nn": 0,
    "noropenbookqa_nb_use_fact": 0,
    "noropenbookqa_nb": 0,
    "noropenbookqa_nn_use_fact": 0,
    "noropenbookqa_nn": 0,
    "norquad_nb": 0,
    "norsumm_nb": 0,
    "norsumm_nn": 0,
    "nortruthfulqa_gen_nb": 0,
    "nortruthfulqa_mc_nb": 0,
    "nortruthfulqa_mc_nn": 0,
    "nrk_nb": 0,
    "nrk_nn": 0,
    "tatoeba_eng_nno_nn": 0,
    "tatoeba_eng_nob_nb": 0,
    "tatoeba_nno_eng_nn": 0,
    "tatoeba_nno_nob_nn": 0,  # should be 4, but there's an error in the bertscore processing
    "tatoeba_nob_eng_nb": 0,
    "tatoeba_nob_nno_nb": 0,  # should be 4, but there's an error in the bertscore processing
    # "schibsted_vg_nb": 0,
}
for report_metric_key, report_metric_values in report_metrics_dict.items():
    if report_metric_values is not None:
        for report_metric_value in report_metric_values:
            report_metrics.append(f"{report_metric_key} ({report_metric_value})")
    else:
        report_metrics.append(f"{report_metric_key}")
selected_dfs = []
for task, df in zip(*[all_tasks, all_dfs]):
    if df["k"].unique()[0] == ks[task]:
        selected_dfs.append(df.loc[:,~df.columns.str.contains('^(delta|rank|k)', case=False)].set_index("Model"))

report_df = pd.concat(selected_dfs, axis=1,  sort=False)
report_df = report_df.loc[:,report_df.columns.isin(report_metrics)]
report_df.columns = [col.split(" (")[0] for col in report_df.columns]
report_df = report_df.join(la_df)
report_df

Unnamed: 0_level_0,norbelebele_nb,nrk_nb,nrk_nn,norcommonsenseqa_nb,norcommonsenseqa_nn,ncb,noridiom_nb,noridiom_nn,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,...,tatoeba_eng_nno_nn,tatoeba_nno_eng_nn,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
extended,30.555556,48.972222,59.924812,43.578644,38.947368,79.52381,36.78422,32.585235,52.631579,50.877193,...,14.94887,10.379039,11.658696,12.50119,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,28.777778,45.805556,54.285714,42.857143,36.842105,77.261905,24.146826,14.221166,58.373206,54.385965,...,12.301419,12.860826,15.992702,24.359707,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,27.666667,44.527778,53.007519,44.444444,34.736842,83.809524,57.445196,27.679681,59.330144,56.140351,...,4.723513,8.785296,12.757572,8.479467,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,30.222222,45.833333,55.18797,40.836941,34.736842,81.071429,74.910771,42.637639,47.84689,43.859649,...,28.175257,35.063173,64.203105,45.307222,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,27.777778,44.916667,54.511278,44.588745,38.947368,84.047619,59.057058,28.478496,58.851675,50.877193,...,4.881404,7.650597,16.639855,11.799298,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,27.666667,42.972222,49.473684,44.877345,37.894737,83.928571,26.930504,19.526365,45.454545,43.859649,...,4.704739,5.634942,9.186173,8.92517,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,29.0,43.833333,53.609023,43.434343,37.894737,84.166667,57.685313,31.497707,60.76555,63.157895,...,5.581007,9.631253,12.033895,9.789933,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,29.0,45.0,55.18797,44.300144,41.052632,82.142857,64.04641,35.258885,56.45933,56.140351,...,5.053153,9.968868,14.401647,17.056241,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,28.222222,44.972222,54.586466,45.310245,36.842105,84.880952,59.148034,31.556056,61.722488,54.385965,...,4.879612,8.694367,13.039856,8.766193,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,28.0,44.333333,55.037594,45.021645,37.894737,83.809524,61.874001,32.834751,55.980861,57.894737,...,4.867005,9.916842,16.442273,16.526474,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [101]:
skills_df = {}
for skill_, metrics in skill.items():
    skills_df[skill_] = report_df[metrics]

report = pd.concat(skills_df.values(), keys=skills_df.keys(),axis=1)
# report.to_csv("./report.tsv", sep="\t")
report

Unnamed: 0_level_0,Sentiment Analysis,Sentiment Analysis,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Fairness & Truthfulness,Reading Comprehension,Reading Comprehension,World Knowledge,World Knowledge,...,Translation,Translation,Translation,Translation,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability,Variation & Readability
Unnamed: 0_level_1,norec_sentence_nb,norec_document_nb,mimir_bias,nortruthfulqa_mc_nb,nortruthfulqa_mc_nn,nortruthfulqa_gen_nb,norbelebele_nb,norquad_nb,nrk_nb,nrk_nn,...,tatoeba_eng_nob_nb,tatoeba_nob_eng_nb,tatoeba_nob_nno_nb,tatoeba_nno_nob_nn,inverse_compression_nob,min_max_lix_nob,inverse_sb_nob,inverse_compression_nno,min_max_lix_nno,inverse_sb_nno
Model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
extended,48.406816,55.283439,75.365408,52.631579,50.877193,25.181798,30.555556,45.554279,48.972222,59.924812,...,19.718004,10.897911,11.658696,12.50119,40.324207,59.86015,72.68,41.911148,73.030525,77.2965
base,70.091134,68.98817,78.174352,58.373206,54.385965,23.122931,28.777778,40.589246,45.805556,54.285714,...,13.648272,10.071595,15.992702,24.359707,40.036834,57.052582,72.8016,41.907636,67.601785,78.586
base + books,70.712175,62.962352,78.125,59.330144,56.140351,27.017284,27.666667,35.58859,44.527778,53.007519,...,10.632341,10.271388,12.757572,8.479467,39.207998,61.899285,71.0419,41.416442,76.608565,77.2907
base + newspapers,57.10778,68.806231,77.586207,47.84689,43.859649,22.856973,30.222222,36.993047,45.833333,55.18797,...,34.42363,36.890106,64.203105,45.307222,40.673554,62.691685,72.1036,42.018572,71.04502,76.5547
base + books + newspapers,65.069465,68.350602,76.76799,58.851675,50.877193,25.753459,27.777778,37.655642,44.916667,54.511278,...,10.229949,10.484609,16.639855,11.799298,40.251167,62.991205,71.7438,42.027402,73.451778,77.1318
base + fiction books,66.134942,63.457872,77.150873,45.454545,43.859649,25.486099,27.666667,38.240878,42.972222,49.473684,...,8.069368,7.531059,9.186173,8.92517,40.014405,74.873205,69.211,41.516171,85.025755,75.3629
base + nonfiction books,70.098783,62.490437,77.732412,60.76555,63.157895,24.387466,29.0,34.677829,43.833333,53.609023,...,9.876487,10.542187,12.033895,9.789933,38.962051,57.010537,72.1191,41.818258,72.3413,78.5838
base + nonfiction books + newspapers,64.600599,72.135835,78.922194,56.45933,56.140351,24.114753,29.0,37.377047,45.0,55.18797,...,9.214916,12.578001,14.401647,17.056241,40.049662,62.358548,71.838,42.080458,72.84494,77.4321
base + original books,70.877957,63.503725,80.513988,61.722488,54.385965,25.115272,28.222222,33.495788,44.972222,54.586466,...,11.89692,9.744685,13.039856,8.766193,39.157334,59.336895,72.3795,41.564487,75.014835,77.5432
base + original books + newspapers,62.887731,65.116473,78.174352,55.980861,57.894737,25.670059,28.0,36.178001,44.333333,55.037594,...,9.335651,12.646572,16.442273,16.526474,40.020811,62.235042,71.7394,41.795536,72.965553,76.9432


In [102]:
report_agg = pd.concat({k: report.pivot_table(k, "Model").T.mean() for k in skill.keys()}, axis=1).reindex(report.index)
report_agg['Average'] = report_agg.mean(numeric_only=True, axis=1)
# report_agg.to_csv("./report_skill.tsv", sep="\t")
report_agg

Unnamed: 0_level_0,Sentiment Analysis,Fairness & Truthfulness,Reading Comprehension,World Knowledge,Commonsense Reasoning,Norwegian Language,Summarization,Translation,Variation & Readability,Average
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
extended,51.845127,51.013994,38.054917,42.166517,41.263006,39.208316,41.000158,13.350618,60.850422,42.083675
base,69.539652,53.514114,34.683512,39.535022,39.849624,30.094974,32.445811,14.87242,59.664406,41.577726
base + books,66.837264,55.153195,31.627628,38.804332,39.590643,44.9036,29.80923,9.274929,61.244148,41.916108
base + newspapers,62.957006,48.03743,33.607635,39.120476,37.786891,50.98496,29.262776,40.677082,60.847855,44.809123
base + books + newspapers,66.710034,53.062579,32.71671,39.348025,41.768057,44.855793,31.862726,10.280952,61.266192,42.430119
base + fiction books,64.796407,47.987792,32.953772,37.502828,41.386041,34.74886,28.412804,7.341908,64.333906,39.94048
base + nonfiction books,66.29461,56.510831,31.838915,38.428162,40.66454,46.652422,29.522049,9.575794,60.139174,42.180722
base + nonfiction books + newspapers,68.368217,53.909157,33.188524,39.107556,42.676388,47.339538,30.782248,11.378804,61.100618,43.094561
base + original books,67.190841,55.434428,30.859005,38.962839,41.076175,46.381261,28.736139,9.503606,60.832709,42.108556
base + original books + newspapers,64.002102,54.430002,32.089,38.829263,41.458191,46.612069,30.669836,11.62247,60.949924,42.295873
