In [32]:
import os
import json
import pandas as pd


def load_json(fname):
    with open(fname, "r", encoding="utf-8") as f:
        data = json.load(f)
    return data

In [1]:
%cd mimir_results

/pfs/lustrep3/scratch/project_465000498/vlad/mimir/mimir_results


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [3]:
zero = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "nortruthfulqa_gen_nb",
    "norquad_nb",
    "schibsted_vg_nb",
    "ask_gec_nb",
    "norsumm_nb",
    "norsumm_nn",
    "tatoeba_eng_nno_nn",
    "tatoeba_nno_eng_nn",
    "tatoeba_eng_nob_nb",
    "tatoeba_nob_eng_nb",
    "tatoeba_nob_nno_nb",
    "tatoeba_nno_nob_nn",
]

In [7]:
prompts = {
    "norquad_nb": {
        "prompt-0": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-1": 'Tittel: {title}\n\nTekst: {passage}\n\nGitt teksten over, hva er svaret på følgende spørsmål? "{question}"\n\nSvar:',
        "prompt-2": "Tittel: {title}\n\nTekst: {passage}\n\nSpørsmål: {question}\n\nSvar:",
        "prompt-3": 'Tittel: {title}\n\nTekst: {passage}\n\nHvordan kan man svare på spørsmålet "{question}", gitt teksten over?\n\nSvar:',
        "prompt-4": 'Tittel: {title}\n\nTekst:{passage}\n\nGitt teksten over, besvar følgende spørsmål: "{question}"\n\nSvar:',
    }
}

task2metric = {
    "mimir_bias": ["pct_stereotype", "likelihood_diff"],
    "ncb": ["acc"],
    "norec_sentence_nb": ["acc", "f1"],
    "norec_document_nb": ["acc", "f1"],
    "tapaco_no_detection_nb": ["acc"],
    "norbelebele_nb": ["acc"],
    "nrk_nb": ["acc"],
    "nrk_nn": ["acc"],
    "noropenbookqa_nb": ["acc"],
    "noropenbookqa_nn": ["acc"],
    "noropenbookqa_nb_use_fact": ["acc"],
    "noropenbookqa_nn_use_fact": ["acc"],
    "norcommonsenseqa_nn": ["acc"],
    "norcommonsenseqa_nb": ["acc"],
    "nortruthfulqa_mc_nb": ["acc"],
    "nortruthfulqa_mc_nn": ["acc"],
    "norquad_nb": ["exact_match", "f1"],
    "noridiom_nb": ["em", "fscore"],
    "noridiom_nn": ["em", "fscore"],
    "norsumm_nb": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "norsumm_nn": [
        "bleu_max",
        "bleu_avg",
        "rougeL_max",
        "rougeL_avg",
        "bertscore_f1_max",
        "bertscore_f1_avg",
    ],
    "nortruthfulqa_gen_nb": ["bleu_max", "rougeL_max"],
    "schibsted_vg_nb": ["bleu", "chrf"],
    "ask_gec_nb": ["errant"],
    "tatoeba_eng_nno_nn": ["bleu", "chrf"],
    "tatoeba_nno_eng_nn": ["bleu", "chrf"],
    "tatoeba_eng_nob_nb": ["bleu", "chrf"],
    "tatoeba_nob_eng_nb": ["bleu", "chrf"],
    "tatoeba_nob_nno_nb": ["bleu", "chrf"],
    "tatoeba_nno_nob_nn": ["bleu", "chrf"],
}


def pretty_metric(
    task,
    metric_name,
    score,
    metric_list=[
        "f1",
        "acc",
        "pct_stereotype",
        "acc_norm",
        "em",
        "fscore",
        "bertscore_f1_avg",
        "bertscore_f1_max",
    ],
):
    pretty_metric_name = metric_name.replace(",none", "")
    pretty_metric_score = (
        round(score * 100, 3) if pretty_metric_name in metric_list else round(score, 3)
    )
    if task == "norquad_nb":
        pretty_metric_score = round(score, 3)
    return pretty_metric_name, pretty_metric_score


def collect_task_ranking_results(
    task,
    k=0,
    ignore_models=["gpt-sw3-6.7b"],
    ignore_metrics=[
        "alias",
        "bleu_acc",
        "bleu_diff",
        "rouge1_acc",
        "rouge1_diff",
        "rouge2_max",
        "rouge2_acc",
        "rouge2_diff",
        "rougeL_acc",
        "rougeL_diff",
    ],
    verbose=True,
    columns=["task", "model", "k-shot"],
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = model_res["results"][task]
            curr_configuration_res = [task, model, k]
            for metric_name, score in model_res_scores.items():
                if "stderr" in metric_name or metric_name in ignore_metrics:
                    continue
                pretty_metric_name, pretty_metric_score = pretty_metric(
                    task=task, metric_name=metric_name, score=score
                )
                if pretty_metric_name not in res_columns:
                    res_columns.append(pretty_metric_name)
                curr_configuration_res.append(pretty_metric_score)
            res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)


def collect_task_prompt_results(
    task,
    k,
    ignore_models=["gpt-sw3-6.7b"],
    verbose=True,
    columns=["task", "model", "prompt", "k-shot"],
    prompts=prompts,
):
    res = []
    res_fdir = f"{task}/{k}-shot"
    res_columns = columns.copy()
    for model_organization in os.listdir(res_fdir):
        model_fdir = os.path.join(res_fdir, model_organization)
        for model in os.listdir(model_fdir):
            if model in ignore_models:
                continue
            model_res_fpath = os.path.join(model_fdir, model, "results.json")
            if not os.path.exists(model_res_fpath):
                continue
            if verbose:
                print(model_res_fpath)
            model_res = load_json(model_res_fpath)
            model_res_scores = {
                prompt_name: prompt_res
                for prompt_name, prompt_res in model_res["results"].items()
                if prompt_name != task
            }
            for configuration_name, configuration_res in model_res_scores.items():
                prompt = (
                    prompts[task][configuration_name]
                    if task in prompts
                    else model_res["configs"][configuration_name]["doc_to_text"]
                )
                curr_configuration_res = [task, model, prompt, k]
                for metric_name, score in configuration_res.items():
                    if "stderr" in metric_name or metric_name == "alias":
                        continue
                    pretty_metric_name, pretty_metric_score = pretty_metric(
                        task=task, metric_name=metric_name, score=score
                    )
                    if pretty_metric_name not in res_columns:
                        res_columns.append(pretty_metric_name)
                    curr_configuration_res.append(pretty_metric_score)
                res.append(curr_configuration_res)
    return pd.DataFrame(res, columns=res_columns)

In [8]:
zero_shot = [
    "norec_sentence_nb",
    "norec_document_nb",
    "mimir_bias",
    "tapaco_no_detection_nb",
    "norsumm_nb",
    "norsumm_nn",
    "noridiom_nb",
    "noridiom_nn",
    "ncb",
    "norbelebele_nb",
    "nrk_nb",
    "nrk_nn",
    "noropenbookqa_nb",
    "noropenbookqa_nb_use_fact",
    "noropenbookqa_nn",
    "noropenbookqa_nn_use_fact",
    "norcommonsenseqa_nb",
    "norcommonsenseqa_nn",
    "nortruthfulqa_mc_nb",
    "nortruthfulqa_mc_nn",
    "norquad_nb",
    "nortruthfulqa_gen_nb",
]

In [9]:
from functools import reduce


overall = {
    task: (
        collect_task_prompt_results(task, k=0, verbose=False)
        if task not in ["mimir_bias", "ncb"]
        else collect_task_ranking_results(task, k=0, verbose=False)
    )
    for task in zero_shot
}

In [24]:
def merge_ranking_results(tasks, overall=overall, task2metric=task2metric, on="model"):
    res = {
        task_name: overall[task_name].rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }
    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any([name in col for name in ["task_", "shot_"]])
        ]
    ]
    return df


def aggregate_df(df, task, task2metric, select_best):
    task_res = []
    task_columns = ["model"] + task2metric[task]
    for model, subset in df.groupby("model"):
        model_res = [model]
        for metric in task2metric[task]:

            agg_res = dict(subset[metric].describe())
            if select_best:
                model_res.append(round(agg_res["max"], 2))
            else:
                model_res.append(
                    f"{round(agg_res['mean'], 2)} ± {round(agg_res['std'], 1)} [{round(agg_res['max'], 2)}]"
                )
        task_res.append(model_res)
    return pd.DataFrame(task_res, columns=task_columns)


def merge_task_prompt_results(
    tasks, overall=overall, task2metric=task2metric, select_best=True, on="model"
):
    res = {
        task_name: aggregate_df(
            overall[task_name], task_name, task2metric, select_best
        ).rename(
            columns={col: f"{task_name} ({col})" for col in task2metric[task_name]}
        )
        for task_name in tasks
    }

    df = reduce(
        lambda df_left, df_right: pd.merge(df_left, df_right, on="model"),
        list(res.values()),
    )
    df.rename(columns={"k-shot_x": "k-shot"}, inplace=True)
    df = df[
        [
            col
            for col in df.columns
            if not any(
                [
                    name in col
                    for name in [
                        "task_",
                        "shot_",
                        "sentence_nb (acc",
                        "document_nb (acc",
                        "acc_norm",
                    ]
                ]
            )
        ]
    ]
    return df

In [11]:
canonical_order = [
    "mimir-mistral-7b-extended",
    "mimir-mistral-7b-extended-scratch",
    "mimir-mistral-7b-base",
    "mimir-mistral-7b-base-scratch",
    "mimir-7b-fiction",
    "mimir-7b-nonfiction",
    "mimir-7b-factual",
    "mimir-7b-newspapers",
    "mimir-7b-books",
    "mimir-7b-rightholders",
    "mimir-7b-untranslated-withnewspapers",
    "mimir-7b-untranslated",
    "mimir-7b-translated",
]

skill = {
    "Sentiment analysis": ["norec_sentence_nb", "norec_document_nb"],
    "Fairness & truthfulness": [
        "mimir_bias",
        "nortruthfulqa_mc_nb",
        "nortruthfulqa_mc_nn",
        "nortruthfulqa_gen_nb",
    ],
    "Reading comprehension": ["norbelebele_nb", "norquad_nb"],
    "World knowledge": [
        "nrk_nb",
        "nrk_nn",
        "noropenbookqa_nb",
        "noropenbookqa_nn",
        "noropenbookqa_nb_use_fact",
        "noropenbookqa_nn_use_fact",
    ],
    "Commonsense reasoning": ["norcommonsenseqa_nb", "norcommonsenseqa_nn"],
    "Norwegian language: grammar, punctuation, and idioms": [
        "n,cb",
        "ask_gec_nb",
        "noridiom_nb",
        "noridiom_nn",
    ],
    "Text summarization": ["norsumm_nb", "norsumm_nn"],
    "Machine translation": [
        "tatoeba_eng_nno_nn",
        "tatoeba_nno_eng_nn",
        "tatoeba_eng_nob_nb",
        "tatoeba_nob_eng_nb",
        "tatoeba_nob_nno_nb",
        "tatoeba_nno_nob_nn",
    ],
    "Headline generation": ["schibsted_vg_nb"],
}

In [12]:
beautify_columns = {
    "model": "Model",
    "norec_sentence_nb (f1)": "NoReC",
    "norec_document_nb (f1)": "NoReC",
}


def pretty_model(model_name):
    model_d = {
        "mimir-mistral-7b-base": "@base",
        "mimir-mistral-7b-extended": "@extended",
        "mimir-7b-fiction": "@fiction",
        "mimir-7b-nonfiction": "@nonfiction",
        "mimir-7b-factual": "@factual",
        "mimir-7b-newspapers": "@newspapers",
        "mimir-7b-books": "@books",
        "mimir-7b-rightholders": "@rightholders",
        "mimir-7b-untranslated-withnewspapers": "@untranslatedwithnewspapers",
        "mimir-7b-untranslated": "@untranslated",
        "mimir-7b-translated": "@translated",
        "mimir-mistral-7b-base-scratch": "@basescratch",
        "mimir-mistral-7b-extended-scratch": "@extendedscratch",
    }
    # mimir, conf = model_name.replace("-mistral-", "").split("7b")
    # mimir = mimir.replace("mimir", "\textsc{mimir}")
    # pretty_name = f"{mimir}$_\text" + "{" + f"{conf.strip(' -')}".replace("\text", "\text{") + "}$"
    return model_d[model_name]
    # return pretty_name


def aggregate_by_skill(
    task,
    model_order=canonical_order,
    select_best=True,
    skill=skill,
    add_baselines=[],
    add_k=None,
    overall=overall,
    task2metric=task2metric,
    base_model="mimir-mistral-7b-base-scratch",
    target_metric="f1",
    beautify_columns=beautify_columns,
):
    df = merge_task_prompt_results(
        [task], select_best=select_best, overall=overall, task2metric=task2metric
    )
    df = df[df["model"].isin(model_order)]
    # print(f"Task: {task}; Num rows: {df.shape[0]}")
    reference_score = {
        task: score.item()
        for task, score in dict(df[df["model"] == base_model]).items()
        if task != "model"
    }
    if add_baselines:
        df = pd.concat([df, pd.DataFrame(add_baselines, columns=df.columns.tolist())])

    model_order = model_order.copy() + [
        baseline_name for baseline_name, _ in add_baselines
    ]
    ascending = False if "mimir_bias" not in task else True
    model_rank = {
        model: i + 1
        for i, model in enumerate(
            df.sort_values(
                f"{task} ({target_metric})", ascending=ascending
            ).model.tolist()
        )
    }
    df["Rank"] = df["model"].apply(lambda x: model_rank[x])
    df = df.set_index("model").loc[model_order]
    agg, agg_cols = [], [
        "Rank",
        "Model",
        f"{task} ({target_metric})",
        f"delta ({target_metric})",
    ]
    for model_name, row in df.iterrows():
        row_res = [int(row["Rank"])]
        if model_name.startswith("mimir"):
            row_res.append(pretty_model(model_name))
            for task_name, ref in reference_score.items():
                if task_name != f"{task} ({target_metric})":
                    continue
                if model_name == base_model:
                    row_res.append(ref)
                    row_res.append("xmark")
                elif (
                    model_name.endswith("scratch")
                    or ("base" in model_name and model_name != base_model)
                    or "extended" in model_name
                ):
                    row_res.append(row[task_name])
                    row_res.append("xmark")
                else:
                    model_conf_score = row[task_name]
                    row_res.append(model_conf_score)
                    delta = round(model_conf_score - ref, 1)
                    if delta > 0:
                        row_res.append(f"+{delta}")
                    else:
                        row_res.append(f"-{delta}")
        else:
            row_res.extend([model_name, row[f"{task} ({target_metric})"], "xmark"])
        agg.append(row_res)
    agg_df = pd.DataFrame(agg, columns=agg_cols)
    if add_k is not None:
        agg_df["k"] = add_k
        agg_df = agg_df[
            [
                "Rank",
                "Model",
                "k",
                f"{task} ({target_metric})",
                f"delta ({target_metric})",
            ]
        ]
    agg_df.rename(columns=beautify_columns, inplace=True)
    agg_df = agg_df.set_index("Rank")
    return agg_df


def print_latex_df(df):
    print(
        df.to_latex()
        .replace("@", "\\")
        .replace("xmark", "\\xmark")  # .replace("$delta$", "delta")
    )

### Single-shot tasks

In [25]:
task = "norbelebele_nb"
add_baselines = [
    ["Random", 25.00],
]
norbelebele_nb = aggregate_by_skill(
    task, add_baselines=add_baselines, target_metric="acc"
)
norbelebele_nb.to_csv("agg_results/qa/norbelebele_nb.tsv", sep="\t", index=False)
# print_latex_df(norbelebele_nb)

In [28]:
task = "nrk_nb"
add_baselines = [["Random", 27.91], ["Constant", 30.97]]
nrk_nb = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="acc")
nrk_nb.to_csv("agg_results/qa/nrk_nb.tsv", sep="\t", index=False)
# print_latex_df(nrk_nb)

In [29]:
task = "nrk_nn"
add_baselines = [["Random", 26.76], ["Constant", 30.45]]
nkr_nn = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="acc")
nkr_nn.to_csv("agg_results/qa/nrk_nn.tsv", sep="\t", index=False)
# print_latex_df(nrk_nn)

In [30]:
task = "norcommonsenseqa_nb"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nb = aggregate_by_skill(
    task, add_baselines=add_baselines, target_metric="acc"
)
norcommonsenseqa_nb.to_csv(
    "agg_results/qa/norcommonsenseqa_nb.tsv", sep="\t", index=False
)
# print_latex_df(norcommonsenseqa_nb)

task = "norcommonsenseqa_nn"
add_baselines = [["Random", 20.00]]
norcommonsenseqa_nn = aggregate_by_skill(
    task, add_baselines=add_baselines, target_metric="acc"
)
norcommonsenseqa_nn.to_csv(
    "agg_results/qa/norcommonsenseqa_nn.tsv", sep="\t", index=False
)
# print_latex_df(aggregate_by_skill(task, add_baselines=add_baselines, target_metric="acc"))

In [31]:
!mkdir agg_results/ranking

In [33]:
task = "ncb"
add_baselines = [
    ["Random", 50.00],
]
ncb = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="acc")
ncb.to_csv("agg_results/ranking/ncb.tsv", sep="\t", index=False)
# print_latex_df(ncb)

In [34]:
!mkdir agg_results/generation

In [35]:
task = "noridiom_nb"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nb = (
    em[["Rank", "Model", "noridiom_nb (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nb.to_csv("agg_results/generation/noridiom_nb.tsv", sep="\t", index=False)
# print_latex_df(noridiom_nb)

In [37]:
task = "noridiom_nn"
add_baselines = []
em = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="em")
f1 = aggregate_by_skill(task, add_baselines=add_baselines, target_metric="fscore")
em["Rank"] = em.index.tolist()
noridiom_nn = (
    em[["Rank", "Model", "noridiom_nn (em)", "delta (em)"]]
    .merge(f1, on="Model")
    .set_index("Rank")
)
noridiom_nn.to_csv("agg_results/generation/noridiom_nn.tsv", sep="\t", index=False)
# print_latex_df(noridiom_nn)

In [38]:
task = "nortruthfulqa_mc_nb"
add_baselines = [["Random", 27.27]]
nortruthfulqa_mc_nb = aggregate_by_skill(
    task, add_baselines=add_baselines, target_metric="acc"
)
nortruthfulqa_mc_nb.to_csv(
    "agg_results/qa/nortruthfulqa_mc_nb.tsv", sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nb)

In [39]:
task = "nortruthfulqa_mc_nn"
add_baselines = [["Random", 24.56]]
nortruthfulqa_mc_nn = aggregate_by_skill(
    task, add_baselines=add_baselines, target_metric="acc"
)
nortruthfulqa_mc_nn.to_csv(
    "agg_results/qa/nortruthfulqa_mc_nn.tsv", sep="\t", index=False
)
# print_latex_df(nortruthfulqa_mc_nn)

In [41]:
import numpy as np
from operator import itemgetter


def build_ranks(df):
    ranks = {}
    for i, row in df.iterrows():
        rank = np.mean([row[c] for c in dict(row) if "Rank" in c])
        ranks[row["Model"]] = round(rank, 3)
    inverse = {}
    counter = 0
    for model, rank in sorted(ranks.items(), key=itemgetter(1)):
        counter += 1
        if rank not in inverse:
            inverse[rank] = counter
        else:
            continue
    return {model: inverse[rank] for model, rank in ranks.items()}


task = "nortruthfulqa_gen_nb"

bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

trthflqa_rank = build_ranks(bleu_max.merge(rougeL_max))
trhtflqa = []
cols = []

for i, row in bleu_max.merge(rougeL_max).iterrows():

    model_res = []

    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("nortruthfulqa_gen_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    trhtflqa.append(model_res)

df = pd.DataFrame(trhtflqa, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: trthflqa_rank[x])
df = df.set_index("Rank")
df.to_csv("agg_results/generation/nortruthfulqa_gen_nb.tsv", sep="\t", index=False)
# print_latex_df(df)

In [42]:
task = "mimir_bias"
mimir_bias = aggregate_by_skill(task, add_baselines=[], target_metric="pct_stereotype")
mimir_bias.to_csv("agg_results/ranking/mimir_bias.tsv", sep="\t", index=False)
# print_latex_df(mimir_bias)

In [47]:
task = "norsumm_nb"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nb (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df
# df.to_csv("agg_results/generation/norsumm_nb.tsv", sep="\t", index=False)
# print_latex_df(df)

Unnamed: 0_level_0,Model,bleu,delta bleu,rougeL,delta rougeL,bertscore_f1,delta bertscore_f1
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2,@extended,22.31,xmark,42.48,xmark,73.5,xmark
1,@extendedscratch,26.02,xmark,47.39,xmark,73.76,xmark
3,@base,19.92,xmark,39.78,xmark,72.38,xmark
4,@basescratch,19.23,xmark,37.62,xmark,71.4,xmark
11,@fiction,14.19,--5.0,30.86,--6.8,65.95,--5.5
8,@nonfiction,15.15,--4.1,32.63,--5.0,65.88,--5.5
7,@factual,15.22,--4.0,32.81,--4.8,66.93,--4.5
10,@newspapers,12.52,--6.7,29.84,--7.8,68.84,--2.6
8,@books,14.75,--4.5,31.87,--5.7,66.9,--4.5
5,@rightholders,17.86,--1.4,36.13,--1.5,68.57,--2.8


In [49]:
task = "norsumm_nn"

bertscore_f1_max = aggregate_by_skill(
    task, add_baselines=[], target_metric="bertscore_f1_max"
)
bertscore_f1_max["Rank_bertscore_f1_max"] = bertscore_f1_max.index.tolist()
bleu_max = aggregate_by_skill(task, add_baselines=[], target_metric="bleu_max")
bleu_max["Rank_bleu_max"] = bleu_max.index.tolist()
rougeL_max = aggregate_by_skill(task, add_baselines=[], target_metric="rougeL_max")
rougeL_max["Rank_rougeL_max"] = rougeL_max.index.tolist()

norsumm_nb_rank = build_ranks(bleu_max.merge(rougeL_max).merge(bertscore_f1_max))
norsumm_nb = []
cols = []

for i, row in bleu_max.merge(rougeL_max).merge(bertscore_f1_max).iterrows():
    model_res = []
    for k, v in dict(row).items():
        k = (
            k.replace("_max)", "")
            .replace("norsumm_nn (", "")
            .replace("delta (", "delta ")
        )
        if "Rank" in k:
            continue
        if k not in cols:
            cols.append(k)
        model_res.append(v)
    norsumm_nb.append(model_res)

df = pd.DataFrame(norsumm_nb, columns=cols)
df["Rank"] = df["Model"].apply(lambda x: norsumm_nb_rank[x])
df = df.set_index("Rank")
df.to_csv("agg_results/generation/norsumm_nn.tsv", sep="\t", index=False)
# print_latex_df(df)

### Multi-shot tasks

In [59]:
task = "norec_document_nb"

add_baselines = [["Random", 48.43], ["Constant", 40.12]]

ks = [0, 1, 4]

norec_document_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_document_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=add_baselines, overall=norec_document_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [123]:
from collections import deque


def reogranize(
    df, pretty_order=[pretty_model(m) for m in canonical_order], add_baselines=[]
):
    res_df = df.copy()
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for model in pretty_order:
        subset = res_df[res_df["Model"] == model]
        if model in ["Random", "Constant"]:
            subset["k"] = "xmark"
            res.append(pd.DataFrame([subset.iloc[0]]))
        else:
            res.append(subset)
    return pd.concat(res)


def reogranize_by_k(
    df,
    change_rank=True,
    change_cols=False,
    pretty_order=[pretty_model(m) for m in canonical_order],
    add_baselines=[],
):
    res = []
    if add_baselines:
        pretty_order.extend(add_baselines)
    for i, subset in df.groupby("k"):
        if change_rank:
            subset["Rank"] = subset.index.tolist()
        for model in pretty_order:
            k_subset = subset[subset["Model"] == model]
            if model != "@factual":
                k_subset["k"] = ""
            res.append(k_subset)
    res_df = pd.concat(res)  # .drop_duplicates(subset=["k", "Model", "Rank"])
    res_df = res_df.set_index("k")  # .drop_duplicates()
    res_cols = res_df.columns.tolist()
    if change_cols and res_cols[0] != "Rank":
        new_columns = deque(res_cols)
        new_columns.rotate(1)
        return res_df[list(new_columns)]
    return res_df

In [84]:
reogranize_by_k(norec_document_nb_df, add_baselines=["Random", "Constant"]).to_csv(
    "agg_results/clf/norec_document_nb.tsv", sep="\t", index=False
)
# print_latex_df()

In [85]:
task = "norec_sentence_nb"

add_baselines = [["Random", 48.52], ["Constant", 40.75]]

ks = [0, 1, 4, 16]

norec_sentence_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norec_sentence_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=add_baselines, overall=norec_sentence_nb[k], add_k=k
        )
        for k in ks
    ]
)

In [86]:
p1 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([0, 1])],
    add_baselines=["Random", "Constant"],
)
p2 = reogranize_by_k(
    norec_sentence_nb_df[norec_sentence_nb_df["k"].isin([4, 16])],
    add_baselines=["Random", "Constant"],
)

In [92]:
pd.concat([p1, p2]).to_csv(
    "agg_results/clf/norec_sentence_nb.tsv", sep="\t", index=False
)

In [155]:
# print_latex_df(p1)

In [157]:
# print_latex_df(p2)

In [93]:
task = "norquad_nb"

ks = [0, 1]

norquad_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

norquad_em_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="exact_match",
            overall=norquad_nb[k],
            add_k=k,
        )
        for k in ks
    ]
)

norquad_f1_df = pd.concat(
    [
        aggregate_by_skill(
            task, add_baselines=[], target_metric="f1", overall=norquad_nb[k], add_k=k
        )
        for k in ks
    ]
)

norquad_em_df["Rank"] = norquad_em_df.index.tolist()

norquad_nb_df = (
    norquad_em_df[
        ["Rank", "Model", "k", "norquad_nb (exact_match)", "delta (exact_match)"]
    ]
    .merge(norquad_f1_df, on=["Model", "k"])
    .set_index("Rank")
)

# print_latex_df(reogranize_by_k(norquad_nb_df, add_baselines=[]))

In [95]:
reogranize_by_k(norquad_nb_df, add_baselines=[]).to_csv(
    "agg_results/qa/norquad_nb.tsv", sep="\t", index=False
)

In [96]:
task = "noropenbookqa_nb"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nb_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=add_baselines,
            overall=noropenbookqa_nb[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [98]:
p1 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([0, 1])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p1)

In [99]:
p2 = reogranize_by_k(
    noropenbookqa_nb_df[noropenbookqa_nb_df["k"].isin([4, 16])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p2)

In [100]:
pd.concat([p1, p2]).to_csv("agg_results/qa/noropenbookqa_nb.tsv", sep="\t", index=False)

In [101]:
task = "noropenbookqa_nn"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=add_baselines,
            overall=noropenbookqa_nn[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [102]:
p1 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([0, 1])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p1)

In [103]:
p2 = reogranize_by_k(
    noropenbookqa_nn_df[noropenbookqa_nn_df["k"].isin([4, 16])],
    add_baselines=["Random", "Constant"],
)
# print_latex_df(p2)

In [105]:
pd.concat([p1, p2]).to_csv("agg_results/qa/noropenbookqa_nn.tsv", sep="\t", index=False)

In [107]:
task = "noropenbookqa_nb_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nb_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nb_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=add_baselines,
            overall=noropenbookqa_nb_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [83]:
import warnings

warnings.filterwarnings("ignore")

In [108]:
p1 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [109]:
p2 = reogranize_by_k(
    noropenbookqa_nb_use_fact_df[noropenbookqa_nb_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [110]:
pd.concat([p1, p2]).to_csv(
    "agg_results/qa/noropenbookqa_nb_use_fact.tsv", sep="\t", index=False
)

In [111]:
task = "noropenbookqa_nn_use_fact"
add_baselines = [
    ["Random", 25.00],
]

ks = [0, 1, 4, 16]

noropenbookqa_nn_use_fact = {
    k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks
}

noropenbookqa_nn_use_fact_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=add_baselines,
            overall=noropenbookqa_nn_use_fact[k],
            add_k=k,
            target_metric="acc",
        )
        for k in ks
    ]
)

In [112]:
p1 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([0, 4])],
    add_baselines=["Random"],
)
# print_latex_df(p1)

In [113]:
p2 = reogranize_by_k(
    noropenbookqa_nn_use_fact_df[noropenbookqa_nn_use_fact_df["k"].isin([1, 16])],
    add_baselines=["Random"],
)
# print_latex_df(p2)

In [114]:
pd.concat([p1, p2]).to_csv(
    "agg_results/qa/noropenbookqa_nn_use_fact.tsv", sep="\t", index=False
)

In [115]:
# 260
task = "ask_gec_nb"
ask_gec = pd.read_csv("ask_gec_nb/ask_gec_k_shot.tsv", sep="\t").rename(
    columns={"k": "k-shot"}
)
ask_gec["task"] = task

ask_gec = ask_gec[["task", "model", "prompt", "k-shot", "errant"]]

ask_gec_overall = {k: {task: subset} for k, subset in ask_gec.groupby("k-shot")}

ask_gec_df = pd.concat(
    [
        aggregate_by_skill(
            task,
            add_baselines=[],
            overall=ask_gec_overall[k],
            add_k=k,
            target_metric="errant",
        )
        for k in ask_gec_overall
    ]
)

In [125]:
p1 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([0, 1])], add_baselines=[], change_cols=True
)
# print_latex_df(p1)

In [128]:
p2 = reogranize_by_k(
    ask_gec_df[ask_gec_df["k"].isin([1, 16])], add_baselines=[], change_cols=True
)
# print_latex_df()

In [129]:
pd.concat([p1, p2]).to_csv(
    "agg_results/generation/ask_gec_nb.tsv", sep="\t", index=False
)

In [130]:
task2metric_bs = {
    "tatoeba_eng_nno_nn": ["bertscore_f1"],
    "tatoeba_nno_eng_nn": ["bertscore_f1"],
    "tatoeba_eng_nob_nb": ["bertscore_f1"],
    "tatoeba_nob_eng_nb": ["bertscore_f1"],
    "tatoeba_nob_nno_nb": ["bertscore_f1"],
    "tatoeba_nno_nob_nn": ["bertscore_f1"],
}

In [131]:
bertscore_mt = pd.read_csv("bertscore_mt_k_shot.tsv", sep="\t")

In [132]:
bertscore_mt.head(2)

Unnamed: 0,model,bertscore_f1,k,prompt,task
0,mimir-mistral-7b-base-scratch,86.471,0,prompt_0,tatoeba_eng_nno_nn
1,mimir-mistral-7b-base-scratch,79.985,0,prompt_2,tatoeba_eng_nno_nn


In [151]:
def get_mt_results(
    task,
    ks,
    bertscore_mt=bertscore_mt,
    task2metric_bs=task2metric_bs,
    change_cols=True,
    task2metric=task2metric,
):
    mt = {k: {task: collect_task_prompt_results(task, k=k, verbose=False)} for k in ks}

    mt_bs = {
        k: {task: subset}
        for k, subset in bertscore_mt[bertscore_mt["task"] == task].groupby("k")
    }

    mt_res = {}

    target_cols = [
        "Rank",
        "Model",
        "bleu",
        "delta bleu",
        "chrf",
        "delta chrf",
        "bertscore_f1",
        "delta bertscore_f1",
        "k",
    ]

    for k in ks:
        bleu = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="bleu",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        bleu["Rank_bleu"] = bleu.index.tolist()

        chrf = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="chrf",
            overall=mt[k],
            add_k=k,
            task2metric=task2metric,
        )
        chrf["Rank_chrf"] = chrf.index.tolist()

        bertscore = aggregate_by_skill(
            task,
            add_baselines=[],
            target_metric="bertscore_f1",
            overall=mt_bs[k],
            add_k=k,
            task2metric=task2metric_bs,
        )
        bertscore["Rank_bertscore"] = bertscore.index.tolist()

        merged = bleu.merge(chrf).merge(bertscore)
        ranks = build_ranks(merged)
        k_res, cols = [], []
        for i, row in merged.iterrows():
            model_res = []
            for key, v in dict(row).items():
                key = (
                    key.replace("_max)", "")
                    .replace(f"{task} (", "")
                    .replace("delta (", "delta ")
                    .rstrip(")")
                )
                if "Rank" in key:
                    continue
                if key not in cols:
                    cols.append(key)
                model_res.append(v)
            k_res.append(model_res)
        k_df = pd.DataFrame(k_res, columns=cols)
        k_df["Rank"] = k_df["Model"].apply(lambda x: ranks[x])
        mt_res[k] = k_df[target_cols]
    return mt_res

In [152]:
ks = [0, 1, 4, 16]

tatoeba_eng_nob_nb = get_mt_results(task="tatoeba_eng_nob_nb", ks=ks)

In [157]:
p1 = reogranize_by_k(tatoeba_eng_nob_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nob_nb[4], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nob_nb[1], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nob_nb[16], change_rank=False, add_baselines=[])

In [172]:
pd.concat([p1, p3, p2, p4]).to_csv(
    "agg_results/generation/tatoeba_eng_nob_nb.tsv", sep="\t", index=False
)

In [163]:
tatoeba_nob_eng_nb = get_mt_results(task="tatoeba_nob_eng_nb", ks=ks)

In [165]:
p1 = reogranize_by_k(tatoeba_nob_eng_nb[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nob_eng_nb[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nob_eng_nb[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nob_eng_nb[16], change_rank=False, add_baselines=[])

In [173]:
pd.concat([p1, p2, p3, p4]).to_csv(
    "agg_results/generation/tatoeba_nob_eng_nb.tsv", sep="\t", index=False
)

In [171]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [174]:
tatoeba_eng_nno_nn = get_mt_results(task="tatoeba_eng_nno_nn", ks=ks)

In [175]:
p1 = reogranize_by_k(tatoeba_eng_nno_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_eng_nno_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_eng_nno_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_eng_nno_nn[16], change_rank=False, add_baselines=[])

In [176]:
pd.concat([p1, p2, p3, p4]).to_csv(
    "agg_results/generation/tatoeba_eng_nno_nn.tsv", sep="\t", index=False
)

In [177]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [178]:
tatoeba_nno_eng_nn = get_mt_results(task="tatoeba_nno_eng_nn", ks=ks)

In [181]:
p1 = reogranize_by_k(tatoeba_nno_eng_nn[0], change_rank=False, add_baselines=[])
p2 = reogranize_by_k(tatoeba_nno_eng_nn[1], change_rank=False, add_baselines=[])
p3 = reogranize_by_k(tatoeba_nno_eng_nn[4], change_rank=False, add_baselines=[])
p4 = reogranize_by_k(tatoeba_nno_eng_nn[16], change_rank=False, add_baselines=[])

In [183]:
pd.concat([p1, p2, p3, p4]).to_csv(
    "agg_results/generation/tatoeba_nno_eng_nn.tsv", sep="\t", index=False
)

In [233]:
# print_latex_df(p1)
# print_latex_df(p2)
# print_latex_df(p3)
# print_latex_df(p4)

In [185]:
tatoeba_nob_nno_nb = get_mt_results(task="tatoeba_nob_nno_nb", ks=[0])
del tatoeba_nob_nno_nb[0]["k"]
p1 = reogranize(tatoeba_nob_nno_nb[0].set_index("Rank"), add_baselines=[])
p1.to_csv("agg_results/generation/tatoeba_nob_nno_nb.tsv", sep="\t", index=False)
# print_latex_df(p1)

In [188]:
import shutil

shutil.make_archive("agg_results", "zip", "agg_results")

'/pfs/lustrep3/scratch/project_465000498/vlad/mimir/mimir_results/agg_results.zip'

In [186]:
tatoeba_nno_nob_nn = get_mt_results(task="tatoeba_nno_nob_nn", ks=[0])
del tatoeba_nno_nob_nn[0]["k"]
p1 = reogranize(tatoeba_nno_nob_nn[0].set_index("Rank"), add_baselines=[])
p1.to_csv("agg_results/generation/tatoeba_nno_nob_nn.tsv", sep="\t", index=False)
print_latex_df(p1)

\begin{tabular}{llrlrlrl}
\toprule
{} &                        Model &   bleu & delta bleu &   chrf & delta chrf &  bertscore\_f1 & delta bertscore\_f1 \\
Rank &                              &        &            &        &            &               &                    \\
\midrule
7    &                    \extended &  10.49 &      \xmark &  42.58 &      \xmark &         87.49 &              \xmark \\
5    &             \extendedscratch &  12.50 &      \xmark &  47.18 &      \xmark &         88.00 &              \xmark \\
9    &                        \base &  10.96 &      \xmark &  43.29 &      \xmark &         84.74 &              \xmark \\
2    &                 \basescratch &  24.36 &      \xmark &  64.14 &      \xmark &         94.53 &              \xmark \\
8    &                     \fiction &   8.93 &     --15.4 &  38.74 &     --25.4 &         89.16 &              --5.4 \\
10   &                  \nonfiction &   9.79 &     --14.6 &  40.60 &     --23.5 &         85.84 &       