In [20]:
import os
import pandas as pd
from benchbench.data import cardinal_benchmark_list
from benchbench.data import load_cardinal_benchmark
from benchbench.utils.base import rankdata
from benchbench.measures.cardinal import get_sensitivity

# Column that carries the model identifier in each cardinal benchmark table.
# ``None`` marks a table whose identifiers are taken from the DataFrame index.
model_col_name = {
    "BigBenchHard": "Model",
    "GLUE": "Model",
    "MMLU": "model_name",
    "MTEB": "Model",
    "OpenLLM": "Model",
    "SuperGLUE": "Model",
    "VTAB": None,
}

# Output folders: one CSV per benchmark before and after re-weighting.
dir_out_before, dir_out_after = "./data/before", "./data/after"
for out_dir in (dir_out_before, dir_out_after):
    os.makedirs(out_dir, exist_ok=True)


def _strip_commas(name):
    """Return ``name`` with every comma replaced by a space."""
    return name.replace(",", " ")


for bench in cardinal_benchmark_list:
    data, cols = load_cardinal_benchmark(bench, do_rerank=False)
    cols = list(cols)

    # ---- "before" table: rank models by the plain mean of the score columns.
    data["Average"] = data[cols].mean(axis=1)
    data["Rank"] = rankdata(-data["Average"])  # negated so a higher average ranks first

    mc = model_col_name[bench]
    data["Model"] = data.index if mc is None else data[mc]
    # Commas in model names are replaced so they cannot clash with the CSV delimiter.
    data["Model"] = data["Model"].apply(_strip_commas)

    data = data.sort_values("Rank").reset_index()
    before_view = data[["Model", "Rank", "Average"] + cols]
    before_view.to_csv(f"{dir_out_before}/{bench}.csv", index=False)

    # ---- "after" table: scale each score column by the weight returned by
    # get_sensitivity and rank again on the mean of the scaled columns.
    sensitivity, weights = get_sensitivity(data, cols, return_weight=True)
    new_cols = ["%s (× %.4lf)" % pair for pair in zip(cols, weights)]

    weighted_df = data[cols].multiply(weights)
    weighted_df.columns = new_cols
    data = pd.concat([data, weighted_df], axis=1)

    # Keep the unweighted results next to the re-weighted ones for comparison.
    data["Rank-Before"] = data["Rank"]
    data["Average-Before"] = data["Average"]
    data["Average"] = data[new_cols].mean(axis=1)
    data["Rank"] = rankdata(-data["Average"])

    data = data.sort_values("Rank").reset_index()
    after_view = data[
        ["Model", "Rank", "Rank-Before", "Average", "Average-Before"] + new_cols
    ]
    after_view.to_csv(f"{dir_out_after}/{bench}.csv", index=False)

    print(bench, "%.2lf, %.2lf" % (sensitivity[0], sensitivity[1]))

GLUE 0.12, 0.73
SuperGLUE 0.11, 0.30
OpenLLM 0.46, 0.89
MMLU 0.34, 0.93
BigBenchHard 0.25, 0.80
MTEB 0.18, 0.61
VTAB 0.07, 0.27


In [18]:
import os
import math
import numpy as np
from benchbench.data import load_ordinal_benchmark
from benchbench.utils.base import rankdata
from benchbench.utils.win_rate import WinningRate
from benchbench.measures.ordinal import get_sensitivity


# Ordinal benchmarks exported by this cell.
ordinal_benchmark_list = [
    "BigCode",
    "HELM-accuracy",
    "HELM-fairness",
    "HELM-robustness",
    "HEIM-alignment_auto",
    "HEIM-quality_auto",
    "HEIM-aesthetics_auto",
    "HEIM-alignment_human",
    "HEIM-nudity",
    "HEIM-quality_human",
    "HEIM-aesthetics_human",
]

# Column that carries the model identifier in each ordinal benchmark table.
model_col_name = {
    "BigCode": "Models",
    "HELM-accuracy": "Model/adapter",
    "HELM-fairness": "Model",
    "HELM-robustness": "Model",
    "HEIM-alignment_auto": "Model/adapter",
    "HEIM-quality_auto": "Model/adapter",
    "HEIM-aesthetics_auto": "Model/adapter",
    "HEIM-alignment_human": "Model/adapter",
    "HEIM-nudity": "Model/adapter",
    "HEIM-quality_human": "Model/adapter",
    "HEIM-aesthetics_human": "Model/adapter",
}

# Output folders: one CSV per benchmark, "before" and "after" the model set
# returned by get_sensitivity is applied.
dir_out_before = "./data/before"
dir_out_after = "./data/after"
os.makedirs(dir_out_before, exist_ok=True)
os.makedirs(dir_out_after, exist_ok=True)
for bench in ordinal_benchmark_list:
    data, cols = load_ordinal_benchmark(bench, do_rerank=True)
    cols = list(cols)

    # The first fifth of the rows (by position) are the models whose ranking is tracked.
    inv_indices = np.arange(len(data) // 5)
    inv_data = data.iloc[inv_indices].copy()

    # "Before" table: winning rate and rank of the tracked models.
    win_rate_calculator = WinningRate(data, cols)
    inv_data["WinRate"] = win_rate_calculator.get_winning_rate(inv_indices)
    inv_data["Rank"] = rankdata(-inv_data["WinRate"])  # higher winning rate ranks first
    mc = model_col_name[bench]
    inv_data["Model"] = inv_data[mc]
    # Commas in model names are replaced so they cannot clash with the CSV delimiter.
    inv_data["Model"] = inv_data["Model"].apply(lambda s: s.replace(",", " "))

    # get_sensitivity also returns the row positions it selected (new_indices).
    sensitivity, new_indices = get_sensitivity(
        data, cols, inv_indices=inv_indices, return_indices=True
    )
    new_data = data.iloc[new_indices].copy()

    new_data["Model"] = new_data[mc]
    new_data["Model"] = new_data["Model"].apply(lambda s: s.replace(",", " "))

    # The lookups below index inv_data by row position, so they must run
    # before inv_data is sorted further down. Rows of new_data that are not
    # tracked models get NaN in every rank / winning-rate column.
    new_data["Rank-Before"] = [
        inv_data["Rank"].values[i] if i in inv_indices else math.nan for i in new_indices
    ]
    new_data["WinRate-Before"] = [
        inv_data["WinRate"].values[i] if i in inv_indices else math.nan for i in new_indices
    ]
    # NOTE(review): assumes get_winning_rate(new_indices) returns an array that
    # can be indexed by the original row positions in inv_indices -- confirm
    # against WinningRate.get_winning_rate.
    new_win_rate = win_rate_calculator.get_winning_rate(new_indices)[inv_indices]
    new_data["WinRate"] = [
        new_win_rate[i] if i in inv_indices else math.nan for i in new_indices
    ]
    new_rank = rankdata(-new_win_rate)
    new_data["Rank"] = [new_rank[i] if i in inv_indices else math.nan for i in new_indices]

    # "After" table.
    new_data = new_data.sort_values("Rank")
    new_data = new_data.reset_index()
    new_data[
        ["Model", "Rank", "Rank-Before", "WinRate", "WinRate-Before"] + cols
    ].to_csv(f"{dir_out_after}/{bench}.csv", index=False)

    # "Before" table. The original wrote to an undefined name `dir_out`, which
    # raises NameError on a fresh kernel; the intended target is dir_out_before.
    inv_data = inv_data.sort_values("Rank")
    inv_data = inv_data.reset_index()
    inv_data[["Model", "Rank", "WinRate"] + cols].to_csv(
        f"{dir_out_before}/{bench}.csv", index=False
    )

    print(bench, "%.2lf, %.2lf" % (sensitivity[0], sensitivity[1]))


Episode 999, loss 1.92
BigCode 0.04, 0.14
Episode 999, loss 3.44
HELM-accuracy 0.32, 0.67
Episode 999, loss 4.67
HELM-fairness 0.18, 0.58
Episode 999, loss 3.70
HELM-robustness 0.26, 0.58
Episode 999, loss 0.60
HEIM-alignment_auto 0.30, 0.50
Episode 999, loss 1.00
HEIM-quality_auto 0.10, 0.25
Episode 999, loss 0.39
HEIM-aesthetics_auto 0.40, 0.50
Episode 999, loss 1.37
HEIM-alignment_human 0.10, 0.25
Episode 999, loss 0.26
HEIM-nudity 0.39, 0.62
Episode 999, loss 0.52
HEIM-quality_human 0.10, 0.25
Episode 999, loss 0.18
HEIM-aesthetics_human 0.10, 0.25


In [2]:
# Print the current working directory via a shell escape.
# NOTE(review): the saved output of this cell exposes an absolute local path;
# clear the output before sharing the notebook.
!pwd

/Users/ghzhang/Codes/benchmarking_aggregation_icml2023/benchmark


In [3]:
import joblib as jbl

# Load four joblib files from ./data -- presumably previously saved diversity
# and sensitivity results for the cardinal and ordinal benchmarks (verify
# against the code that writes them).
# SECURITY: joblib.load unpickles its input and can therefore execute arbitrary
# code; only load files produced by a trusted run of this project.
# NOTE(review): the recorded output of this cell is "KeyError: 236". The cause
# is not visible here -- possibly the files were written with an incompatible
# joblib / numpy / Python version. Confirm, and regenerate the files if so.
cardinal_diversity = jbl.load("./data/cardinal_diversity.jbl")
cardinal_sensitivity = jbl.load("./data/cardinal_sensitivity.jbl")
ordinal_diversity = jbl.load("./data/ordinal_diversity.jbl")
ordinal_sensitivity = jbl.load("./data/ordinal_sensitivity.jbl")

KeyError: 236