In [1]:
# Model identifiers to compare. Each one is passed to `from_pretrained`, and its
# last path component names the per-model directory under ./results/<benchmark>/.
model_names = [
    "loris3/stratified_10m_curriculum_random",
    "loris3/stratified_10m_curriculum_curriculum",
]

In [2]:
import subprocess
import os
from dotenv import load_dotenv
load_dotenv()

from transformers import AutoModel, AutoTokenizer
import json
import pandas as pd
import glob
# TODO: set the batch size to 1

def eval(model_name = "loris3/stratified_10m_curriculum_random", EVAL_REPO_PATH = "../evaluation-pipeline-2024"):
    """Return the BLiMP and EWoK results of ``model_name`` as a single DataFrame.

    Results are read from ``./results/blimp/<name>/blimp_results.json`` and
    ``./results/ewok/<name>/ewok_results.json``, where ``<name>`` is the last path
    component of ``model_name``. If either file is missing, the model and tokenizer
    are first loaded with ``from_pretrained`` (using ``./models/<name>`` as
    ``cache_dir``) and ``./eval.sh`` is run to produce the result files.

    NOTE(review): this function shadows the built-in ``eval``. The name is kept so
    existing callers keep working.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.
        EVAL_REPO_PATH: Path of the evaluation repository; handed to ``eval.sh`` as
            its first argument (as an absolute path).

    Returns:
        DataFrame indexed by ``metric`` with one row per entry of the ``"results"``
        mapping of both JSON files, plus a ``model`` column holding ``model_name``.
        An ``alias`` column is removed if present.

    Raises:
        NotImplementedError: if ``eval.sh`` exits with a non-zero status (exception
            type kept for backward compatibility).
        FileNotFoundError: if a result file is still missing afterwards.
    """
    model_dir_name = os.path.basename(model_name)
    blimp_out_file = os.path.join("./results/blimp/", model_dir_name, "blimp_results.json")
    ewok_out_file = os.path.join("./results/ewok/", model_dir_name, "ewok_results.json")

    # Only run the (expensive) evaluation when a result file is missing.
    if not (os.path.exists(blimp_out_file) and os.path.exists(ewok_out_file)):
        local_path = os.path.join("./models", model_dir_name)
        # Loaded only for the side effect of filling `cache_dir`; the objects
        # themselves are not used here.
        AutoModel.from_pretrained(model_name, cache_dir=local_path)
        AutoTokenizer.from_pretrained(model_name, cache_dir=local_path)

        # stdin is detached with DEVNULL: an open pipe that is never written or
        # closed would make a script that reads stdin block forever.
        completed = subprocess.run(
            [
                os.path.abspath("./eval.sh"),
                os.path.abspath(EVAL_REPO_PATH),
                os.path.abspath("./results"),
                os.path.abspath(local_path),
            ],
            stdin=subprocess.DEVNULL,
        )
        if completed.returncode != 0:
            raise NotImplementedError(
                f"eval.sh exited with status {completed.returncode} for model {model_name!r}"
            )

    with open(blimp_out_file) as f:
        blimp = json.load(f)
    with open(ewok_out_file) as f:
        ewok = json.load(f)

    # Each "results" mapping is {metric: {stat: value}}; transpose so that metrics
    # become rows and statistics become columns.
    df = pd.concat([pd.DataFrame.from_dict(blimp["results"]).T, pd.DataFrame.from_dict(ewok["results"]).T])
    df.index.name = "metric"
    df["model"] = model_name
    # Tolerate result files whose entries carry no "alias" key.
    df = df.drop(columns="alias", errors="ignore")
    return df


In [3]:
from scipy.stats import ttest_ind_from_stats, ttest_rel

In [4]:
def do_ttests(benchmark_name, files, model_names):
    """Run one-sided paired t-tests on per-sample accuracy between models.

    For every model, the sample files ``files`` (read in sorted order) from
    ``./results/<benchmark>/<model basename>/`` are concatenated, where
    ``<benchmark>`` is the part of ``benchmark_name`` before the first underscore.
    Models are ranked by mean ``acc`` (highest first) and compared in consecutive
    pairs (1st vs 2nd, 3rd vs 4th, ...), testing whether the higher-ranked model of
    each pair is better (``alternative="greater"``).

    Args:
        benchmark_name: Key of the returned dict, e.g. ``"blimp_filtered"``.
        files: File names that exist in every model's result directory.
        model_names: Model identifiers; only the last path component is used.

    Returns:
        ``{benchmark_name: [p-value per pair]}``.
    """
    benchmark_dir = os.path.join("./results/", benchmark_name.split("_")[0])

    ranked = []
    for model_name in model_names:
        model_dir = os.path.join(benchmark_dir, os.path.basename(model_name))
        frames = [pd.read_json(os.path.join(model_dir, file)) for file in sorted(files)]
        samples = pd.concat(frames)
        ranked.append((samples["acc"].mean(), samples["acc"]))
    # Best mean accuracy first, so the first member of every pair is the stronger model.
    ranked.sort(key=lambda entry: entry[0], reverse=True)

    p_values = []
    for (_, better_acc), (_, worse_acc) in zip(ranked[0::2], ranked[1::2]):
        p_values.append(ttest_rel(better_acc, worse_acc, alternative="greater").pvalue)
    return {benchmark_name: p_values}

In [5]:
def _list_result_files(benchmark, pattern, exclude=None):
    """Return the sorted base names of the first model's files matching ``pattern``.

    Files are looked up in ``./results/<benchmark>/<basename of model_names[0]>``.
    When ``exclude`` is given, files whose *base name* contains it are dropped. The
    check deliberately ignores the directory part, so a model directory that happens
    to contain the substring does not filter out every file.
    """
    results_dir = os.path.join("./results", benchmark, os.path.basename(model_names[0]))
    basenames = [os.path.basename(path) for path in sorted(glob.glob(os.path.join(results_dir, pattern)))]
    return [name for name in basenames if exclude is None or exclude not in name]


# Sample files per benchmark, taken from the first model's result directory; the
# same file names are later looked up for every model.
filenames_supplement = _list_result_files("blimp", "blimp_supplement_*.jsonl")
filenames_filtered = _list_result_files("blimp", "*.jsonl", exclude="supplement")
filenames_ewok = _list_result_files("ewok", "*.jsonl")


In [6]:
# Merge the per-benchmark p-value dicts into one mapping: benchmark name -> p-values.
p_vals = {
    **do_ttests("blimp_supplement", filenames_supplement, model_names),
    **do_ttests("blimp_filtered", filenames_filtered, model_names),
    **do_ttests("ewok_filtered", filenames_ewok, model_names),
}
p_vals

{'blimp_supplement': [1.9877681712805908e-08],
 'blimp_filtered': [2.4399063824300964e-15],
 'ewok_filtered': [0.026100260128861328]}

In [7]:
# Stack the result frames of all models (index: metric, one "model" column).
df = pd.concat([eval(model_name) for model_name in model_names])
# Keep only the three aggregate benchmark rows.
df_results = df[df.index.isin(["blimp_supplement", "blimp_filtered", "ewok_filtered"])]
# One row per model; pivoting on "metric" spreads the remaining value columns over
# a two-level column index.
df_results = df_results.reset_index().set_index("model").pivot(columns="metric")
# Swap the column levels so the metric name becomes the outer level.
df_results.columns = df_results.columns.swaplevel(0)
# Strip the ",none" suffix from the inner (statistic) labels.
df_results.columns = pd.MultiIndex.from_tuples([(c, b.replace(",none", "")) for c,b in df_results.columns])
# Rebuild the frame group by group so that all columns of a metric sit side by side.
df_results = pd.concat(
    {group: df_results.xs(group, axis=1, level=0) for group in df_results.columns.get_level_values(0).unique()},
    axis=1,
)
df_results

Unnamed: 0_level_0,blimp_filtered,blimp_filtered,blimp_supplement,blimp_supplement,ewok_filtered,ewok_filtered
Unnamed: 0_level_1,acc,acc_stderr,acc,acc_stderr,acc,acc_stderr
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
loris3/stratified_10m_curriculum_curriculum,0.596632,0.00183,0.559653,0.006679,0.635073,0.004923
loris3/stratified_10m_curriculum_random,0.611433,0.001774,0.593624,0.006536,0.646186,0.004838


In [8]:
# Show one table per metric: models ranked by accuracy, with the paired t-test
# p-value next to the lower-ranked model of each compared pair.
# Iterating over the unique outer column labels (instead of every second column)
# does not depend on each metric having exactly two statistic columns.
for metric in df_results.columns.get_level_values(0).unique():
    df = df_results[[metric]].sort_values(by=(metric, "acc"), ascending=False)
    # `do_ttests` compares consecutive pairs of the accuracy-ranked models
    # (1st vs 2nd, 3rd vs 4th, ...), so p-value i belongs to row 2*i + 1; every
    # other row stays NA.
    # NOTE(review): this assumes the ranking by reported "acc" matches the ranking
    # by mean per-sample accuracy used in `do_ttests` - confirm.
    p_column = [pd.NA] * len(df)
    for pair_idx, p_value in enumerate(p_vals[metric]):
        p_column[2 * pair_idx + 1] = p_value
    df[(metric, "p")] = p_column
    display(df)

Unnamed: 0_level_0,blimp_filtered,blimp_filtered,blimp_filtered
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_random,0.611433,0.001774,
loris3/stratified_10m_curriculum_curriculum,0.596632,0.00183,0.0


Unnamed: 0_level_0,blimp_supplement,blimp_supplement,blimp_supplement
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_random,0.593624,0.006536,
loris3/stratified_10m_curriculum_curriculum,0.559653,0.006679,0.0


Unnamed: 0_level_0,ewok_filtered,ewok_filtered,ewok_filtered
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_random,0.646186,0.004838,
loris3/stratified_10m_curriculum_curriculum,0.635073,0.004923,0.0261
