In [1]:
import subprocess
import os
from dotenv import load_dotenv
load_dotenv()

from transformers import RobertaForMaskedLM, RobertaTokenizerFast
import json
import pandas as pd
import glob


import numpy as np
from transformers import AutoTokenizer, AutoModelForMaskedLM

import sys
sys.path.append("../UnMasked")

from unmasked.mlm.scoring import mlm_score_model_on_paradigm
from unmasked.holistic.scoring import holistic_score_model_on_paradigm
from unmasked import configs
from unmasked.utils import calc_accuracy_from_scores
# TODO batch size to 1

def eval(model_name = "loris3/stratified_10m_curriculum_random", EVAL_REPO_PATH = "../evaluation-pipeline-2024"):
    blimp_out_file = os.path.join("./results/blimp/", os.path.basename(model_name),"blimp_results.json")
    ewok_out_file = os.path.join("./results/ewok/", os.path.basename(model_name),"ewok_results.json")
    zorro_out_file = os.path.join("./results/zorro/", os.path.basename(model_name),"zorro.pkl")

    if (not os.path.exists(blimp_out_file)) or (not os.path.exists(ewok_out_file)):
        print("Running", os.path.basename(model_name))
        local_path = os.path.join("./models", os.path.basename(model_name))
        RobertaForMaskedLM.from_pretrained(model_name, cache_dir=local_path).save_pretrained(local_path)
        RobertaTokenizerFast.from_pretrained(model_name, cache_dir=local_path).save_pretrained(local_path)

        shellscript = subprocess.Popen([os.path.abspath("./eval.sh"), os.path.abspath(EVAL_REPO_PATH), os.path.abspath("./results"), os.path.abspath(local_path)], stdin=subprocess.PIPE)

        shellscript.wait()

        if shellscript.returncode != 0:
            raise NotImplementedError
    blimp = None
    with open(blimp_out_file) as f:
        blimp = json.load(f)
    ewok = None
    with open(ewok_out_file) as f:
        ewok = json.load(f)


    if (not os.path.exists(zorro_out_file)):
        os.makedirs(os.path.dirname(zorro_out_file))
        MODEL_REPO = model_name  # name of huggingface model hub repository
        LOWER_CASE = False  # should model be evaluated on lower-cased input?
        TEST_SUITE_NAME = ['zorro', 'blimp'][0]

        if TEST_SUITE_NAME == 'blimp':
            num_expected_scores = 2000
        elif TEST_SUITE_NAME == 'zorro':
            num_expected_scores = 4000
        else:
            raise AttributeError('Invalid "TEST_SUITE_NAME".')


        # each iteration in this "for" loop produces 1 row in the data-frame
        for scoring_method in ['mlm']:#, 'holistic']:

            if scoring_method == 'mlm':
                score_model_on_paradigm = mlm_score_model_on_paradigm
            elif scoring_method == 'holistic':
                score_model_on_paradigm = holistic_score_model_on_paradigm
            else:
                raise AttributeError('Invalid scoring_method.')

            # load from repo
            tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO,
                                                    #   add_prefix_space=True,  # this must be True for BabyBERTa
                                                    )
            model = AutoModelForMaskedLM.from_pretrained(MODEL_REPO)

            model.eval()
            model.cuda(0)

            # for each paradigm in test suite
            accuracies = []
            for path_paradigm in (configs.Dirs.test_suites / TEST_SUITE_NAME).glob('*.txt'):

                # scoring
                print(f"Scoring {path_paradigm.name:<60} with {MODEL_REPO:<40} and method={scoring_method}")
                scores = score_model_on_paradigm(model, tokenizer, path_paradigm, lower_case=LOWER_CASE)

                assert len(scores) == num_expected_scores

                # compute accuracy
                accuracy = calc_accuracy_from_scores(scores, scoring_method)

                # collect
                accuracies.append(accuracy)
            df_zorro = pd.DataFrame.from_dict({"zorro":{
                                                "acc,none":  np.mean(accuracies),
                                                "acc_stderr,none": np.std(accuracies)}} ).T
            df_zorro.to_pickle(zorro_out_file)

            


    df = pd.concat([pd.DataFrame.from_dict(blimp["results"]).T, 
                        pd.DataFrame.from_dict(ewok["results"]).T, 
                        pd.read_pickle(zorro_out_file)])
        
    df.index.name="metric"
    df["model"] = model_name
    df = df.drop("alias", axis=1)
    return df


In [2]:
from huggingface_hub import list_models

model_names = [model.id for model in list_models(author="loris3") if "stratified" in model.id]

In [None]:
df = pd.concat([eval(model_name) for model_name in model_names])

Scoring argument_structure-dropped_argument.txt                      with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring filler-gap-wh_question_object.txt                            with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring anaphor_agreement-pronoun_gender.txt                         with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring agreement_subject_verb-in_question_with_aux.txt              with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring local_attractor-in_question_with_aux.txt                     with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring agreement_subject_verb-in_simple_question.txt                with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring argument_structure-transitive.txt                            with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring npi_licensing-matrix_question.txt                            with loris3/stratifie



Scoring irregular-verb.txt                                           with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring agreement_subject_verb-across_relative_clause.txt            with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring argument_structure-swapped_arguments.txt                     with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring ellipsis-n_bar.txt                                           with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring binding-principle_a.txt                                      with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring npi_licensing-only_npi_licensor.txt                          with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring island-effects-coordinate_structure_constraint.txt           with loris3/stratified_10m_curriculum_random  and method=mlm
Scoring agreement_determiner_noun-between_neighbors.txt              with loris3/stratifie



Scoring irregular-verb.txt                                           with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring agreement_subject_verb-across_relative_clause.txt            with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring argument_structure-swapped_arguments.txt                     with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring ellipsis-n_bar.txt                                           with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring binding-principle_a.txt                                      with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring npi_licensing-only_npi_licensor.txt                          with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring island-effects-coordinate_structure_constraint.txt           with loris3/stratified_10m_curriculum_curriculum and method=mlm
Scoring agreement_determiner_noun-between_neighbors.txt              



Scoring irregular-verb.txt                                           with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring agreement_subject_verb-across_relative_clause.txt            with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring argument_structure-swapped_arguments.txt                     with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring ellipsis-n_bar.txt                                           with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring binding-principle_a.txt                                      with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring npi_licensing-only_npi_licensor.txt                          with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring island-effects-coordinate_structure_constraint.txt           with loris3/stratified_10m_curriculum_lognorm and method=mlm
Scoring agreement_determiner_noun-between_neighbors.txt              with loris3/stratifie

In [4]:
from scipy.stats import ttest_ind_from_stats, ttest_rel

In [None]:
def do_ttests(benchmark_name, files, model_names):
    dfs = [pd.concat([pd.read_json(os.path.join("./results/",benchmark_name.split("_")[0], os.path.basename(model_name),file)) for file in sorted(files)]) for model_name in model_names]
    data = sorted([(df["acc"].mean(),df["acc"]) for df in dfs], key= lambda l : l[0], reverse=True)
    return {benchmark_name:[ttest_rel(a,b, alternative="greater").pvalue for (mean_a,a),(mean_b,b) in zip(data, data[1:])]}

In [14]:
filenames_supplement  = [os.path.basename(file) for file in sorted(glob.glob(os.path.join(os.path.join("./results/blimp", os.path.basename(model_names[0])), "blimp_supplement_*.jsonl")))]
filenames_filtered  = [os.path.basename(file)  for file in sorted(glob.glob(os.path.join(os.path.join("./results/blimp", os.path.basename(model_names[0])), "*.jsonl"))) if "supplement" not in file]
filenames_ewok  = [os.path.basename(file) for file in sorted(glob.glob(os.path.join(os.path.join("./results/ewok", os.path.basename(model_names[0])), "*.jsonl")))]


In [15]:
df

Unnamed: 0_level_0,"acc,none","acc_stderr,none",model
metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
blimp_supplement,0.593624,0.006536,loris3/stratified_10m_curriculum_random
blimp_supplement_hypernym,0.527316,0.017216,loris3/stratified_10m_curriculum_random
blimp_supplement_qa_congruence_easy,0.65625,0.059839,loris3/stratified_10m_curriculum_random
blimp_supplement_qa_congruence_tricky,0.527273,0.038985,loris3/stratified_10m_curriculum_random
blimp_supplement_subject_aux_inversion,0.689423,0.007442,loris3/stratified_10m_curriculum_random
...,...,...,...
ewok_social-interactions_filtered,0.608844,0.02851,loris3/stratified_10m_curriculum_lognorm
ewok_social-properties_filtered,0.618902,0.026857,loris3/stratified_10m_curriculum_lognorm
ewok_social-relations_filtered,0.809432,0.009986,loris3/stratified_10m_curriculum_lognorm
ewok_spatial-relations_filtered,0.906122,0.013189,loris3/stratified_10m_curriculum_lognorm


In [16]:
p_vals = \
do_ttests("blimp_supplement",filenames_supplement, model_names) | \
do_ttests("blimp_filtered",filenames_filtered, model_names) | \
do_ttests("ewok_filtered",filenames_ewok, model_names)
p_vals

{'blimp_supplement': [7.590704819910127e-08, 1.9877681712805908e-08],
 'blimp_filtered': [0.466450505365944, 2.5340507688717866e-15],
 'ewok_filtered': [0.05926213499380416, 0.026100260128861328]}

In [17]:
do_ttests("ewok_filtered",filenames_ewok, model_names)

{'ewok_filtered': [0.05926213499380416, 0.026100260128861328]}

In [19]:

df_results = df[df.index.isin(["blimp_supplement", "blimp_filtered", "ewok_filtered","zorro"])]#.set_index("model", append=True).groupby("model").apply(lambda a: a[:])
df_results = df_results.reset_index().set_index("model").pivot(columns="metric")
df_results.columns = df_results.columns.swaplevel(0)
df_results.columns = pd.MultiIndex.from_tuples([(c, b.replace(",none", "")) for c,b in df_results.columns])
df_results = pd.concat(
    {group: df_results.xs(group, axis=1, level=0) for group in df_results.columns.get_level_values(0).unique()},
    axis=1,
)
df_results

Unnamed: 0_level_0,blimp_filtered,blimp_filtered,blimp_supplement,blimp_supplement,ewok_filtered,ewok_filtered,zorro,zorro
Unnamed: 0_level_1,acc,acc_stderr,acc,acc_stderr,acc,acc_stderr,acc,acc_stderr
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
loris3/stratified_10m_curriculum_curriculum,0.596632,0.00183,0.559653,0.006679,0.635073,0.004923,79.534783,13.596438
loris3/stratified_10m_curriculum_lognorm,0.611121,0.001774,0.593484,0.006342,0.67437,0.00484,84.491304,20.331693
loris3/stratified_10m_curriculum_random,0.611433,0.001774,0.593624,0.006536,0.646186,0.004838,46.430435,19.246929


In [20]:
pd.DataFrame(df_results[df_results.columns[0::2]].mean(axis=1).sort_values(ascending=False))

Unnamed: 0_level_0,0
model,Unnamed: 1_level_1
loris3/stratified_10m_curriculum_lognorm,21.59257
loris3/stratified_10m_curriculum_curriculum,20.331535
loris3/stratified_10m_curriculum_random,12.070419


In [None]:
p_vals

{'blimp_supplement': [0.4747132271757554,
  0.3004397308194905,
  0.0009588872761548292,
  0.3243917504107078,
  0.36507420328216866,
  2.1031225481428053e-05,
  0.294396922570642,
  3.1146923250438456e-09,
  0.46626128823151514],
 'blimp_filtered': [0.11788989535844516,
  0.466450505365944,
  3.2760058483410295e-08,
  0.2688501520239526,
  0.00500298079454132,
  6.352886483779778e-05,
  0.2092155824387692,
  1.5486834543173893e-24,
  4.413455238272855e-12],
 'ewok_filtered': [0.0002150092335641699,
  0.23917834797066245,
  0.45857072042718916,
  0.4012425262313602,
  0.44352411629586136,
  0.3809385139362454,
  0.3028725827520253,
  0.4033518053087544,
  0.12544322432293803]}

In [None]:
for col in df_results.columns[0::2]:
    metric = col[0]
    df =df_results[[col[0]]].sort_values(by=df_results[[col]].columns[0], ascending=False)
    df[(metric,"p")] = [pd.NA] + p_vals[metric]
    display(df)

Unnamed: 0_level_0,blimp_filtered,blimp_filtered,blimp_filtered
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_influential_examples_first_half,0.613203,0.001766,
loris3/stratified_10m_curriculum_random,0.611433,0.001774,0.11789
loris3/stratified_10m_curriculum_lognorm,0.611121,0.001774,0.466451
loris3/stratified_10m_curriculum_dirac,0.602535,0.00178,0.0
loris3/stratified_10m_curriculum_lognorm_top_50_pct_twice_shuffled,0.60137,0.001812,0.26885
loris3/stratified_10m_curriculum_curriculum,0.596632,0.00183,0.005003
loris3/stratified_10m_curriculum_dirac_positive_only,0.588921,0.001839,6.4e-05
loris3/stratified_10m_curriculum_top_90_full_epoch_shuffled,0.587918,0.001854,0.209216
loris3/stratified_10m_curriculum_lognorm_top_50_pct_shuffled,0.570286,0.001852,0.0
loris3/stratified_10m_curriculum_lognorm_top_50_pct,0.56007,0.001871,0.0


Unnamed: 0_level_0,blimp_supplement,blimp_supplement,blimp_supplement
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_dirac,0.628778,0.006218,
loris3/stratified_10m_curriculum_dirac_positive_only,0.620367,0.006206,0.474713
loris3/stratified_10m_curriculum_top_90_full_epoch_shuffled,0.617415,0.006388,0.30044
loris3/stratified_10m_curriculum_influential_examples_first_half,0.6174,0.006362,0.000959
loris3/stratified_10m_curriculum_lognorm_top_50_pct,0.604583,0.006569,0.324392
loris3/stratified_10m_curriculum_lognorm_top_50_pct_twice_shuffled,0.602646,0.006229,0.365074
loris3/stratified_10m_curriculum_random,0.593624,0.006536,2.1e-05
loris3/stratified_10m_curriculum_lognorm,0.593484,0.006342,0.294397
loris3/stratified_10m_curriculum_lognorm_top_50_pct_shuffled,0.576253,0.006687,0.0
loris3/stratified_10m_curriculum_curriculum,0.559653,0.006679,0.466261


Unnamed: 0_level_0,ewok_filtered,ewok_filtered,ewok_filtered
Unnamed: 0_level_1,acc,acc_stderr,p
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
loris3/stratified_10m_curriculum_influential_examples_first_half,0.687686,0.004736,
loris3/stratified_10m_curriculum_lognorm,0.67437,0.00484,0.000215
loris3/stratified_10m_curriculum_dirac,0.666974,0.004916,0.239178
loris3/stratified_10m_curriculum_lognorm_top_50_pct,0.664983,0.004902,0.458571
loris3/stratified_10m_curriculum_lognorm_top_50_pct_twice_shuffled,0.660918,0.004912,0.401243
loris3/stratified_10m_curriculum_lognorm_top_50_pct_shuffled,0.660767,0.004945,0.443524
loris3/stratified_10m_curriculum_top_90_full_epoch_shuffled,0.660534,0.004878,0.380939
loris3/stratified_10m_curriculum_dirac_positive_only,0.654016,0.004857,0.302873
loris3/stratified_10m_curriculum_random,0.646186,0.004838,0.403352
loris3/stratified_10m_curriculum_curriculum,0.635073,0.004923,0.125443
