In [1]:
import os
import re
import json

# Result directories are expected to be named
# "<model>-ckpt<step>_<task>_<suffix>", with <model> starting with "bert_".
# Capture groups: 1 = model, 2 = checkpoint step (digits), 3 = task.
RE_PATH = r"(bert_.*)-ckpt(\d+)_(.*)_.*"

# Keys to read from each task's results JSON; tasks not listed here are skipped.
metrics = {
    'cola': ["eval_matthews_correlation"],
    'qnli': ["eval_accuracy"],
    'qqp': ["eval_accuracy", "eval_f1", "eval_combined_score"],
    'mnli': ["eval_accuracy", "eval_accuracy_mm"],
    'mrpc': ["eval_accuracy", "eval_combined_score", "eval_f1"],
    'rte': ["eval_accuracy"],
    'sst2': ["eval_accuracy"],
    'stsb': ["eval_combined_score", "eval_pearson", "eval_spearmanr"],
    'wnli': ["eval_accuracy"],
}

RESULTS_DIR = "./Exp3_results"


def collect_scores(results_dir):
    """Walk ``results_dir`` and return one record per (run directory, metric).

    Every directory that contains files and whose path matches ``RE_PATH``
    contributes one dict per key listed in ``metrics`` for its task, with the
    fields ``model``, ``task``, ``ckpt`` (kept as the string parsed from the
    path), ``metric`` and ``score``.

    Directories whose path does not match ``RE_PATH`` or whose task is not in
    ``metrics`` are skipped.

    Raises:
        json.JSONDecodeError: if the file that is read is not valid JSON.
        KeyError: if an expected metric key is missing from the JSON.
    """
    records = []
    for root, _dirs, files in os.walk(results_dir):
        if not files:
            continue
        match = re.search(RE_PATH, root)
        if match is None:
            # A directory holding files that is not a run directory (for
            # example a stray file at the top level) must not abort the scan.
            continue
        model, ckpt, task = match.groups()
        if task not in metrics:
            continue
        # Only the first file listed by os.walk is read.
        # NOTE(review): assumes each run directory holds a single results JSON.
        with open(os.path.join(root, files[0])) as f:
            results = json.load(f)
        records.extend(
            {
                'model': model,
                'task': task,
                'ckpt': ckpt,
                'metric': metric,
                'score': results[metric],
            }
            for metric in metrics[task]
        )
    return records


data = collect_scores(RESULTS_DIR)

In [4]:
import pandas as pd

# Reshape the long-format records into a wide table: one row per
# (model, ckpt) and one column per (task, metric). `ckpt` arrives as a
# string parsed from the directory name, so it is cast to int32 first.
df = (
    pd.DataFrame(data)
    .astype({'ckpt': 'int32'})
    .pivot(
        index=['model', 'ckpt'],
        columns=['task', 'metric'],
        values='score',
    )
)

# Persist the wide table for later use.
df.to_feather('exp3_glue_scores.arrow')

In [5]:
# Show the pivoted score table (rows: model/ckpt, columns: task/metric) as the cell output.
df

Unnamed: 0_level_0,task,stsb,stsb,stsb,mnli,mnli,mrpc,mrpc,mrpc,cola,wnli,qnli,rte,sst2,qqp,qqp,qqp
Unnamed: 0_level_1,metric,eval_combined_score,eval_pearson,eval_spearmanr,eval_accuracy,eval_accuracy_mm,eval_accuracy,eval_combined_score,eval_f1,eval_matthews_correlation,eval_accuracy,eval_accuracy,eval_accuracy,eval_accuracy,eval_accuracy,eval_f1,eval_combined_score
model,ckpt,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
bert_base_pairwise_20240204-093411,6000,0.264303,0.268919,0.259687,0.700458,0.699349,0.666667,0.71147,0.756272,0.135211,0.183099,0.755446,0.534296,0.824541,0.85256,0.812352,0.832456
bert_base_pairwise_20240204-093411,8000,0.382209,0.386848,0.37757,0.709322,0.709622,0.669118,0.713807,0.758497,0.126047,0.211268,0.77009,0.527076,0.817661,0.864556,0.824284,0.84442
bert_base_pairwise_20240204-093411,10000,0.529593,0.533869,0.525317,0.713602,0.718775,0.661765,0.705428,0.749091,0.114511,0.225352,0.790225,0.534296,0.81078,0.870443,0.829403,0.849923
bert_base_pairwise_20240204-093411,12000,0.616176,0.617254,0.615099,0.714722,0.714199,0.666667,0.712766,0.758865,0.095727,0.225352,0.801574,0.523466,0.821101,0.873732,0.83387,0.853801
bert_base_pairwise_20240204-093411,14000,0.698375,0.699896,0.696853,0.730107,0.742779,0.659314,0.706211,0.753108,0.103545,0.394366,0.829398,0.534296,0.825688,0.876107,0.836973,0.85654
bert_base_pairwise_20240204-093411,16000,0.824799,0.826162,0.823435,0.744778,0.750509,0.710784,0.757059,0.803333,0.137759,0.323944,0.83013,0.541516,0.84289,0.883428,0.844737,0.864083
bert_base_pairwise_20240204-093411,18000,0.844538,0.84651,0.842567,0.754967,0.764341,0.715686,0.7602,0.804714,0.177195,0.352113,0.824638,0.534296,0.865826,0.889612,0.853849,0.871731
bert_base_pairwise_20240204-093411,20000,0.849193,0.85051,0.847877,0.757412,0.766985,0.720588,0.76401,0.807432,0.149432,0.366197,0.837818,0.555957,0.872706,0.893594,0.858673,0.876133
bert_base_pairwise_20240204-093411,22000,0.841395,0.843245,0.839545,0.764238,0.773291,0.781863,0.815889,0.849916,0.196526,0.323944,0.838184,0.527076,0.877294,0.891541,0.855448,0.873494
bert_base_pairwise_20240204-093411,24000,0.843558,0.845415,0.841701,0.77351,0.774308,0.772059,0.806813,0.841567,0.281467,0.28169,0.838916,0.574007,0.879587,0.893965,0.859521,0.876743
