In [1]:
import os
import sys
import json
import pandas as pd

from collections import defaultdict

In [2]:
# GLUE tasks whose experiment results are collected below.
glue = ["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2"]

# Maps each task to the paths of its matching experiment directories.
dirs_per_task = defaultdict(list)

for task in glue:
    task_dir = f"../../results/{task}"
    if not os.path.isdir(task_dir):
        # A task without a results directory should not abort the whole scan;
        # report it and leave the task with an empty list.
        print(f"warning: results directory not found: {task_dir}", file=sys.stderr)
        continue
    # os.listdir returns entries in arbitrary order; sort so that row order
    # (and therefore downstream index labels) is reproducible across runs.
    for entry in sorted(os.listdir(task_dir)):
        if entry.startswith("new_moebert_k2_experiment"):
            # add the full path to the list
            dirs_per_task[task].append(f"{task_dir}/{entry}")

In [3]:
def get_best_metric(list, metric_name):
    """Return the highest logged evaluation metric and the epoch it was logged at.

    Args:
        list: iterable of log-entry dicts. Entries that do not contain the key
            ``eval_<metric_name>`` are skipped. (The parameter name shadows the
            ``list`` builtin; it is kept so existing callers keep working.)
        metric_name: metric name without the ``eval_`` prefix.

    Returns:
        A ``(best_metric, best_epoch)`` tuple. When several entries share the
        best value, the first one wins. When no entry contains the metric,
        ``(0, 0)`` is returned.

    Raises:
        KeyError: if a selected entry has no ``'epoch'`` key.
    """
    metric_key = f'eval_{metric_name}'
    # Start from "nothing seen yet" instead of 0 so that negative scores
    # (e.g. a negative Matthews correlation) can still be selected.
    best_metric = None
    best_epoch = 0
    for item_dict in list:
        if metric_key not in item_dict:
            continue
        value = item_dict[metric_key]
        if best_metric is None or value > best_metric:
            best_metric = value
            best_epoch = item_dict['epoch']
    if best_metric is None:
        # No evaluation entry for this metric: keep the historical default.
        return 0, 0
    return best_metric, best_epoch

In [4]:
# Build one row per experiment directory: the task, the hyper-parameters taken
# from the directory name (dis, wdec, seed), and the best evaluation metric found
# in that run's trainer_state.json together with the epoch it was logged at.

# Evaluation metric used to score the runs of each task.
metric_for_task = {
    "cola": "matthews_correlation",
    "mnli": "accuracy",
    "mrpc": "accuracy",
    "qnli": "accuracy",
    "qqp": "f1",
    "rte": "accuracy",
    "sst2": "accuracy",
}

# Collect plain dicts and build the frame once at the end: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame every time.
result_rows = []

for task in glue:
    for path in dirs_per_task[task]:
        try:
            with open(f"{path}/model/trainer_state.json", "r") as f:
                results = json.load(f)
            best_result, best_epoch = get_best_metric(results['log_history'], metric_for_task[task])
        except FileNotFoundError:
            # No saved trainer state for this run: keep the row with a score of 0.
            best_result = 0
            best_epoch = 0

        # The hyper-parameters are read positionally from the "_"-separated
        # directory name (presumably "..._dis_<dis>_wdec_<wdec>_seed_<seed>" —
        # verify against the training scripts).
        name_parts = path.split("/")[-1].split("_")
        dis = name_parts[5]
        wdec = name_parts[7]
        # Directory names without a seed field default to seed "0".
        seed = name_parts[9] if len(name_parts) > 9 else "0"

        result_rows.append({"task": task, "dis": dis, "wdec": wdec, "value": best_result, "epoch": best_epoch, "seed": seed})

df_results = pd.DataFrame(result_rows, columns=["task", "dis", "wdec", "value", "epoch", "seed"])


In [5]:
# how many experiment rows were collected for each task
runs_per_task = df_results.groupby("task")["value"].count()
print(runs_per_task)

task
cola    30
mnli    28
mrpc    28
qnli    30
qqp     28
rte     27
sst2    28
Name: value, dtype: int64


In [6]:
# For each task, keep the experiment row with the highest metric value, together
# with its dis, wdec, epoch and seed. Rows keep their original df_results index.
best_rows = []
for task in glue:
    task_rows = df_results[df_results["task"] == task]
    if task_rows.empty:
        # No experiments were collected for this task, so there is nothing to rank.
        continue
    best_rows.append(task_rows.sort_values(by="value", ascending=False).iloc[0])

# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
best_results = pd.DataFrame(best_rows, columns=["task", "dis", "wdec", "value", "epoch", "seed"])

In [7]:
# Display the best_results table as this cell's output.
best_results

Unnamed: 0,task,dis,wdec,value,epoch,seed
28,cola,4,0.0,0.578642,7.67,1
37,mnli,1,0.0,0.147681,4.24,1
78,mrpc,1,0.01,0.867647,8.71,1
104,qnli,2,0.0,0.916346,3.05,1
132,qqp,2,0.0,0.881005,4.22,1
164,rte,5,0.0,0.689531,5.77,0
181,sst2,5,0.1,0.927752,3.33,0
