In [None]:
import os
import sys
import json
import pandas as pd

from collections import defaultdict

In [None]:
# Tasks to analyse; the full task list is kept in the trailing comment for reference.
glue = ['squad', 'squad_v2'] #["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", 'squad', 'squad_v2']

# task -> list of experiment directories, split by gate type.
dirs_per_task_topk = defaultdict(list)
dirs_per_task_trimmed = defaultdict(list)

for task in glue:
    task_root = f"../../results/{task}"
    for run_name in os.listdir(task_root):
        # Only directories belonging to this experiment family are of interest.
        if not run_name.startswith("new_moebert_k2_experiment"):
            continue
        # Runs whose name contains 'trl_' go to the trimmed bucket, all others to top-k.
        bucket = dirs_per_task_trimmed if 'trl_' in run_name else dirs_per_task_topk
        # Store the full path to the run directory.
        bucket[task].append(f"{task_root}/{run_name}")

In [None]:
def get_best_metric(list, metric_name):
    """Return the highest ``eval_<metric_name>`` value in a log history and its epoch.

    Args:
        list: Iterable of log-entry dicts (callers pass the ``log_history``
            entry of a ``trainer_state.json``). The name shadows the builtin
            ``list``; it is kept so existing keyword callers keep working.
        metric_name: Metric name without the ``eval_`` prefix.

    Returns:
        A ``(best_metric, best_epoch)`` tuple. Entries that do not contain the
        ``eval_<metric_name>`` key are ignored. On ties the earliest entry
        wins. ``(0, 0)`` is returned when no entry contains the key.

    Raises:
        KeyError: If an entry holding the metric has no ``'epoch'`` key.
    """
    metric_key = f'eval_{metric_name}'
    # None (rather than 0) as the "nothing seen yet" marker, so that metrics
    # which can be zero or negative (e.g. Matthews correlation) are still picked up.
    best_metric = None
    best_epoch = 0
    for item_dict in list:
        if metric_key not in item_dict:
            continue
        value = item_dict[metric_key]
        if best_metric is None or value > best_metric:
            best_metric = value
            best_epoch = item_dict['epoch']
    if best_metric is None:
        return 0, 0
    return best_metric, best_epoch

In [None]:
# Build one row per top-k experiment directory: task, dis, wdec, seed (parsed from the
# directory name) plus the best metric value and the epoch at which it was reached.

# Metric used to score each task.
metric_for_task = {
    "cola": "matthews_correlation",
    "mnli": "accuracy",
    "mrpc": "accuracy",
    "qnli": "accuracy",
    "qqp": "f1",
    "rte": "accuracy",
    "sst2": "accuracy",
    "squad" : "f1",
    "squad_v2": "f1",
}

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic when used in a loop.
records = []

for task in glue:
    for path in dirs_per_task_topk[task]:
        try:
            # Preferred source: the trainer state, scanned for the best evaluation.
            with open(f"{path}/model/trainer_state.json", "r") as f:
                results = json.load(f)
            best_result, best_epoch = get_best_metric(results['log_history'], metric_for_task[task])
        except FileNotFoundError:
            try:
                # Fallback: the summary file, which carries a single value per metric.
                with open(f"{path}/all_results.json", "r") as f:
                    # presumably 10 is the final training epoch — TODO confirm
                    best_epoch = 10
                    best_result = json.load(f)[metric_for_task[task]]
            except FileNotFoundError:
                # Neither file exists: record the run with a zero score.
                best_result = 0
                best_epoch = 0
        # Hyper-parameters are read from fixed positions of the "_"-separated directory
        # name; presumably "<prefix>_dis_<dis>_wdec_<wdec>[_seed_<seed>]" — TODO confirm.
        name_parts = path.split("/")[-1].split("_")
        dis = name_parts[5]
        wdec = name_parts[7]
        # Directory names without a seed field default to seed "0".
        seed = name_parts[9] if len(name_parts) > 9 else "0"
        records.append({"task": task, "dis": dis, "wdec": wdec, "value": best_result, "epoch": best_epoch, "seed": seed})

df_results = pd.DataFrame(records, columns=["task", "dis", "wdec", "value", "epoch", "seed"])


In [None]:
# How many top-k experiments were collected for each task.
experiments_per_task = df_results.groupby("task")["value"].count()
print(experiments_per_task)

In [None]:
# For every task keep the single run with the highest metric value, together with
# its dis, wdec, epoch and seed.
best_rows = []
for task in glue:
    task_runs = df_results[df_results["task"] == task]
    if task_runs.empty:
        # No run was collected for this task; skip it instead of failing on .iloc[0].
        continue
    best_rows.append(task_runs.sort_values(by="value", ascending=False).iloc[0])

# DataFrame.append was removed in pandas 2.0, so the frame is assembled in one step.
# Each row keeps the index label it had in df_results, as the append-based code did.
best_results = pd.DataFrame(best_rows, columns=["task", "dis", "wdec", "value", "epoch", "seed"])

In [None]:
# Display the best top-k run per task.
best_results

## Trimmed Lasso Gate results

In [None]:
# Build one row per trimmed-lasso experiment directory: task, dis, wdec, trl, seed
# (parsed from the directory name) plus the best metric value and its epoch.
# Rows are collected in a list and converted once: DataFrame.append was removed in
# pandas 2.0 and was quadratic when used in a loop.
trimmed_records = []

for task in glue:
    for path in dirs_per_task_trimmed[task]:
        try:
            # Preferred source: the trainer state, scanned for the best evaluation.
            with open(f"{path}/model/trainer_state.json", "r") as f:
                results = json.load(f)
            best_result, best_epoch = get_best_metric(results['log_history'], metric_for_task[task])
            print(best_result, best_epoch)
        except FileNotFoundError:
            try:
                # Fallback: the summary file, which carries a single value per metric.
                with open(f"{path}/all_results.json", "r") as f:
                    # presumably 10 is the final training epoch — TODO confirm
                    best_epoch = 10
                    best_result = json.load(f)[metric_for_task[task]]
            except FileNotFoundError:
                # Neither file exists: record the run with a zero score.
                best_result = 0
                best_epoch = 0
        # Hyper-parameters are read from fixed positions of the "_"-separated directory
        # name; presumably "<prefix>_dis_<dis>_wdec_<wdec>_trl_<trl>[_seed_<seed>]" — TODO confirm.
        name_parts = path.split("/")[-1].split("_")
        dis = name_parts[5]
        wdec = name_parts[7]
        trl = name_parts[9]
        # Directory names without a seed field default to seed "0".
        seed = name_parts[11] if len(name_parts) > 11 else "0"
        trimmed_records.append({"task": task, "dis": dis, "wdec": wdec, "trl": trl, "value": best_result, "epoch": best_epoch, "seed": seed})

df_results_trimmed = pd.DataFrame(trimmed_records, columns=["task", "dis", "wdec", "trl", "value", "epoch", "seed"])


In [None]:
# How many trimmed-lasso experiments were collected for each task.
trimmed_experiments_per_task = df_results_trimmed.groupby("task")["value"].count()
print(trimmed_experiments_per_task)

In [None]:
# For every task keep the single trimmed-lasso run with the highest metric value,
# together with its dis, wdec, trl, epoch and seed.
best_trimmed_rows = []
for task in glue:
    task_runs = df_results_trimmed[df_results_trimmed["task"] == task]
    if task_runs.empty:
        # Tasks without any trimmed-lasso run are left out of the summary.
        continue
    best_trimmed_rows.append(task_runs.sort_values(by="value", ascending=False).iloc[0])

# DataFrame.append was removed in pandas 2.0, so the frame is assembled in one step;
# the index is renumbered from 0, matching the former ignore_index=True behaviour.
best_results_trimmed = pd.DataFrame(
    best_trimmed_rows,
    columns=["task", "dis", "wdec", "trl", "value", "epoch", "seed"],
).reset_index(drop=True)

In [None]:
# Print the best trimmed-lasso run per task as a Markdown table.
trimmed_markdown = best_results_trimmed.to_markdown()
print(trimmed_markdown)

In [None]:
# Renumber the rows of the top-k summary from 0, discarding the old index labels.
best_results = best_results.reset_index(drop=True)

In [None]:
# Display the best top-k run per task after the index reset.
best_results

In [None]:
# Best top-k runs with the 'mnli' row removed, renumbered from 0.
is_not_mnli = best_results['task'].ne('mnli')
best_results.loc[is_not_mnli].reset_index(drop=True)

In [None]:
# Winning models: trimmed-lasso runs that match or beat the best baseline of the SAME task.
# The baseline is looked up by task name instead of by row position, so the comparison
# stays correct when the two summaries do not list the same tasks in the same rows.
# 'mnli' is excluded from the baselines; tasks without a baseline never count as a win
# (the lookup yields NaN and the comparison is False).
baseline_value_by_task = best_results[best_results['task'] != 'mnli'].set_index('task')['value']
best_results_trimmed[best_results_trimmed['value'] >= best_results_trimmed['task'].map(baseline_value_by_task)]

In [None]:
# Rebuild df_results_trimmed (overwriting the earlier frame of the same name), this
# time reading the metric only from "<run>/model/all_results.json".
# Rows are collected in a list and converted once: DataFrame.append was removed in
# pandas 2.0 and was quadratic when used in a loop.
trimmed_final_records = []

for task in glue:
    for path in dirs_per_task_trimmed[task]:
        try:
            with open(f"{path}/model/all_results.json", "r") as f:
                # presumably 10 is the final training epoch — TODO confirm
                best_epoch = 10
                best_result = json.load(f)[metric_for_task[task]]
        except FileNotFoundError:
            # No summary file: record the run with a zero score.
            best_result = 0
            best_epoch = 0
        # Hyper-parameters are read from fixed positions of the "_"-separated directory
        # name; presumably "<prefix>_dis_<dis>_wdec_<wdec>_trl_<trl>[_seed_<seed>]" — TODO confirm.
        name_parts = path.split("/")[-1].split("_")
        dis = name_parts[5]
        wdec = name_parts[7]
        trl = name_parts[9]
        # Directory names without a seed field default to seed "0".
        seed = name_parts[11] if len(name_parts) > 11 else "0"
        trimmed_final_records.append({"task": task, "dis": dis, "wdec": wdec, "trl": trl, "value": best_result, "epoch": best_epoch, "seed": seed})

df_results_trimmed = pd.DataFrame(trimmed_final_records, columns=["task", "dis", "wdec", "trl", "value", "epoch", "seed"])


In [None]:
# Display every trimmed-lasso run with the value read from model/all_results.json.
df_results_trimmed