# Aggregate results from GLUE evaluation

The model and adaptation results on GLUE have already been obtained; we simply need to collect them from the different log files.

Move to the root folder.

In [None]:
# Move the working directory two levels up so that the relative
# "GLUE/data/..." paths used later in this notebook resolve.
%cd ../..

In [None]:
import json
import os
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

### Helper functions

In [None]:
# GLUE task name -> metric used as that task's score. The score is read from
# the "eval_<metric>" entry of the task's eval results JSON file.
TASK_METRICS = {
    "cola": "matthews_correlation",
    "mnli": "accuracy",
    "mnli_mm": "accuracy",
    "mrpc": "combined_score",
    "qnli": "accuracy",
    "qqp": "combined_score",
    "rte": "accuracy",
    "sst2": "accuracy",
    "stsb": "combined_score",
    "wnli": "accuracy",
}

# Base model names. A results directory is named <model><adaptation suffix>.
UNIMODAL_MODELS = ["bert-base-uncased"]
MULTIMODAL_MODELS = ["clipbert", "lxmert", "visualbert"]

# Adaptation suffixes looked up for every multimodal model.
MULTIMODAL_ADAPTATIONS = [
    "-avg-visual-features",
    "-finetuned-lxmert-visual-features",
    "-finetuned-wikipedia-visual-features",
    "-no-visual-features",
    "-no-visual-features-finetuned-lxmert",
    "-no-visual-features-finetuned-wikipedia",
    "-zero-image-visual-features",
    "-zeroed-visual-features",
]

# Additional suffixes looked up only for the given multimodal model.
MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS = {
    "clipbert": ["-imagined-visual-features"],
    "lxmert": [],
    "visualbert": [],
}

# Suffixes looked up for the unimodal models; the empty suffix selects the
# directory named after the bare model.
UNIMODAL_ADAPTATIONS = [
    "",
    "-trained-lxmert",
    "-trained-lxmert-scratch",
    "-trained-wikipedia",
]

In [None]:
def get_eval_score(filename, task):
    """Return the score stored for `task` in an eval results JSON file.

    The file is parsed as JSON and the value found under the key
    "eval_" + TASK_METRICS[task] is returned.
    """
    with open(filename, 'r') as results_file:
        loaded = json.load(results_file)
    # The metric lookup happens after the file is read, so a missing file is
    # reported before an unknown task.
    return loaded["eval_" + TASK_METRICS[task]]

In [None]:
def get_eval_filenames(dirname):
    """Map each GLUE task to its eval results file inside `dirname`.

    Every entry of `dirname` whose name contains "GLUE-benchmark-" is treated
    as a run folder; the task name is the third "-"-separated part of the
    folder name and the results file is "<task>_eval_results.json" inside it.
    Run folders without such a file are skipped. For "mnli" the matching
    "mnli_mm_eval_results.json" from the same folder is registered as task
    "mnli_mm", but only when that file exists.

    A warning is printed when the set of tasks found differs from the keys of
    TASK_METRICS.

    Returns:
        dict mapping task name to the path of its eval results file.

    Raises:
        ValueError: if two run folders provide results for the same task.
    """
    eval_filenames = {}
    # Sorted so the task order does not depend on the order in which the file
    # system happens to list the run folders.
    for entry in sorted(os.listdir(dirname)):
        if "GLUE-benchmark-" not in entry:
            continue
        # folders with run results look as follows: 'GLUE-benchmark-rte-bert-base-uncased-2022-05-02T09-38'
        task_name = entry.split("-")[2]
        eval_filename = os.path.join(dirname, entry, f"{task_name}_eval_results.json")
        if not os.path.exists(eval_filename):
            continue
        if task_name in eval_filenames:
            raise ValueError(f"Duplicate entries for task {task_name} found in {dirname}")
        eval_filenames[task_name] = eval_filename
        # mnli-mm is evaluated together with mnli
        if task_name == "mnli":
            mnli_mm_filename = os.path.join(dirname, entry, "mnli_mm_eval_results.json")
            # Only register the file when it is really there, as for every
            # other task; a missing file then shows up in the warning below
            # instead of failing later when the score is read.
            if os.path.exists(mnli_mm_filename):
                eval_filenames["mnli_mm"] = mnli_mm_filename
    if eval_filenames.keys() != TASK_METRICS.keys():
        print(f"Warning: All eval task files should be present in the given folder '{dirname}'. Found only:\n{eval_filenames.keys()}\nShould have:\n{TASK_METRICS.keys()}")
    return eval_filenames

In [None]:
def extract_filename(model, adaptation, dirname):
    """Resolve the results directory for a model/adaptation pair.

    The model name is `model` immediately followed by `adaptation`, and the
    directory is that name joined onto `dirname`.

    Returns:
        (model_name, model_dirname) when the directory exists; otherwise a
        warning is printed and (None, None) is returned.
    """
    combined_name = model + adaptation
    candidate_dir = os.path.join(dirname, combined_name)

    if os.path.exists(candidate_dir):
        return combined_name, candidate_dir

    print(f"Warning: Missing results, the directory '{candidate_dir}' should exist")
    return None, None
    
def get_model_dirnames(dirname):
    """Return a dict of model name -> results directory under `dirname`.

    Candidates are every unimodal model with every unimodal adaptation,
    followed by every multimodal model with the shared multimodal adaptations
    and then its model-specific ones. Candidates for which extract_filename
    reports a missing directory are left out.
    """
    def _candidate_pairs():
        # Yields (model, adaptation) lazily, in lookup order.
        for base in UNIMODAL_MODELS:
            for suffix in UNIMODAL_ADAPTATIONS:
                yield base, suffix
        for base in MULTIMODAL_MODELS:
            for suffix in MULTIMODAL_ADAPTATIONS:
                yield base, suffix
            for suffix in MODEL_SPECIFIC_MULTIMODAL_ADAPTATIONS[base]:
                yield base, suffix

    found = {}
    for base, suffix in _candidate_pairs():
        name, path = extract_filename(base, suffix, dirname)
        if name is None:
            continue
        found[name] = path

    return found

In [None]:
def get_mnli_eval_results(dirname, logname_starter):
    """Collect the two MNLI eval accuracies from the log files in `dirname`.

    Every file whose name starts with `logname_starter` and ends with
    "_1.out" is scanned. For each line containing " eval_accuracy " the last
    whitespace-separated token is parsed as a float.

    Args:
        dirname: directory holding the log files.
        logname_starter: file name prefix of the logs to read, of any length.

    Returns:
        List with the two accuracies, in the order encountered (files are
        visited in sorted name order).

    Raises:
        ValueError: if `logname_starter` is None.
        AssertionError: if the number of accuracies found is not exactly two.
    """
    if logname_starter is None:
        raise ValueError("logname_starter cannot be None")
    eval_acc = []
    # Sorted so the result order is reproducible when several logs match.
    for file in sorted(os.listdir(dirname)):
        # startswith() rather than a fixed-width slice, so prefixes that are
        # not exactly six characters long are matched as well.
        if not (file.startswith(logname_starter) and file.endswith("_1.out")):
            continue
        with open(os.path.join(dirname, file), "r") as f:
            for line in f:
                if " eval_accuracy " in line:
                    eval_acc.append(float(line.split()[-1]))
    # Raised explicitly (instead of using `assert`) so the check still runs
    # when Python is started with -O.
    if len(eval_acc) != 2:
        raise AssertionError(f"There should be two mnli eval_accuracy values in {dirname}")
    return eval_acc

### Collect results

In [None]:
# Gather one row per (model, task) pair. The rows are accumulated in a plain
# list and the DataFrame is built once at the end: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
rows = []

model_dirnames = get_model_dirnames("GLUE/data/logs")
for model, dirname in model_dirnames.items():
    eval_filenames = get_eval_filenames(dirname)
    for task, eval_filename in eval_filenames.items():
        score = get_eval_score(eval_filename, task)
        rows.append({"model": model, "task": task, "score": score})

# Passing the columns explicitly keeps the expected layout even when no
# results were found.
results = pd.DataFrame(rows, columns=["model", "task", "score"])

In [None]:
# Display the collected scores (one row per model and task).
results

In [None]:
# Per model, count the non-null entries in each column, i.e. how many task
# scores were collected for it.
results.groupby("model").count()

In [None]:
# Mean score per model over the collected tasks. Only the "score" column is
# averaged: with pandas >= 2.0, mean() no longer drops non-numeric columns
# and would raise a TypeError on the string-valued "task" column.
results.groupby("model")[["score"]].mean()

## Save the results

In [None]:
# Write the collected scores to a CSV file, leaving out the DataFrame index.
results.to_csv("GLUE/data/results.csv", index=False)