In [22]:
import os
import pandas as pd
from typing import *

In [3]:
from run_evaluation import *

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
def make_df(folder_path: str, multimodal_dataset: bool) -> Tuple[pd.DataFrame, List[str]]:
    """Build a per-model results table from every ``.csv`` file in ``folder_path``.

    Each file is evaluated with ``run_eval_by_file`` (cached results only) and is
    treated as one sub-dataset, named after the file with ``_with_answers.csv``
    removed.

    Args:
        folder_path: Directory containing the sub-dataset ``.csv`` files.
        multimodal_dataset: Forwarded unchanged to ``run_eval_by_file``.

    Returns:
        A ``(df, all_dataset_names)`` tuple. ``df`` has one row per model (the
        ``"assigned"`` entry is skipped) and the columns ``model``, ``total`` and
        ``{dataset}_{metric}`` for every dataset/metric pair; a model that was not
        reported for a dataset gets NaN in that dataset's columns.
        ``all_dataset_names`` lists the dataset names in sorted file order, which
        is also the order of the dataset columns.

    Raises:
        KeyError: Propagated if a model's result mapping lacks an expected metric.
    """
    # Sorted so that dataset/column order does not depend on the arbitrary order
    # in which os.listdir returns directory entries.
    all_files = sorted(x for x in os.listdir(folder_path) if x.endswith(".csv"))
    all_dataset_names = [x.replace("_with_answers.csv", "") for x in all_files]
    # A tuple (not a set) keeps the metric column order stable between runs.
    source_keys = (
        "total_correct_percent",
        "total_correct_meta_found_percent",
        "total_correct_percent_ex_missing",
        "total_correct_meta_found_percent_ex_missing",
    )
    metric_keys = (*source_keys, "total")

    # collect all results
    all_results = {}
    total_by_model = {}
    for dataset_name, file in zip(all_dataset_names, all_files):
        full_path = os.path.join(folder_path, file)
        stats = run_eval_by_file(full_path, None, multimodal_dataset, use_cached_only=True)

        for model_name, results in stats.items():
            # Copy the metrics instead of writing "total" back into the dict
            # returned by run_eval_by_file.
            metrics = {key: results[key] for key in source_keys}
            correct = metrics["total_correct_percent"]
            meta_found = metrics["total_correct_meta_found_percent"]
            # Per-dataset score: plain accuracy, averaged with the meta-found
            # accuracy whenever the latter is available.
            metrics["total"] = correct if meta_found is None else (correct + meta_found) / 2

            model_results = all_results.setdefault(model_name, {})
            total_by_model[model_name] = total_by_model.get(model_name, 0) + metrics["total"]
            for key in metric_keys:
                model_results[f"{dataset_name}_{key}"] = metrics[key]

    # The overall total divides by the number of files, so a model that is
    # missing from a file effectively scores 0 for that dataset.
    rows = [
        {"model": model_name, "total": total_by_model[model_name] / len(all_files), **values}
        for model_name, values in all_results.items()
        if model_name != "assigned"
    ]

    # Explicit columns give a deterministic layout and guarantee that every
    # expected column exists, even when no rows (or no values) were collected.
    columns = ["model", "total"] + [
        f"{dataset_name}_{key}" for dataset_name in all_dataset_names for key in metric_keys
    ]
    df = pd.DataFrame(rows, columns=columns)
    return df, all_dataset_names
    

# RAG-text Dataset

For details about this dataset, see the [README](../../README.md).

In [21]:
# Evaluate the RAG-text sub-datasets (text-only, so multimodal_dataset is False).
df, all_dataset_names = make_df(
    folder_path="../data/by_sub_dataset_with_answers/rag-text",
    multimodal_dataset=False,
)
# Keep the overall score plus each dataset's "total" column, best model first.
df.loc[:, ["model", "total", *(f"{name}_total" for name in all_dataset_names)]].sort_values(by="total", ascending=False)

Unnamed: 0,model,total,rag_text_50_rev23_meta_total,rag_text_50_rev23_thousands_no_hint_total,rag_text_50_rev23_thousands_hint_total,rag_text_50_rev23_millions_no_hint_total,rag_text_50_rev_meta_total,rag_text_50_rev22_millions_no_hint_total
0,claude-3-opus-20240229,0.892545,0.98,0.86,0.84,0.96,0.89527,0.82
1,gpt-4-1106-preview,0.789279,0.92,0.64,0.84,0.94,0.675676,0.72
2,meta/meta-llama-3-70b-instruct,0.698243,0.91,0.56,0.52,0.94,0.459459,0.8
4,mistralai/Mixtral-8x22B-Instruct-v0.1,0.602432,0.85,0.72,0.62,0.7,0.344595,0.38
5,databricks/dbrx-instruct,0.574392,0.65,0.5,0.6,0.72,0.476351,0.5
3,meta/meta-llama-3-8b-instruct,0.24,0.42,0.22,0.32,0.36,,0.12


# Selection-text Dataset

For details about this dataset, see the [README](../../README.md).

In [25]:
# Evaluate the selection-text sub-datasets (text-only, so multimodal_dataset is False).
df, all_dataset_names = make_df(
    folder_path="../data/by_sub_dataset_with_answers/selection-text",
    multimodal_dataset=False,
)
# Keep the overall score plus each dataset's "total" column, best model first.
df.loc[:, ["model", "total", *(f"{name}_total" for name in all_dataset_names)]].sort_values(by="total", ascending=False)

Unnamed: 0,model,total,selection_text_50_rev22_millions_no_hint_total,selection_text_50_rev23_meta_total,selection_text_50_rev23_millions_no_hint_total,selection_text_50_rev23_thousands_hint_total,selection_text_50_rev23_thousands_no_hint_total,selection_text_50_rev_meta_total
0,claude-3-opus-20240229,0.910473,0.9,1.0,0.94,0.9,0.76,0.962838
1,gpt-4-1106-preview,0.890901,0.8,0.96,0.96,0.94,0.78,0.905405
4,mistralai/Mixtral-8x22B-Instruct-v0.1,0.75795,0.78,0.95,0.8,0.62,0.82,0.577703
2,meta/meta-llama-3-70b-instruct,0.729347,0.94,0.9,1.0,0.54,0.54,0.456081
3,databricks/dbrx-instruct,0.596757,0.5,0.74,0.74,0.56,0.5,0.540541


# Selection-images Dataset

For details about this dataset, check the [Readme.](../../README.md)

In [29]:
# Evaluate the selection-image sub-datasets (image inputs, so multimodal_dataset is True).
df, all_dataset_names = make_df(
    folder_path="../data/by_sub_dataset_with_answers/selection-image",
    multimodal_dataset=True,
)
# Keep the overall score plus each dataset's "total" column, best model first.
df.loc[:, ["model", "total", *(f"{name}_total" for name in all_dataset_names)]].sort_values(by="total", ascending=False)

Unnamed: 0,model,total,selection_image_50_rev23_thousands_no_hint_total,selection_image_50_rev_meta_total,selection_image_50_rev23_millions_no_hint_total,selection_image_50_rev23_thousands_hint_total,selection_image_50_rev22_millions_no_hint_total,selection_image_50_rev23_meta_total
1,gpt-4-1106-vision-preview,0.533784,0.48,0.452703,0.72,0.5,0.2,0.85
0,claude-3-opus-20240229,0.511171,0.4,0.777027,0.5,0.42,0.2,0.77
