In [1]:
import pandas as pd
import json
import os

In [2]:
def read_json(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale encoding, so files containing non-ASCII characters are read the
    same way on every operating system.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The deserialized JSON document (whatever ``json.load`` produces).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def extract_results(path):
    """Collect per-task scores and evaluation times for every model under ``path``.

    The directory is walked as ``<path>/<model>/<revision>/<task file>.json``.
    Only one revision per model is read: the first one in sorted order.
    Model directory names have ``'__'`` replaced by ``'/'`` to form the
    model name used as the row label.

    For each task file the value stored is taken from the first entry of
    ``data['scores']['test']``: its ``'f1'`` field for tasks whose name ends
    in ``'Classification'`` (but not ``'PairClassification'``), and its
    ``'main_score'`` field for every other task.

    Args:
        path: Root directory containing one sub-directory per model.

    Returns:
        A ``(scores, times)`` tuple of DataFrames, both indexed by model
        name with one column per task name.

    Raises:
        KeyError: If a task file lacks one of the expected fields.
    """
    # Files in a revision directory that are not task results to aggregate.
    skipped_files = {'model_meta.json', 'CoconutRetrieval.json'}

    def visible_subdirs(parent):
        # os.listdir returns entries in arbitrary order and may include stray
        # files or hidden folders (.DS_Store, .ipynb_checkpoints, ...); keep
        # only real, non-hidden directories and sort them for reproducibility.
        return sorted(
            entry for entry in os.listdir(parent)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(parent, entry))
        )

    times = {}
    scores = {}

    for model in visible_subdirs(path):
        model_dir = os.path.join(path, model)

        revisions = visible_subdirs(model_dir)
        if not revisions:
            # Nothing was evaluated for this model; skip it instead of
            # failing with an IndexError.
            continue
        rev_dir = os.path.join(model_dir, revisions[0])

        model_name = model.replace('__', '/')
        scores[model_name] = {}
        times[model_name] = {}

        for task in sorted(os.listdir(rev_dir)):
            if task in skipped_files or not task.endswith('.json'):
                continue
            data = read_json(os.path.join(rev_dir, task))
            task_name = data['task_name']
            test_entry = data['scores']['test'][0]

            # A name ending in 'PairClassification' also ends in
            # 'Classification'; it must be handled like the other
            # pair-classification ('...PC') tasks, i.e. via main_score.
            is_plain_classification = (
                task_name.endswith('Classification')
                and not task_name.endswith('PairClassification')
            )
            if is_plain_classification:
                scores[model_name][task_name] = test_entry['f1']
            else:
                scores[model_name][task_name] = test_entry['main_score']

            times[model_name][task_name] = data['evaluation_time']

    return pd.DataFrame.from_dict(scores, orient='index'), pd.DataFrame.from_dict(times, orient='index')

In [4]:
def aggregate_scores(scores_df):
    """Average per-task scores into one column per task category.

    Each column of ``scores_df`` is assigned a category from its name:

    * ends with ``'PC'`` or ``'PairClassification'`` -> ``'PairClassification'``
    * ends with ``'Classification'``                 -> ``'Classification'``
    * ends with ``'Retrieval'``                      -> ``'Retrieval'``
    * ends with ``'Clustering'``                     -> ``'Clustering'``
    * ends with ``'BM'`` or contains ``'BitextMining'`` -> ``'BitextMining'``

    Columns matching none of these rules are ignored.

    Args:
        scores_df: DataFrame with one row per model and one column per task.

    Returns:
        A DataFrame with the same index as ``scores_df`` and the columns
        ``PairClassification``, ``Classification``, ``BitextMining``,
        ``Retrieval`` and ``Clustering`` (in that order), each holding the
        row-wise mean of the task columns in that category. A category with
        no matching task columns is all NaN.
    """
    def get_category(task_name):
        # The pair-classification test must come first: a name ending in
        # 'PairClassification' also ends in 'Classification' and would
        # otherwise be averaged into the wrong category.
        if task_name.endswith(('PC', 'PairClassification')):
            return 'PairClassification'
        if task_name.endswith('Classification'):
            return 'Classification'
        if task_name.endswith('Retrieval'):
            return 'Retrieval'
        if task_name.endswith('Clustering'):
            return 'Clustering'
        if task_name.endswith('BM') or 'BitextMining' in task_name:
            return 'BitextMining'
        return None  # uncategorized tasks take no part in any average

    categories = ['PairClassification',
                  'Classification',
                  'BitextMining',
                  'Retrieval',
                  'Clustering']

    aggregated_scores = pd.DataFrame(index=scores_df.index, columns=categories)

    for category in categories:
        category_columns = [col for col in scores_df.columns if get_category(col) == category]

        # Row-wise mean over the tasks that belong to this category.
        aggregated_scores[category] = scores_df[category_columns].mean(axis=1)

    return aggregated_scores

In [5]:
# Build the per-task score and evaluation-time tables (one row per model)
# from the result files stored under the 'chemteb-results' directory.
scores, times = extract_results('chemteb-results')

In [6]:
# Model names in the row order used when the aggregated table is displayed.
index_order = [
    'google-bert/bert-base-uncased',
    'allenai/scibert_scivocab_uncased',
    'm3rg-iitd/matscibert',
    'recobo/chemical-bert-uncased',
    'nomic-ai/nomic-bert-2048',
    'nomic-ai/nomic-embed-text-v1',
    'nomic-ai/nomic-embed-text-v1.5',
    'all-MiniLM-L6-v2',
    'all-MiniLM-L12-v2',
    'all-mpnet-base-v2',
    'multi-qa-mpnet-base-dot-v1',
    'intfloat/e5-small',
    'intfloat/e5-base',
    'intfloat/e5-large',
    'intfloat/e5-small-v2',
    'intfloat/e5-base-v2',
    'intfloat/e5-large-v2',
    'intfloat/multilingual-e5-small',
    'intfloat/multilingual-e5-base',
    'intfloat/multilingual-e5-large',
    'BAAI/bge-small-en',
    'BAAI/bge-base-en',
    'BAAI/bge-large-en',
    'BAAI/bge-small-en-v1.5',
    'BAAI/bge-base-en-v1.5',
    'BAAI/bge-large-en-v1.5',
    'BAAI/bge-m3',
    'text-embedding-3-small',
    'text-embedding-3-large',
    'text-embedding-ada-002',
    'amazon-titan-embed-text-v2',
    'amazon-titan-embed-text-v1',
    'cohere-embed-english-v3',
    'cohere-embed-multilingual-v3',
]

In [8]:
# Task categories in the column order used when the aggregated table is displayed.
column_order = [
    'Classification',
    'BitextMining',
    'Retrieval',
    'Clustering',
    'PairClassification',
]

In [9]:
# Average the task scores per category, then arrange rows and columns in the
# presentation order defined above. Models listed in index_order that have no
# results appear as all-NaN rows after the reindex.
scores_agg = aggregate_scores(scores).reindex(index_order)[column_order]

# Display the table rounded to two decimals (scores_agg itself is not rounded).
scores_agg.round(2)

Unnamed: 0,Classification,BitextMining,Retrieval,Clustering,PairClassification
google-bert/bert-base-uncased,0.72,0.0,0.28,0.2,0.41
allenai/scibert_scivocab_uncased,0.71,0.0,0.2,0.18,0.43
m3rg-iitd/matscibert,0.7,0.0,0.11,0.21,0.41
recobo/chemical-bert-uncased,0.68,0.0,0.17,0.13,0.42
nomic-ai/nomic-bert-2048,0.67,0.0,0.05,0.22,0.38
nomic-ai/nomic-embed-text-v1,0.77,0.0,0.72,0.46,0.55
nomic-ai/nomic-embed-text-v1.5,0.78,0.0,0.75,0.5,0.55
all-MiniLM-L6-v2,0.78,0.0,0.61,0.36,0.54
all-MiniLM-L12-v2,0.77,0.0,0.58,0.34,0.54
all-mpnet-base-v2,0.78,0.0,0.56,0.5,0.54
