In [1]:
import os
import json
from tqdm import tqdm
import pandas as pd

In [2]:
# Directory holding the results obtained when reproducing the paper.
PAPER_RES = 'chemteb-results'
# Local clone of the mteb-results repository; result files are read from its `results/` subfolder.
MTEB_RES = 'mteb-results'

In [3]:
def read_main_score(path):
    """Return the main score stored in a result JSON file.

    Args:
        path: Path to a result file containing a ``scores`` object with a
            ``test`` list.

    Returns:
        The ``main_score`` value of the first entry of ``scores['test']``.

    Raises:
        OSError: If the file cannot be opened.
        KeyError, IndexError: If the expected structure is missing.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data['scores']['test'][0]['main_score']

First, you need to clone the following repo in the same directory as this notebook:
```bash
git clone https://github.com/HSILA/mteb-results.git
```

In [4]:
# Result file names, one per task, that are looked up for every model in the MTEB results tree.
TASKS = ['ChemHotpotQARetrieval.json',
         'ChemNQRetrieval.json',
         'PubChemAISentenceParaphrasePC.json',
         'PubChemSMILESBitextMining.json',
         'PubChemSMILESPC.json',
         'PubChemSynonymPC.json',
         'PubChemWikiPairClassification.json',
         'PubChemWikiParagraphsPC.json',
         'SDSEyeProtectionClassification.json',
         'SDSGlovesClassification.json',
         'WikipediaBiolumNeurochemClassification.json',
         'WikipediaBioMetChemClassification.json',
         'WikipediaChemEngSpecialtiesClassification.json',
         'WikipediaChemFieldsClassification.json',
         'WikipediaChemistryTopicsClassification.json',
         'WikipediaChemistryTopicsClustering.json',
         'WikipediaCompChemSpectroscopyClassification.json',
         'WikipediaCryobiologySeparationClassification.json',
         'WikipediaCrystallographyAnalyticalClassification.json',
         'WikipediaGreenhouseEnantiopureClassification.json',
         'WikipediaIsotopesFissionClassification.json',
         'WikipediaLuminescenceClassification.json',
         'WikipediaOrganicInorganicClassification.json',
         'WikipediaSaltsSemiconductorsClassification.json',
         'WikipediaSolidStateColloidalClassification.json',
         'WikipediaSpecialtiesInChemistryClustering.json',
         'WikipediaTheoreticalAppliedClassification.json']

In [5]:
def find_json_files(base_path, json_file_names):
    """Recursively collect the paths of listed JSON files under a directory.

    Args:
        base_path: Directory to walk.
        json_file_names: Iterable of file names (not paths) to look for. Only
            names ending in ``.json`` can match.

    Returns:
        A list of ``os.path.join(root, name)`` paths, in the order produced
        by ``os.walk``. Empty if nothing matches.
    """
    # Build the lookup once: set membership is O(1) per walked file, and names
    # that do not end in '.json' can never match, so drop them up front.
    wanted = {name for name in json_file_names if name.endswith('.json')}

    return [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(base_path)
        for file in files
        if file in wanted
    ]

In [6]:
json_file_paths = find_json_files(MTEB_RES, TASKS)
# The count divides the number of result files by the number of tasks, so it
# assumes every model has one file per task; any remainder is dropped.
n_models = len(json_file_paths) // len(TASKS)
print(f'number of evaluated models: {n_models}')

number of evaluated models: 34


Extracting models and revisions from the results:

In [7]:
# Maps each model directory name to the first revision directory seen for it.
model_rev_map = {}

for el in json_file_paths:
    # Split the path relative to MTEB_RES so the component positions do not
    # depend on how many directories MTEB_RES itself is made of.
    # Expected layout: <MTEB_RES>/results/<model>/<revision>/<task>.json
    parts = os.path.relpath(el, MTEB_RES).split(os.path.sep)
    model, rev = parts[1], parts[2]
    # Keep only the first revision encountered for a model.
    model_rev_map.setdefault(model, rev)

In [8]:
# Maps a model directory name in the MTEB results to the directory name used
# under PAPER_RES. Models missing from this mapping keep their name unchanged.
model_mapping = {'sentence-transformers__all-MiniLM-L12-v2': 'all-MiniLM-L12-v2',
                 'sentence-transformers__all-MiniLM-L6-v2': 'all-MiniLM-L6-v2',
                 'sentence-transformers__all-mpnet-base-v2': 'all-mpnet-base-v2',
                 'bedrock__amazon-titan-embed-text-v1': 'amazon-titan-embed-text-v1',
                 'bedrock__amazon-titan-embed-text-v2': 'amazon-titan-embed-text-v2',
                 'bedrock__cohere-embed-english-v3': 'cohere-embed-english-v3',
                 'bedrock__cohere-embed-multilingual-v3': 'cohere-embed-multilingual-v3',
                 'sentence-transformers__multi-qa-mpnet-base-dot-v1': 'multi-qa-mpnet-base-dot-v1',
                 'openai__text-embedding-3-large': 'text-embedding-3-large',
                 'openai__text-embedding-3-small': 'text-embedding-3-small',
                 'openai__text-embedding-ada-002': 'text-embedding-ada-002'}

In [9]:
# Maps an MTEB task result file to the corresponding result file(s) under
# PAPER_RES. A string value is a one-to-one rename; a list value means the
# MTEB task is compared against the average of the listed paper tasks.
# Tasks missing from this mapping use the same file name on both sides.
task_mapping = {
    # NOTE(review): 'ISo' here vs 'Iso' in the PC names below — presumably this
    # matches the file names on disk; verify against the paper results folder.
    "PubChemSMILESBitextMining.json": ['PubChemSMILESCanonDescBM.json',
                                       'PubChemSMILESCanonTitleBM.json',
                                       'PubChemSMILESISoDescBM.json',
                                       'PubChemSMILESISoTitleBM.json',
                                       ],
    "PubChemSMILESPC.json": ['PubChemSMILESCanonDescPC.json',
                             'PubChemSMILESCanonTitlePC.json',
                             'PubChemSMILESIsoDescPC.json',
                             'PubChemSMILESIsoTitlePC.json'
                             ],
    'WikipediaBiolumNeurochemClassification.json': 'WikipediaMedium2BioluminescenceVsNeurochemistryClassification.json',
    'WikipediaBioMetChemClassification.json': 'WikipediaEasy2GeneExpressionVsMetallurgyClassification.json',
    'WikipediaChemEngSpecialtiesClassification.json': 'WikipediaMedium5Classification.json',
    'WikipediaChemFieldsClassification.json': 'WikipediaEZ10Classification.json',
    'WikipediaChemistryTopicsClassification.json': 'WikipediaEasy10Classification.json',
    'WikipediaChemistryTopicsClustering.json': 'WikipediaEasy10Clustering.json',
    'WikipediaCompChemSpectroscopyClassification.json': 'WikipediaMedium2ComputationalVsSpectroscopistsClassification.json',
    'WikipediaCryobiologySeparationClassification.json': 'WikipediaEasy5Classification.json',
    'WikipediaCrystallographyAnalyticalClassification.json': 'WikipediaMedium2CrystallographyVsChromatographyTitrationpHClassification.json',
    'WikipediaGreenhouseEnantiopureClassification.json': 'WikipediaEasy2GreenhouseVsEnantiopureClassification.json',
    'WikipediaIsotopesFissionClassification.json':  'WikipediaHard2IsotopesVsFissionProductsNuclearFissionClassification.json',
    'WikipediaLuminescenceClassification.json':  'WikipediaHard2BioluminescenceVsLuminescenceClassification.json',
    'WikipediaOrganicInorganicClassification.json': 'WikipediaEasy2SpecialClassification.json',
    'WikipediaSaltsSemiconductorsClassification.json': 'WikipediaHard2SaltsVsSemiconductorMaterialsClassification.json',
    'WikipediaSolidStateColloidalClassification.json': 'WikipediaEasy2SolidStateVsColloidalClassification.json',
    'WikipediaSpecialtiesInChemistryClustering.json':  'WikipediaMedium5Clustering.json',
    'WikipediaTheoreticalAppliedClassification.json': 'WikipediaEZ2Classification.json',
}

In [11]:
# Score difference per model and task: (paper score) - (MTEB score).
diffs = {}

for model, rev in tqdm(model_rev_map.items()):
    diffs[model] = {}

    # The paper-side model directory and revision do not depend on the task,
    # so resolve them once per model instead of once per task.
    c_model = model_mapping.get(model, model)
    paper_model_dir = os.path.join(PAPER_RES, c_model)
    if os.path.exists(os.path.join(paper_model_dir, rev)):
        c_rev = rev
    else:
        # No matching revision on the paper side: fall back to the first
        # revision directory in sorted order. os.listdir() returns entries in
        # arbitrary order and may include stray files, so filter and sort to
        # keep the choice deterministic.
        paper_revs = sorted(
            entry for entry in os.listdir(paper_model_dir)
            if os.path.isdir(os.path.join(paper_model_dir, entry))
        )
        if not paper_revs:
            raise FileNotFoundError(
                f'no revision directory found in {paper_model_dir}')
        c_rev = paper_revs[0]

    for task in TASKS:
        if task == 'PubChemWikiPairClassification.json':  # newly added task, not present in paper
            continue
        task_name = os.path.splitext(task)[0]
        mteb_task_path = os.path.join(MTEB_RES, "results", model, rev, task)
        mteb_score = read_main_score(mteb_task_path)

        # A task maps to one paper file, to several paper files (whose scores
        # are averaged), or to a file of the same name when it has no entry
        # in task_mapping. Normalise all three cases to a list.
        paper_tasks = task_mapping.get(task, task)
        if not isinstance(paper_tasks, list):
            paper_tasks = [paper_tasks]

        sub_scores = [
            read_main_score(os.path.join(PAPER_RES, c_model, c_rev, paper_task))
            for paper_task in paper_tasks
        ]
        chem_score = sum(sub_scores) / len(sub_scores)
        diffs[model][task_name] = chem_score - mteb_score

100%|██████████| 34/34 [00:01<00:00, 24.36it/s]


In [12]:
# Outer keys of `diffs` (models) become columns; inner keys (tasks) become the index.
df = pd.DataFrame(diffs)

Each difference is computed as (paper score − MTEB score), so a negative value means that the task as implemented in MTEB scored higher than the same task when reproducing the paper’s results.

In [20]:
# Mean difference per task, averaged over all models, largest first.
df.mean(axis="columns").sort_values(ascending=False)

PubChemWikiParagraphsPC                             0.237436
PubChemSMILESPC                                     0.085895
WikipediaIsotopesFissionClassification              0.027486
WikipediaLuminescenceClassification                 0.021270
WikipediaSpecialtiesInChemistryClustering           0.009875
WikipediaGreenhouseEnantiopureClassification        0.006102
WikipediaOrganicInorganicClassification             0.005592
PubChemSynonymPC                                    0.003267
SDSEyeProtectionClassification                      0.001166
PubChemSMILESBitextMining                           0.000880
WikipediaCrystallographyAnalyticalClassification   -0.000253
SDSGlovesClassification                            -0.001646
WikipediaChemistryTopicsClassification             -0.002229
WikipediaCompChemSpectroscopyClassification        -0.002781
WikipediaTheoreticalAppliedClassification          -0.005767
WikipediaBioMetChemClassification                  -0.007841
WikipediaBiolumNeurochem

In [21]:
# Overall difference: the mean of the per-task means.
df.mean(axis="columns").mean()

0.004524595146259088