In [1]:
import os
import json
import regex
import pandas as pd

In [2]:
# Scheme identifiers. A run directory is recognised by the text before its
# first underscore being one of these values (see traverse_model_dir).
SCHEMES = [
    "GPT",
    "B1",
    "B2",
    "PP",
    "PL",
    "PD",
]

# LMM directory names. The last entry is the comma-joined pair, which
# main_table routes into its separate FHM-specific group.
LMMs = [
    "llava1.6-7bf",
    "qwen2-vl-7bf",
    "llava1.6-7bf,qwen2-vl-7bf",
]

# LLM directory names.
LLMs = [
    "qwen2.5-14bf",
    "mistral-12bf",
    "qwen3-14bf",
    "qwen2.5-7bf",
    "llama3.1-8bf",
]

# Model directories expected directly under a split directory
# (labelled "Mini models <= 13B" in main_table).
MINI = [
    "llava1.6-7bf",
    "qwen2-vl-7bf",
    "mistral-12bf",
    "qwen2.5-7bf",
    "llama3.1-8bf",
]

# GPT model directory names.
GPT = ["gpt4o-mini"]

# Task directory key -> dataset display name. Only the keys are used by the
# code visible in this notebook; their order fixes the table's column order.
TASKS = dict(
    fhm="FHM",
    harmc="HarMeme",
    harmp="Harm-P",
    multioff="MultiOFF",
    mami="MAMI",
    pridemm="PrideMM",
)

### Main

In [3]:
def format_number(num):
    """Turn a fraction into a percentage string rounded to two decimals.

    Trailing zeros are not padded: 0.5 becomes "50.0", not "50.00".
    """
    percentage = num * 100
    rounded = round(percentage, 2)
    return str(rounded)

def traverse_model_dir(model, model_dir, task, split='test', res_by_task=None, seed=42, batch=1, llm="", lmm=""):
    """Collect accuracy/F1 records from every run directory under ``model_dir``.

    A child of ``model_dir`` counts as a run when the text before its first
    underscore is one of ``SCHEMES``. Every sub-directory of a run that
    contains a ``result.json`` file contributes one record; the JSON must
    provide the keys ``acc`` and ``f1``.

    Args:
        model: Model name. Recorded as the LMM for scheme ``B1`` and as the
            LLM for schemes ``B2``/``PP``/``PL``/``PD``.
        model_dir: Directory whose children are run directories.
        task: Key of ``res_by_task`` that receives the records.
        split: Stored verbatim in each record.
        res_by_task: Dict mapping task -> list of records. A new dict is
            created when ``None``; a missing task key is added on demand.
        seed: Stored verbatim in each record.
        batch: Stored verbatim in each record under ``'BS'``.
        llm: LLM name used when the scheme does not assign ``model`` to it.
        lmm: LMM name used when the scheme does not assign ``model`` to it.

    Returns:
        ``res_by_task`` with the new records appended in place.
    """
    if res_by_task is None:
        res_by_task = {}
    task_records = res_by_task.setdefault(task, [])

    for run in os.listdir(model_dir):
        scheme = run.split("_")[0]
        if scheme not in SCHEMES:
            continue
        run_fd = os.path.join(model_dir, run)
        if not os.path.isdir(run_fd):
            # A stray file with a scheme-like prefix is not a run.
            continue

        # Resolve the model roles per run. Reassigning the `llm`/`lmm`
        # parameters here would leak one run's value into every later run
        # listed in the same directory.
        # NOTE(review): scheme 'GPT' keeps the caller-supplied llm/lmm as is,
        # while the table rendered below this cell shows the GPT model in the
        # LMM column -- confirm whether 'GPT' should also set lmm = model.
        run_llm, run_lmm = llm, lmm
        if scheme == 'B1':
            run_lmm = model
        elif scheme in ('B2', 'PP', 'PL', 'PD'):
            run_llm = model

        for round_name in os.listdir(run_fd):
            res_file = os.path.join(run_fd, round_name, 'result.json')
            if not os.path.isfile(res_file):
                continue
            # Context manager so the file handle is closed deterministically.
            with open(res_file, encoding='utf-8') as res_fh:
                res_dict = json.load(res_fh)
            task_records.append({
                'seed': seed,
                'split': split,
                'BS': batch,
                'scheme': scheme,
                'LLM': run_llm,
                'LMM': run_lmm,
                'acc': format_number(res_dict['acc']),
                'f1': format_number(res_dict['f1']),
            })
    return res_by_task

def traverse_llm_dir(model, llm_fd, task, split='test', res_by_task=None, seed=42):
    """Collect records for an LLM directory laid out as ``<lmm>/BS-<n>/<run>/...``.

    Only children of ``llm_fd`` named after an entry of ``LMMs`` or ``GPT``
    are visited. Inside each, every ``BS-<n>`` directory is handed to
    ``traverse_model_dir`` with ``batch=<n>`` and ``lmm=<child name>``.

    Args:
        model: LLM name, forwarded to ``traverse_model_dir``.
        llm_fd: Directory of the LLM.
        task: Key of ``res_by_task`` that receives the records.
        split: Forwarded verbatim.
        res_by_task: Dict mapping task -> list of records. When ``None`` a
            dict holding an empty list for ``task`` is created.
        seed: Forwarded verbatim.

    Returns:
        ``res_by_task`` with the new records appended.
    """
    if res_by_task is None:
        res_by_task = {task: []}

    # Loop-invariant, so build the accepted names once.
    accepted_lmms = LMMs + GPT
    for lmm in os.listdir(llm_fd):
        if lmm not in accepted_lmms:
            continue
        lmm_fd = os.path.join(llm_fd, lmm)
        for batch in os.listdir(lmm_fd):
            # Capture the digits directly: the pattern only anchors at the
            # start, so parsing the remainder of the name with int() would
            # raise on names such as "BS-16abc" that the pattern accepts.
            batch_match = regex.match(r"BS\-(\d+)", batch)
            if batch_match is None:
                continue
            batch_num = int(batch_match.group(1))
            batch_fd = os.path.join(lmm_fd, batch)
            res_by_task = traverse_model_dir(
                model, batch_fd, task, split,
                res_by_task, seed, batch=batch_num, lmm=lmm)
    return res_by_task

def _collect_results(root):
    """Walk ``root/<task>/<split>/...`` and gather the records of every task in ``TASKS``."""
    res_by_task = {task: [] for task in TASKS}
    for task in os.listdir(root):
        task_fd = os.path.join(root, task)
        # Skip stray files and directories that are not a known task; they
        # would otherwise crash os.listdir or the per-task lookup.
        if task not in res_by_task or not os.path.isdir(task_fd):
            continue
        for split in os.listdir(task_fd):
            split_fd = os.path.join(task_fd, split)
            if not os.path.isdir(split_fd):
                continue
            for fd in os.listdir(split_fd):
                fd_path = os.path.join(split_fd, fd)
                if fd in MINI:  # Mini models <= 13B
                    if fd in LMMs:
                        res_by_task = traverse_model_dir(fd, fd_path, task, split, res_by_task)
                    elif fd in LLMs:
                        res_by_task = traverse_llm_dir(fd, fd_path, task, split, res_by_task)
                if fd.startswith("seed-"):  # Small models > 13B
                    # NOTE(review): the seed stays a string here, whereas the
                    # traverse_* defaults use the int 42 -- confirm that the
                    # mixed types in the 'seed' column are intended.
                    this_seed = fd.split("-")[-1]
                    for model in os.listdir(fd_path):
                        model_fd = os.path.join(fd_path, model)
                        if model in GPT:
                            res_by_task = traverse_model_dir(
                                model, model_fd, task, split, res_by_task, seed=this_seed)
                        elif model in LLMs:
                            res_by_task = traverse_llm_dir(
                                model, model_fd, task, split, res_by_task, seed=this_seed)
    return res_by_task


def _find_matching_record(reference, candidates):
    """Return the first candidate whose non-metric fields all equal ``reference``'s, else ``None``."""
    for candidate in candidates:
        key_fields = [key for key in candidate if key not in ('acc', 'f1')]
        if key_fields and all(reference[key] == candidate[key] for key in key_fields):
            return candidate
    return None


def main_table(
    root = "./results"
):
    """Build the main results table, one row per FHM record.

    Each FHM record supplies the identifying columns (seed, split, BS,
    scheme, LLM, LMM) and the ``fhm_Acc``/``fhm_F1`` metrics. For every other
    task in ``TASKS`` the first record with identical identifying fields
    supplies ``<task>_Acc``/``<task>_F1``; when none matches, both cells are
    empty strings. The FHM split is reduced to the part before its first
    underscore (``test_seen`` -> ``test``) before matching.

    Rows are ordered by scheme in ``SCHEMES`` order. Rows whose LMM is the
    combined ``'llava1.6-7bf,qwen2-vl-7bf'`` entry are placed after all
    others.

    Args:
        root: Results directory laid out as ``<task>/<split>/...``.

    Returns:
        A ``pandas.DataFrame``; metric cells hold the strings produced by
        ``format_number``. The frame has no columns when no FHM record exists.
    """
    res_by_task = _collect_results(root)

    main_dict = {scheme: [] for scheme in SCHEMES}
    fhm_specific = {'B2': [], 'PP': [], 'PL': []}
    columns = []
    task0 = 'fhm'
    for rec in res_by_task[task0]:
        # Copy instead of popping so the collected records stay intact.
        oneline = {key: value for key, value in rec.items() if key not in ('acc', 'f1')}
        oneline[f'{task0}_Acc'] = rec['acc']
        oneline[f'{task0}_F1'] = rec['f1']
        if task0 == 'fhm':
            oneline['split'] = oneline['split'].split("_")[0]  # test_seen --> test

        # Attach the metrics of the matching record from every other task.
        for task in TASKS:
            if task == task0:
                continue
            match = _find_matching_record(oneline, res_by_task[task])
            oneline[f'{task}_Acc'] = match['acc'] if match is not None else ""
            oneline[f'{task}_F1'] = match['f1'] if match is not None else ""

        if not columns:
            columns = list(oneline.keys())

        # FHM specific: runs using both LMMs are grouped separately.
        if oneline['LMM'] == 'llava1.6-7bf,qwen2-vl-7bf':
            bucket = fhm_specific
        else:
            bucket = main_dict
        # setdefault: a scheme missing from the bucket (e.g. 'PD' with the
        # combined LMM) must not raise KeyError.
        bucket.setdefault(oneline['scheme'], []).append(list(oneline.values()))

    main_tab = []
    for line_ls in main_dict.values():
        main_tab.extend(line_ls)
    for line_ls in fhm_specific.values():
        main_tab.extend(line_ls)
    df = pd.DataFrame(main_tab, columns=columns)
    return df

# Build the table from the default "./results" directory; as the last
# expression of the cell, the returned DataFrame is rendered as its output.
main_table()

Unnamed: 0,seed,split,BS,scheme,LLM,LMM,fhm_Acc,fhm_F1,harmc_Acc,harmc_F1,harmp_Acc,harmp_F1,multioff_Acc,multioff_F1,mami_Acc,mami_F1,pridemm_Acc,pridemm_F1
0,42,test,1,GPT,,gpt4o-mini,67.6,65.51,70.9,69.46,65.35,65.35,65.77,64.86,77.4,76.59,72.39,72.28
1,42,test,1,B1,,qwen2-vl-7bf,64.2,62.68,67.51,60.29,56.34,53.05,71.14,64.61,68.1,66.03,68.44,68.43
2,42,test,1,B1,,llava1.6-7bf,60.4,57.85,66.38,61.05,56.34,53.62,59.73,57.38,67.8,67.46,60.16,59.99
3,42,test,16,B2,qwen2.5-14bf,qwen2-vl-7bf,70.1,70.02,61.02,57.92,60.0,59.91,53.69,53.15,,,68.44,68.42
4,42,test,16,B2,qwen2.5-14bf,llava1.6-7bf,68.0,67.7,59.6,51.46,63.94,63.84,64.43,62.93,,,66.47,66.47
5,42,test,16,PP,qwen2.5-14bf,qwen2-vl-7bf,72.5,72.41,81.92,81.0,65.35,65.35,63.09,62.41,,,70.41,70.04
6,42,test,16,PP,qwen2.5-14bf,llava1.6-7bf,71.5,71.48,83.62,82.0,63.94,63.67,69.13,68.37,,,71.6,71.37
7,42,test,16,PL,qwen2.5-14bf,qwen2-vl-7bf,70.5,70.49,67.23,65.9,,,,,,,,
8,42,test,16,PL,qwen2.5-14bf,llava1.6-7bf,67.2,66.98,63.56,57.14,,,,,,,,
9,42,test,16,B2,qwen2.5-14bf,"llava1.6-7bf,qwen2-vl-7bf",69.1,68.83,,,,,,,,,,
