In [3]:
import os
import json
import regex
import pandas as pd
pd.set_option('display.max_columns', None)

In [4]:
SCHEMES = ['GPT', 'B1','B2', 'PP', 'PL', 'PD']
LMMs = ['llava1.6-7bf', 'qwen2-vl-7bf', 'llava1.6-7bf,qwen2-vl-7bf']
LLMs = ['qwen2.5-14bf', 'mistral-12bf', 'qwen3-14bf', 'qwen2.5-7bf', 'llama3.1-8bf']
MINI = ['llava1.6-7bf', 'qwen2-vl-7bf', 'mistral-12bf', 'qwen2.5-7bf', 'llama3.1-8bf']
GPT = ['gpt4o-mini']
TASKS = {
    'fhm': 'FHM', 
    'harmc': "HarMeme", 
    'harmp': "Harm-P",
    'multioff': "MultiOFF",
    'mami': 'MAMI',
    'pridemm': "PrideMM",
    'gb_hateful': "GB-Hateful",
    'gb_harmful': "GB-Harmful",
    'gb_offensive': "GB-Offensive",
    'gb_misogynistic': "GB-Misogynistic"
}
model_name_map = {
    'llava1.6-7bf': 'LLaVa1.6-7B',
    'qwen2-vl-7bf': 'Qwen2VL-7B',
    'llava1.6-7bf,qwen2-vl-7bf': 'LLaVa1.6&Qwen2VL-7B',
    'qwen2.5-14bf': 'Qwen2.5-14Bf',
    'mistral-12bf': 'Mistral-12Bf',
    'qwen3-14bf': 'Qwen3-14Bf',
    'qwen2.5-7bf': 'Qwen2.5-7Bf',
    'llama3.1-8bf': 'Llama3.1-8Bf',
    'gpt4o-mini': 'GPT-4o-mini',
    "": ""
}

### Main

In [8]:
def format_number(num):
    return str(round(num * 100, 2))

def traverse_model_dir(model, model_dir, task, split='test', res_by_task=None, seed=42, batch=1, llm="", lmm=""):
    for run in os.listdir(model_dir):
        tmp = run.split("_")[0]
        if tmp in SCHEMES:
            scheme = tmp
            if scheme == 'B1':
                lmm = model
            elif scheme in ['B2', 'PP', 'PL', 'PD']:
                llm = model
                #lmm = ",".join(run.split("_len-")[0].split("_")[1:])
            run_fd = os.path.join(model_dir, run)
            for round in os.listdir(run_fd):
                if os.path.isdir(os.path.join(run_fd, round)):
                    round_fd = os.path.join(run_fd, round)
                    if 'result.json' in os.listdir(round_fd):
                        res_file = os.path.join(round_fd, 'result.json')
                        res_dict = json.load(open(res_file))
                        try:
                            one_res = {
                                'seed': seed,
                                'split': split,
                                'BS': batch,
                                'scheme': scheme,
                                'LLM': model_name_map[llm],
                                'LMM': model_name_map[lmm],
                                'acc': format_number(res_dict['acc']),
                                'f1': format_number(res_dict['f1']),
                            }
                        except:
                            print(llm)
                            print(lmm)
                        res_by_task[task].append(one_res)
    return res_by_task

def traverse_llm_dir(model, llm_fd, task, split='test', res_by_task=None, seed=42):
    for lmm in os.listdir(llm_fd):
        if lmm in LMMs + GPT:
            lmm_fd = os.path.join(llm_fd, lmm)
            for batch in os.listdir(lmm_fd):
                if regex.match(r"BS\-\d+", batch):
                    batch_num = int(batch.split("-")[1])
                    batch_fd = os.path.join(lmm_fd, batch)
                    res_by_task = traverse_model_dir(
                        model, batch_fd, task, split,
                        res_by_task, seed, batch=batch_num, lmm=lmm)
    return res_by_task

def main_table(
    root = "./results"
):  
    ## Start
    res_by_task = {task: [] for task in TASKS}
    for task in os.listdir(root):
        task_fd = os.path.join(root, task)
        for split in os.listdir(task_fd):
            split_fd = os.path.join(task_fd, split)
            for fd in os.listdir(split_fd):
                if fd in MINI: ### Mini models <= 13B
                    model = fd
                    mini_model_fd = os.path.join(split_fd, fd)
                    if model in LMMs:
                        res_by_task = traverse_model_dir(model, mini_model_fd, task, split, res_by_task)
                    elif model in LLMs:
                        res_by_task = traverse_llm_dir(model, mini_model_fd, task, split, res_by_task)
                if fd.startswith("seed-"): ### Small models > 13B
                    this_seed = fd.split("-")[-1]
                    seed_fd = os.path.join(split_fd, fd)
                    for model in os.listdir(seed_fd):
                        model_fd = os.path.join(seed_fd, model)
                        if model in GPT:
                            res_by_task = traverse_model_dir(model, model_fd, task, split, res_by_task, seed=this_seed)
                        elif model in LLMs:### Small models > 13B
                            res_by_task = traverse_llm_dir(model, model_fd, task, split, res_by_task, seed=this_seed)

    main_dict = {scheme: [] for scheme in SCHEMES}
    fhm_specific = {'B2': [], 'PP': [], 'PL': []}
    columns = []
    task0 = 'fhm'
    for rid, rec in enumerate(res_by_task[task0]):
        this_acc, this_f1 = rec.pop('acc'), rec.pop('f1')
        task0_display = TASKS[task0]
        metrics = {f'{task0_display}_Acc': this_acc, f'{task0_display}_F1': this_f1}
        oneline = dict(**rec, **metrics)
        task0_split = rec['split']
        if task0 == 'fhm':
            task0_split = rec['split'].split("_")[0] # test_seen --> test
            oneline['split'] = task0_split
        # Traverse other tasks
        # for task, rec_ls in res_by_task.items():
        #     if task != task0:
        for task in TASKS:
            if task != task0:
                task_display = TASKS[task]
                rec_ls = res_by_task[task]
                rec_found = False
                for one_rec in rec_ls: # one_rect = {'seed': 42, 'split':'test', ...,'f1': f1,}
                    if not rec_found:
                        cond = []
                        for k, v in one_rec.items():
                            if k not in ['acc', 'f1']:
                                # if k == 'split':
                                #     cond.append(task0_split == v.split("_")[0])
                                # else:
                                cond.append(oneline[k] == v)
                        if cond and all(cond):
                            rec_found = True
                            this_task_metrics = {f'{task_display}_Acc': one_rec['acc'], f'{task_display}_F1': one_rec['f1']}
                            # oneline = dict(**oneline, **this_task_metrics)
                if not rec_found:
                    this_task_metrics = {f'{task_display}_Acc': "", f'{task_display}_F1': ""}
                oneline = dict(**oneline, **this_task_metrics)
        if not columns:
            columns = list(oneline.keys())
        
        # FHM specific
        if oneline['LMM'] == 'llava1.6-7bf,qwen2-vl-7bf':
            fhm_specific[oneline['scheme']].append(list(oneline.values()))
        else:
            main_dict[oneline['scheme']].append(list(oneline.values()))
    
    main_tab = []
    for scheme, line_ls in main_dict.items():
        main_tab.extend(line_ls)
    for scheme, line_ls in fhm_specific.items():
        main_tab.extend(line_ls)
    df = pd.DataFrame(main_tab, columns=columns)
    save_to = "./res_tables"
    if not os.path.exists(save_to):
        os.makedirs(save_to)
    df.to_excel(os.path.join(save_to, "main208.xlsx"), sheet_name='208', index=False) 
    return df

main_table()

Unnamed: 0,seed,split,BS,scheme,LLM,LMM,FHM_Acc,FHM_F1,HarMeme_Acc,HarMeme_F1,Harm-P_Acc,Harm-P_F1,MultiOFF_Acc,MultiOFF_F1,MAMI_Acc,MAMI_F1,PrideMM_Acc,PrideMM_F1,GB-Hateful_Acc,GB-Hateful_F1,GB-Harmful_Acc,GB-Harmful_F1,GB-Offensive_Acc,GB-Offensive_F1,GB-Misogynistic_Acc,GB-Misogynistic_F1
0,42,test,1,GPT,,GPT-4o-mini,67.6,65.51,70.9,69.46,65.35,65.35,65.77,64.86,77.4,76.59,72.39,72.28,,,,,,,,
1,42,test,1,B1,,Qwen2VL-7B,64.2,62.68,67.51,60.29,56.34,53.05,71.14,64.61,68.1,66.03,68.44,68.43,,,,,,,,
2,42,test,1,B1,,LLaVa1.6-7B,60.4,57.85,66.38,61.05,56.34,53.62,59.73,57.38,67.8,67.46,60.16,59.99,,,,,,,,
3,42,test,16,B2,Qwen2.5-14Bf,Qwen2VL-7B,70.1,70.02,61.02,57.92,60.0,59.91,53.69,53.15,75.1,75.1,68.44,68.42,,,,,,,,
4,42,test,16,B2,Qwen2.5-14Bf,LLaVa1.6&Qwen2VL-7B,69.1,68.83,,,,,,,,,,,,,,,,,,
5,42,test,16,B2,Qwen2.5-14Bf,LLaVa1.6-7B,68.0,67.7,59.6,51.46,63.94,63.84,64.43,62.93,76.4,76.38,66.47,66.47,,,,,,,,
6,42,test,16,PP,Qwen2.5-14Bf,Qwen2VL-7B,72.5,72.41,81.92,81.0,65.35,65.35,63.09,62.41,78.6,78.59,70.41,70.04,71.25,70.75,,,,,,
7,42,test,16,PP,Qwen2.5-14Bf,LLaVa1.6&Qwen2VL-7B,72.0,71.98,,,,,,,,,,,70.8,69.95,,,,,,
8,42,test,16,PP,Qwen2.5-14Bf,LLaVa1.6-7B,71.5,71.48,83.62,82.0,63.94,63.67,69.13,68.37,79.9,79.89,71.6,71.37,71.95,71.05,,,,,,
9,42,test,16,PL,Qwen2.5-14Bf,Qwen2VL-7B,70.5,70.49,67.23,65.9,,,,,75.8,75.78,,,,,,,,,,


### Confusion Matrices (B1 vs. B2 vs. PP)

In [None]:
# Qwen2VL-7B: B1, B2, PP
#FHM
#Harm-C
#Harm-P
#MultiOFF
#MAMI
#PrideMM
Qwen2VL = [
    [[[422,88],[270,220]], [[377,133],[166,324]], [[334,176],[99,391]]],
    [[[195,35],[80,44]], [[156,74],[64,60]], [[184,46],[18,106]]],
    [[[147,37],[118,53]], [[98,  86],[56, 115]], [[117,67],[56,115]]],
    [[[85,6],[37, 21]], [[48, 43],[26, 32]], [[57, 34],[21, 37]]],
    [[[464,36],[283, 217]], [[378, 122],[127, 373]], [[406,  94],[120,380]]],
    [[[178,82],[78,169]], [[180,80],[80,167]], [[207, 53],[97, 150]]] 
]

# Llava1.6-7B: B1, B2, PP
Llava = [
    [[[425, 85],[311, 179]], [[388, 122],[198, 292]], [[343, 167],[118, 372]]],
    [[[183,  47],[72,  52]], [[178,  52],[91,  33]], [[201,  29],[29,  95]]],
    [[[143, 41],[114, 57]], [[123,  61],[67, 104]], [[129,  55],[73,  98]]],
    [[[62, 29],[31, 27]], [[63, 28],[25, 33]], [[63, 28],[18, 40]]],
    [[[390, 110],[212, 288]], [[367, 133],[103, 397]], [[387, 113],[88, 412]]],
    [[[136, 124],[78, 169]], [[166,  94],[76, 171]], [[204,  56],[88, 159]]]
]

### Iterative Improvements

In [None]:
#FHM
FHM = {
    'l1': [68.38, 70.18, 70.64, 70.99, 71.68, 72.08],
    'l2': [65.51] * 6
}
# HarMeme
HarMeme = {
    'l1': [77.24, 75.91, 79.94, 78.36, 81.31, 80.35],
    'l2': [69.46] * 6,
}
# HarmP
HarmP = {
    'l1': [57.67, 61.65, 63.08, 62.2, 61.96, 63.09],
    'l2': [65.35] * 6,
}
# MultiOFF
MultiOFF = {
    'l1': [59.65, 62.71, 61.89, 63.64, 61.93, 66.85],
    'l2': [64.86] * 6,
} 
# MAMI
MAMI = {
    'l1': [77.18, 78.6, 78.4, 79.6, 79.3, 79.59],
    'l2': [76.59] * 6,
}
# PrideMM
PrideMM = {
    'l1': [68.24, 70.02, 68.83, 70.45, 70.92, 71.11],
    'l2': [72.28] * 6,
}
