In [1]:
import pandas as pd
import os
from sklearn.metrics import classification_report
import numpy as np

In [2]:
# Directory holding the per-model response annotation CSVs. The trailing slash
# is relied on by the f-string paths built from `dir_`.
dir_ = 'data/response_annotations/'
pth = f'{dir_}english_multimodal_unsafe.csv'

# Base table: the human label plus the LlavaGuard verdict from the same file.
df = pd.read_csv(pth)[['final_label', 'llavaguard']]
df = df.rename(columns={'final_label': 'human'})

# Gather one single-column frame per model and concatenate once at the end
# instead of growing `df` inside the loop.
frames = [df]
for root, _dirs, files in os.walk(dir_):
    for file in files:
        if not file.endswith(".csv"):
            continue
        # Skip the base file, the "detailed" files and the merged file this cell writes.
        if any(tag in file for tag in ("unsafe", "detailed", "autoeval")):
            continue

        # File names are split on "_" and the third piece (minus ".csv") is
        # used as the model name / output column name.
        model_name = file.split('_')[2].split('.csv')[0]

        # os.walk descends into sub-directories, so the path must be built from
        # `root`; joining with `dir_` would point at a non-existent (or wrong) file.
        file_path = os.path.join(root, file)

        if "gpt" in model_name:
            # GPT files are read as a header-less single column named "GPT4",
            # then padded/truncated to the rows of the base table.
            df_ = pd.read_csv(file_path, names=["GPT4"])
            df_ = df_.reindex(df.index).reset_index(drop=True)
        else:
            df_ = pd.read_csv(file_path)

        if "GPT4" in df_.columns:
            key = "GPT4"
        elif "vlm_sglang" in df_.columns:
            key = "vlm_sglang"
        else:
            # Fail loudly rather than reusing a stale `key` from a previous file.
            raise KeyError(
                f"{file_path}: expected a 'GPT4' or 'vlm_sglang' column, "
                f"found {list(df_.columns)}"
            )

        # Keep only the verdict column, renamed to the model it came from.
        frames.append(df_[[key]].rename(columns={key: model_name}))

df = pd.concat(frames, axis=1)
df.to_csv(f'{dir_}english_multimodal_autoeval.csv', index=False)

In [3]:
# Load the merged auto-evaluation table from disk.
autoeval_csv = f'{dir_}english_multimodal_autoeval.csv'
df = pd.read_csv(autoeval_csv)

In [4]:
df = pd.read_csv(f'{dir_}english_multimodal_autoeval.csv')


def _label_to_code(value):
    """Map a free-text verdict to 2 (unsafe), 1 (safe) or NaN (no verdict).

    "unsafe" is checked first because it contains "safe" as a substring.
    Non-string cells (e.g. missing values, which pandas reads as float NaN)
    yield NaN instead of raising on `.lower()`.
    """
    if not isinstance(value, str):
        return np.nan
    text = value.lower()
    if "unsafe" in text:
        return 2
    if "safe" in text:
        return 1
    return np.nan


def _flag_to_code(value):
    """Map a boolean flag to 2 (truthy), 1 (falsy) or NaN (missing/unparseable).

    Missing values are handled before the truthiness test: NaN is truthy in
    Python and would otherwise be counted as 2. String cells are parsed
    explicitly because any non-empty string, including "False", is truthy.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text == "true":
            return 2
        if text == "false":
            return 1
        return np.nan
    if pd.isna(value):
        return np.nan
    return 2 if value else 1


# Columns holding free-text safe/unsafe verdicts.
for text_col in (
    'human',
    'llavaguard',
    'Qwen2-Vl-72b-Instruct',
    'llava-onevision-qwen2-72b-ov-sft',
    'Llama-3.2-90B-Vision-Instruct',
):
    df[text_col] = df[text_col].apply(_label_to_code)

# This column is treated as a boolean flag rather than free text.
df['omni-moderation-latest'] = df['omni-moderation-latest'].apply(_flag_to_code)

def replace_val(x):
    """Convert a free-text safety verdict into a numeric code.

    Args:
        x: A cell value; normally a string, but may be missing (NaN/None)
            or any other non-string object.

    Returns:
        2 if the text contains "unsafe" (case-insensitive), 1 if it contains
        "safe", and NaN for missing values, non-string values, or text with
        neither word.
    """
    # Anything that is not a string (NaN, None, bools, numbers, ...) has no
    # `.lower()` and carries no verdict.
    if not isinstance(x, str):
        return np.nan
    text = x.lower()
    # "unsafe" must be tested first: it contains "safe" as a substring.
    if "unsafe" in text:
        return 2
    if "safe" in text:
        return 1
    return np.nan

# Normalise the GPT-4 Turbo verdicts with replace_val.
gpt_col = 'gpt-4-turbo-2024-04-09'
df[gpt_col] = df[gpt_col].map(replace_val)

In [5]:
# Compare every model column against the human labels with a classification report.
# The columns are selected by name so the loop does not depend on 'human'
# being the first column of the frame.
for col in [c for c in df.columns if c != 'human']:
    print(col)
    # Predictions that could not be mapped to a label are NaN; report their share.
    nans = df[col].isna()
    print(f"invalid ratio: {nans.sum() / len(df)}")
    # classification_report rejects NaN in either argument, so keep only rows
    # where both the human label and the prediction are present.
    valid = ~nans & df['human'].notna()
    if not valid.any():
        print("no rows with both a human label and a model prediction")
        print("\n\n")
        continue
    report = classification_report(df.loc[valid, 'human'], df.loc[valid, col], zero_division=0)
    print(report)
    print("\n\n")

llavaguard
invalid ratio: 0.29975
              precision    recall  f1-score   support

           1       0.95      0.51      0.66      2707
           2       0.02      0.28      0.04        94

    accuracy                           0.50      2801
   macro avg       0.49      0.39      0.35      2801
weighted avg       0.92      0.50      0.64      2801




omni-moderation-latest
invalid ratio: 0.0
              precision    recall  f1-score   support

           1       0.95      0.81      0.87      3819
           2       0.03      0.10      0.04       181

    accuracy                           0.78      4000
   macro avg       0.49      0.46      0.46      4000
weighted avg       0.91      0.78      0.84      4000




Qwen2-Vl-72b-Instruct
invalid ratio: 0.00025
              precision    recall  f1-score   support

           1       0.96      0.97      0.97      3818
           2       0.13      0.08      0.10       181

    accuracy                           0.93      3999
 