In [1]:
import json
import os
import pprint
import copy
import shutil
from PIL import Image
from IPython.display import Image as IPythonImage
from IPython.display import display
import sys
sys.path.append("../../")

In [2]:
def _read_jsonl(path):
    """Yield one parsed JSON object per non-blank line of ``path``."""
    with open(path, 'r') as fh:
        for line in fh:
            if line.strip():
                yield json.loads(line)


def _load_json(path):
    """Read and parse a JSON file, closing the handle afterwards."""
    with open(path) as fh:
        return json.load(fh)


def _dump_json(obj, path):
    """Write ``obj`` to ``path`` as JSON indented by 4 spaces."""
    with open(path, 'w') as fh:
        json.dump(obj, fh, indent=4)


def preprocess_fhm(
    splits,
    root="./",
    version="v1",
    save_data_jsons=False,
    include_gt_captions=False,
):
    """Reorganise the FHM images by split/label and optionally export JSON files.

    For every split in ``splits`` the raw ``.jsonl`` files under
    ``<root>/<version>/`` are read (``train.jsonl`` for ``'train'``,
    ``<split>_seen.jsonl`` and ``<split>_unseen.jsonl`` otherwise). Each
    record gets an ``is_seen`` flag, its image is copied from
    ``<root>/<version>/img`` to ``<root>/<version>/Images/<split>/<label>/``
    and its ``img`` field is rewritten to
    ``./data/FHM/<version>/Images/<split>/<label>/<file>``.

    If ``../../data/GoatBench/hatefulness/`` exists (relative to the current
    working directory), the entries of its ``test.jsonl`` are enriched with
    the processed record that has the same id.

    Args:
        splits: Iterable of split names, e.g. ``['train', 'dev', 'test']``.
        root: Directory holding the ``<version>`` folder; ``<root>/data`` is
            always created and receives the JSON exports.
        version: Name of the dataset sub-folder under ``root``.
        save_data_jsons: When true, write ``<split>.json`` for every split
            except ``'test'``, which is written as ``test_seen.json``,
            ``test_unseen.json`` and ``test_seen_ffc.json`` (the latter uses
            the ids listed under ``'GT_hateful'`` in ``FFC_seen.json`` in the
            working directory). The merged GoatBench list is saved as well.
        include_gt_captions: When true, add a ``gt_description`` field taken
            from ``./caption_annotations/<split>_<seen|unseen>.json``; the
            field is ``""`` when no sheet or no entry is available.

    Returns:
        None. Label statistics are printed.

    Raises:
        AssertionError: If an image that still has to be copied is missing
            from ``<root>/<version>/img``.
    """
    # ---------------------------- GoatBench ---------------------------- #
    gb_folder = "../../data/GoatBench/hatefulness/"
    gb_save_to = ""
    gb_count = 0
    GB_items = {}
    gb_matched = set()  # GoatBench ids that were already merged once
    gb_statistics = {'hateful': 0, 'non-hateful': 0}
    if os.path.exists(gb_folder):
        gb_path = os.path.join(gb_folder, "test.jsonl")
        for one_data in _read_jsonl(gb_path):
            if one_data["id"] not in GB_items:
                GB_items[one_data["id"]] = {'label': int(one_data["label"]), 'text': one_data["text"]}
        gb_save_to = "../../data/GoatBench/data/hatefulness/"
        os.makedirs(gb_save_to, exist_ok=True)
        print("Goatbench loaded ...")

    save_to = os.path.join(root, 'data')
    os.makedirs(save_to, exist_ok=True)
    ori_image_path = os.path.join(root, version, 'img')
    new_image_path = os.path.join(root, version, 'Images')

    all_data = {sp: {} for sp in splits}
    data_statistics = {sp: {'seen': {'hateful': 0, 'non-hateful': 0}, 'unseen': {'hateful': 0, 'non-hateful': 0}} for sp in splits}
    for sp in splits:
        if sp == 'train':
            raw_files = {'seen': os.path.join(root, version, f"{sp}.jsonl")}
        else:
            raw_files = {ssp: os.path.join(root, version, f"{sp}_{ssp}.jsonl") for ssp in ['seen', 'unseen']}

        # Reset for every split (including 'train') so captions never leak
        # from one split into the next and the name is always defined.
        annotations = {}
        if include_gt_captions:
            for ssp in ['seen', 'unseen']:
                anno_file = os.path.join("./caption_annotations", f"{sp}_{ssp}.json")
                if os.path.exists(anno_file):
                    annotations[ssp] = _load_json(anno_file)  # a dict {"12": {'img':, 'gt_caption':}}

        for key, file in raw_files.items():
            # A missing sheet simply means "no captions for this subset".
            subset_annotations = annotations.get(key) or {}
            for one_data in _read_jsonl(file):
                one_data["is_seen"] = 1 if key == 'seen' else 0
                if include_gt_captions:
                    # JSON object keys are always strings, so compare on the
                    # string form of the id.
                    anno = subset_annotations.get(str(one_data["id"]))
                    one_data['gt_description'] = anno["gt_caption"] if anno is not None else ""

                # -------------------- Copy and move image -------------------- #
                label_folder = str(one_data["label"])
                img = one_data.pop("img").split("/")[-1]
                new_split_image_path = os.path.join(new_image_path, sp, label_folder)
                os.makedirs(new_split_image_path, exist_ok=True)
                new_img_path = os.path.join(new_split_image_path, img)
                if not os.path.exists(new_img_path):
                    ori_img_path = os.path.join(ori_image_path, img)
                    assert os.path.exists(ori_img_path), f"missing source image: {ori_img_path}"
                    shutil.copy(ori_img_path, new_img_path)
                one_data['img'] = os.path.join('./data/FHM/', version, 'Images', sp, label_folder, img)

                # ------------------------ GoatBench merge ------------------------ #
                # Only the first record matching a GoatBench id is merged and
                # counted; a repeated id would otherwise be counted twice.
                item_id = one_data["id"]
                if item_id in GB_items and item_id not in gb_matched:
                    gb_matched.add(item_id)
                    gb_count += 1
                    one_gb = copy.deepcopy(one_data)
                    one_gb.pop("label")
                    one_gb.pop("text")
                    one_gb["task"] = "fhm"
                    ori_item = GB_items[item_id]
                    GB_items[item_id] = {**ori_item, **one_gb}
                    lb = 'hateful' if ori_item['label'] == 1 else 'non-hateful'
                    gb_statistics[lb] += 1

                label = 'hateful' if one_data["label"] == 1 else 'non-hateful'
                # Keep only the first occurrence of an id within a split.
                if item_id not in all_data[sp]:
                    all_data[sp][item_id] = one_data
                    data_statistics[sp][key][label] += 1

        if save_data_jsons:
            if sp != 'test':
                _dump_json(list(all_data[sp].values()), os.path.join(save_to, f'{sp}.json'))
            else:
                all_ffc_ids = _load_json("FFC_seen.json")
                ffc_ids = all_ffc_ids['GT_hateful']

                new_data = {'seen': [], 'unseen': [], 'seen_ffc': []}
                for item in all_data[sp].values():
                    if item["is_seen"]:
                        new_data['seen'].append(item)
                        if item['id'] in ffc_ids:
                            new_data['seen_ffc'].append(item)
                    else:
                        new_data['unseen'].append(item)
                for k, v in new_data.items():
                    _dump_json(v, os.path.join(save_to, f'{sp}_{k}.json'))

    if save_data_jsons and GB_items:
        print(f"#GOATBENCH Instances = {gb_count} = {len(GB_items)}")
        _dump_json(list(GB_items.values()), os.path.join(gb_save_to, 'test.json'))
    if GB_items:
        print(pprint.pformat(gb_statistics))

    print(pprint.pformat(data_statistics))
    return
# Dry run over all three splits: images are reorganised and label statistics
# are printed, but no JSON export is written and no captions are attached.
splits = ["train", "dev", "test"]
preprocess_fhm(
    splits,
    save_data_jsons=False,
    include_gt_captions=False,
)

Goatbench loaded ...
{'hateful': 750, 'non-hateful': 1250}
{'dev': {'seen': {'hateful': 247, 'non-hateful': 253},
         'unseen': {'hateful': 0, 'non-hateful': 140}},
 'test': {'seen': {'hateful': 490, 'non-hateful': 510},
          'unseen': {'hateful': 750, 'non-hateful': 1250}},
 'train': {'seen': {'hateful': 3019, 'non-hateful': 5481},
           'unseen': {'hateful': 0, 'non-hateful': 0}}}


In [2]:
def fhm_gen_annotation_sheet(source_paths):
    """Create or extend the per-split caption annotation sheets.

    Every path in ``source_paths`` must contain ``"fhm/<split>/"``; the path
    component right after ``"fhm/"`` is used as the split name. Each source
    is a JSON list of items with at least ``id`` and ``img``. For every item
    not yet present in ``./caption_annotations/<split>.json`` an entry
    ``{"img": ..., "gt_caption": ""}`` is added under the item's id. Existing
    entries (and any captions already written into them) are left untouched.

    Args:
        source_paths: Iterable of paths to JSON files listing items to annotate.

    Returns:
        None. The sheets are written to disk and their sizes are printed.
    """
    sep = "fhm/"
    save_to = "./caption_annotations"
    os.makedirs(save_to, exist_ok=True)

    # Parse the split name once per source path.
    split_of = {src: src.split(sep)[1].split("/")[0] for src in source_paths}
    # Sorted so that the processing/printing order is deterministic.
    splits = sorted(set(split_of.values()))
    splits_save_to = {sp: os.path.join(save_to, f'{sp}.json') for sp in splits}

    # Start from whatever has been annotated so far.
    data_to_annotate = {}
    for sp, path in splits_save_to.items():
        if os.path.exists(path):
            with open(path) as fh:
                data_to_annotate[sp] = json.load(fh)
        else:
            data_to_annotate[sp] = {}

    for src in source_paths:
        sheet = data_to_annotate[split_of[src]]
        with open(src) as fh:
            items = json.load(fh)
        for item in items:
            # A sheet reloaded from JSON has string keys only. Looking up the
            # raw (possibly integer) id would miss the existing entry, add a
            # second one, and the dumped file would then contain the same
            # object name twice -- wiping the caption on the next load.
            item_id = str(item['id'])
            if item_id not in sheet:
                sheet[item_id] = {
                    'img': item['img'],
                    'gt_caption': ""
                }

    for sp, data in data_to_annotate.items():
        with open(splits_save_to[sp], 'w') as fh:
            json.dump(data, fh, indent=4)
        print(f"#data to be annotated in {sp}: {len(data)}")
    return

# Result files whose misclassified items should be captioned by hand.
# NOTE(review): machine-specific absolute path -- adjust RESULTS_ROOT when
# running elsewhere. The paths must keep the "fhm/<split>/" component because
# fhm_gen_annotation_sheet derives the split name from it.
RESULTS_ROOT = "/data/fengjun/projects/LLM/meme/HMC/results/fhm/test_seen/seed-42/qwen2.5-14bf"
# Each run is listed once (the D6h run used to appear twice in this list).
RUN_DIRS = [
    "D6m_llava1.6-7bf_qwen2-vl-7bf_len-1024_GPU-2_20250307033413/round-5_Decision-v3-v0",
    "D6h_llava1.6-7bf_qwen2-vl-7bf_len-1024_GPU-2_20250304221905/round-1_Decision-v3-v0",
    "D6j_llava1.6-7bf_qwen2-vl-7bf_len-1024_GPU-2_20250305203152/round-1_Decision-v3-v0",
    "D6f_llava1.6-7bf_qwen2-vl-7bf_len-1024_GPU-2_20250304182545/round-1_Decision-v3-v0",
]
sources = [f"{RESULTS_ROOT}/{run_dir}/incorrect.json" for run_dir in RUN_DIRS]
fhm_gen_annotation_sheet(sources)

#data to be annotated in test_seen: 388
