In [1]:
import json
import os
import pandas as pd    
import pprint
import shutil
import statistics
import copy
from PIL import Image
import sys
sys.path.append("../../")
from utils.tool import resize_image, preprocess_resize_img, view_img_size_statistics
from models.gpt import postprocess_output_text

In [2]:
# MIN_IMG_SIZE = {
#     'Harm-C': 0.5,
#     'Harm-P': 0.8
# }
def clean_text(text):
    """Remove imgflip watermark fragments from ``text`` and normalise whitespace.

    The watermark variants are stripped in the listed order (longer variants
    before the bare ``imgflip``), then every run of whitespace is collapsed to
    a single space and leading/trailing whitespace is dropped.
    """
    watermark_variants = (
        "imgfip.com",
        "imgflip.com",
        "imgflip com",
        "imgflip co",
        "imgflip",
    )
    for variant in watermark_variants:
        text = text.replace(variant, "")
    tokens = text.split()
    return " ".join(tokens)

# Sentinel "no upper bound" value: a MAX_LEN entry equal to this adds no
# max-length suffix to the processed-image folder name.
default_max_l = 102400

# Minimum image side length per dataset (presumably pixels -- the resize
# helper that consumes it lives outside this notebook).
MIN_LEN = dict.fromkeys(('Harm-C', 'Harm-P'), 768)

# Maximum lengths to generate processed images for, per dataset. Each dataset
# gets its own list object.
MAX_LEN = {name: [default_max_l, 1280] for name in ('Harm-C', 'Harm-P')}

# Dataset name -> short task identifier written into the GoatBench records.
task_name_map = dict(zip(('Harm-C', 'Harm-P'), ("harmc", "harmp")))
def preprocess_HarMeme(
    splits,
    root="./",
    save_data_jsons=False,
    include_gt_captions=False,
    use_processed_img=False
):
    """Convert the raw HarMeme annotations (Harm-C / Harm-P) into per-split JSON files.

    For every dataset and every requested split this function:
      * reads ``<root>/Annotations/<dataset>/<split>.jsonl`` (``dev`` is stored as
        ``val``; Harm-P files carry a ``_v1`` suffix),
      * maps the first entry of ``labels`` to a binary ``label`` and keeps the
        remaining entries as ``target``,
      * copies each image into a ``<split>/<label_folder>/`` sub-directory,
      * records the original image sizes and, when ``use_processed_img`` is set,
        delegates resizing to ``preprocess_resize_img``,
      * merges the processed record into the matching GoatBench test entry, if any.

    Args:
        splits: iterable of split names, e.g. ``['test', 'dev', 'train']``.
        root: directory containing ``Annotations`` and ``HarMeme_Images``.
        save_data_jsons: when true, write ``<root>/data/<dataset>/<split>.json``
            and (if GoatBench data was found) the merged GoatBench ``test.json``.
        include_gt_captions: when true, attach a ``gpt_description`` field built
            from ``./caption_annotations/<dataset>/<split>.json`` and, as a
            fallback, ``test_gpt4o_m2t.json``.
        use_processed_img: when true, create the processed-image folders and
            call ``preprocess_resize_img`` for each image.

    Returns:
        None. Results are written to disk and statistics are printed.

    Raises:
        AssertionError: if ``labels`` is not a list or a source image is missing.
        KeyError: if a record lacks an expected field or carries an unknown label.
    """
    # ---- Load GoatBench test labels (only if the folder is present) ----
    gb_count = 0
    gb_folder = "../../data/GoatBench/harmfulness/"
    gb_save_to = ""
    GB_items = {}
    if os.path.exists(gb_folder):
        gb_path = os.path.join(gb_folder, "test.jsonl")
        # Context manager so the handle is closed even if a line fails to parse.
        with open(gb_path, 'r') as f:
            for line in f:
                one_data = json.loads(line)
                if one_data["id"] not in GB_items:
                    GB_items[one_data["id"]] = {'label': int(one_data["label"])}
        gb_save_to = "../../data/GoatBench/data/harmfulness/"
        os.makedirs(gb_save_to, exist_ok=True)
        print("Goatbench data loaded ...")

    # An empty label string is treated as "not harmful" (and counted as 'no label').
    label_map = {
        "not harmful": 0,
        "very harmful": 1,
        "somewhat harmful": 1,
        "": 0,
    }
    data_image_map = {'Harm-C': 'harmeme_images_covid_19', 'Harm-P': 'harmeme_images_us_pol'}
    data_path = os.path.join(root, 'Annotations')
    image_folder_name = 'HarMeme_Images'
    image_path = os.path.join(root, image_folder_name)

    for data_name, img_dir in data_image_map.items():
        if use_processed_img:
            # One output folder per max length; the sentinel default_max_l
            # ("no upper bound") adds no suffix to the folder name.
            processed_image_paths = {}
            for max_l in MAX_LEN[data_name]:
                p_folder = f"{image_folder_name}_{MIN_LEN[data_name]}"
                if max_l < default_max_l:
                    p_folder += f"_{max_l}"
                one_pp = os.path.join(root, p_folder)
                processed_image_paths[max_l] = (p_folder, one_pp)
                os.makedirs(one_pp, exist_ok=True)

        data_statistics = {sp: {'not harmful': 0, 'harmful': 0, 'no label': 0} for sp in splits}
        ori_sizes = {sp: {'w': {}, 'h': {}} for sp in splits}
        after_sizes = {}
        if use_processed_img:
            after_sizes = {max_l: {sp: {'w': {}, 'h': {}} for sp in splits} for max_l in MAX_LEN[data_name]}

        for split in splits:
            save_to = os.path.join(root, 'data', data_name)
            os.makedirs(save_to, exist_ok=True)

            # ---- Optional captions: manual annotations first, GPT-4o output as fallback ----
            if include_gt_captions:
                annotations = {}
                anno_file = os.path.join(f"./caption_annotations/{data_name}", f"{split}.json")
                if os.path.exists(anno_file):
                    with open(anno_file) as f:
                        annotations = json.load(f)  # a dict {"12": {'img':, 'gt_caption':}}
                # NOTE(review): this always reads the *test* GPT-4o file, whatever
                # the current split is -- confirm that this is intended.
                gpt4_file = os.path.join(data_path, data_name, "test_gpt4o_m2t.json")
                if os.path.exists(gpt4_file):
                    with open(gpt4_file) as f:
                        gpt4_items = json.load(f)
                    for gpt4_item in gpt4_items:
                        if gpt4_item['id'] not in annotations:
                            annotations[gpt4_item['id']] = {'gt_caption': gpt4_item['prediction']}

            # Annotation files use 'val' for the dev split; Harm-P files end in '_v1'.
            sp = split
            if split == 'dev':
                sp = 'val'
            if data_name == "Harm-P":
                sp = sp + "_v1"

            new_data = []
            file_path = os.path.join(data_path, data_name, f"{sp}.jsonl")
            with open(file_path) as f:
                ori_data = [json.loads(line) for line in f]

            for item in ori_data:
                # Keep the full id for the GoatBench lookup; the stored id is
                # only the part after the last underscore.
                ori_id = item['id']
                new_id = ori_id.split("_")[-1]
                item['id'] = new_id
                item.pop('image')
                caption = item['text']
                ori_label = item.pop('labels')
                assert isinstance(ori_label, list)

                item['label'] = label_map[ori_label[0]]
                if ori_label[0] == "":
                    data_statistics[split]['no label'] += 1
                    # Unlabelled samples are filed under the 'not harmful' folder.
                    ori_label[0] = 'not harmful'
                if item['label'] == 1:
                    data_statistics[split]['harmful'] += 1
                else:
                    data_statistics[split]['not harmful'] += 1
                # Everything after the first label entry is kept as the target list.
                item['target'] = ori_label[1:] if len(ori_label) > 1 else []

                if include_gt_captions:
                    item['gpt_description'] = ""
                    if annotations and (item["id"] in annotations):
                        item['gpt_description'] = postprocess_output_text(annotations[item["id"]]["gt_caption"])

                # -------------------- Copy image into its label folder --------------------
                label_folder = "_".join(ori_label[0].split())
                new_split_image_path = os.path.join(image_path, img_dir, split, label_folder)
                os.makedirs(new_split_image_path, exist_ok=True)
                new_img_path = os.path.join(new_split_image_path, f"{new_id}.png")
                if not os.path.exists(new_img_path):
                    old_img_path = os.path.join(image_path, img_dir, split, f"{new_id}.png")
                    assert os.path.exists(old_img_path), f"missing source image: {old_img_path}"
                    # Copy rather than move so the original flat layout stays intact.
                    shutil.copy(old_img_path, new_img_path)
                item['img'] = os.path.join('./data/HarMeme_V1', image_folder_name, img_dir, split, label_folder, f"{new_id}.png")

                # Record the original size; close the file handle right away.
                with Image.open(new_img_path) as img:
                    width, height = img.size
                ori_sizes[split]['w'][item['id']] = width
                ori_sizes[split]['h'][item['id']] = height

                if use_processed_img:
                    # Resizing and the size bookkeeping are done by the project helper.
                    item, after_sizes = preprocess_resize_img(processed_image_paths, image_folder_name, split, new_split_image_path, new_img_path, item, after_sizes, MIN_LEN[data_name])

                # The caption is stored unchanged (clean_text is intentionally not applied).
                item['text'] = caption

                new_data.append(item)

                # ---- Merge into the GoatBench record, keeping GoatBench's own label ----
                if ori_id in GB_items:
                    gb_count += 1
                    one_gb = copy.deepcopy(item)
                    one_gb.pop("label")
                    one_gb["task"] = task_name_map[data_name]
                    ori_item = GB_items[ori_id]
                    GB_items[ori_id] = dict(**ori_item, **one_gb)

            if save_data_jsons:
                with open(os.path.join(save_to, f'{split}.json'), 'w') as f:
                    json.dump(new_data, f, indent=4)

            tmp = {'before': ori_sizes, 'after': after_sizes}
            view_img_size_statistics(tmp, split, len(new_data), data_name)

        print(f"Dataset of {data_name}")
        print(pprint.pformat(data_statistics))

    if save_data_jsons and GB_items:
        print(f"#GOATBENCH Instances = {gb_count} = {len(GB_items)}")
        with open(os.path.join(gb_save_to, 'test.json'), 'w') as f:
            json.dump(list(GB_items.values()), f, indent=4)
# Preprocess only the test and dev splits ('train' is not listed here):
# write the per-split JSON files, generate the resized images and attach captions.
splits = ['test', 'dev']
preprocess_HarMeme(splits, save_data_jsons=True, use_processed_img=True,  include_gt_captions=True)

Goatbench data loaded ...
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5425.png from 512x511 to 768x766.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5425.png from 512x511 to 768x766.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5426.png from 512x556 to 707x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5426.png from 512x556 to 707x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5429.png from 183x275 to 511x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5429.png from 183x275 to 511x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5430.png from 183x276 to 509x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5430.png from 183x276 to 509x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmful/5434.png from 168x300 to 430x768.
Enlarged ./HarMeme_Images/harmeme_images_covid_19/test/not_harmfu

### Generate Annotation Sheets

In [None]:
def harmeme_gen_annotation_sheet(source_paths):
    """Create or extend the caption-annotation sheets for HarMeme samples.

    For each dataset the sheet ``./caption_annotations/<Harm-C|Harm-P>/<split>.json``
    is loaded if it already exists, every sample id from the source files that is
    not yet present is added with an empty ``gt_caption``, and the sheet is
    written back. Existing entries are never overwritten.

    Args:
        source_paths: dict mapping a dataset key (``'harmc'`` or ``'harmp'``) to a
            list of JSON file paths. Each file holds a list of records with at
            least ``id`` and ``img``. The split name is the first path component
            after ``"<dataset>/"`` in the file path.

    Returns:
        None. Sheets are written to disk and a count per split is printed.

    Raises:
        KeyError: for an unknown dataset key or a record without ``id``/``img``.
        IndexError: if a source path does not contain ``"<dataset>/"``.
    """
    sep_name_map = {
        'harmc': "Harm-C",
        'harmp': "Harm-P"
    }
    for dataset, src_paths in source_paths.items():
        sep = f"{dataset}/"
        save_to = f"./caption_annotations/{sep_name_map[dataset]}"
        os.makedirs(save_to, exist_ok=True)

        # Parse each source's split once: ".../<dataset>/<split>/..." -> "<split>".
        src_splits = [(src, src.split(sep)[1].split("/")[0]) for src in src_paths]
        # Sorted so sheets are processed and reported in a deterministic order.
        split_names = sorted({sp for _, sp in src_splits})
        splits_save_to = {sp: os.path.join(save_to, f'{sp}.json') for sp in split_names}

        # Start from the existing sheet so earlier annotations are preserved.
        data_to_annotate = {}
        for sp, path in splits_save_to.items():
            if os.path.exists(path):
                with open(path) as f:
                    data_to_annotate[sp] = json.load(f)
            else:
                data_to_annotate[sp] = {}

        for src, split in src_splits:
            with open(src) as f:
                items = json.load(f)
            sheet = data_to_annotate[split]
            for item in items:
                if item['id'] not in sheet:
                    sheet[item['id']] = {
                        'img': item['img'],
                        'gt_caption': ''
                    }

        for sp, data in data_to_annotate.items():
            # Context manager guarantees the sheet is flushed and closed.
            with open(splits_save_to[sp], 'w') as f:
                json.dump(data, f, indent=4)
            print(f"#data to be annotated in {dataset}'s {sp}: {len(data)}")

# Result files whose samples should be added to the annotation sheets
# (presumably the misclassified test samples of a previous run -- verify).
# NOTE: these are absolute paths from the original author's machine; point
# them at your own results directory before re-running this cell.
sources = {
    "harmc": [
        "/data/fengjun/projects/LLM/meme/HMC/results/harmc/test/seed-42/qwen2.5-14bf/D6_llava1.6-7bf_len-1024_GPU-2_20250308023857/round-1_Decision-v0-v0/incorrect.json",
    ],
    "harmp": [
        "/data/fengjun/projects/LLM/meme/HMC/results/harmp/test/seed-42/qwen2.5-14bf/D6_llava1.6-7bf_len-1024_GPU-2_20250311021122/round-1_Decision-v0-v0/incorrect.json",
    ]
}
harmeme_gen_annotation_sheet(sources)

#data to be annotated in harmc's test: 75
