In [1]:
import json
import os
import pandas as pd    
import pprint
import shutil
import copy
import statistics
from PIL import Image
from IPython.display import Image as IPythonImage
from IPython.display import display
import sys
sys.path.append("../../")
from utils.tool import resize_image, preprocess_resize_img, view_img_size_statistics

In [2]:
# Sentinel "no upper bound" length: a MAX_LEN entry that is not smaller than this
# value gets no `_<max_l>` suffix on its processed-image folder name
# (see preprocess_MultiOFF).
default_max_l = 102400
# Embedded in every processed-image folder name and forwarded to
# preprocess_resize_img; presumably the minimum image side length in pixels
# -- TODO confirm against utils.tool.
MIN_LEN = 512
# One processed-image folder is prepared per entry of this list.
MAX_LEN = [default_max_l, 1280]#default_max_l, 1024

def clean_text(text):
    """Re-attach detached punctuation/contractions and collapse whitespace.

    Each token below that is preceded by a single space is glued back onto the
    preceding word (e.g. ``"do n't"`` -> ``"don't"``, ``"end ."`` -> ``"end."``).
    The replacements run one after another in the listed order; afterwards all
    whitespace runs are squeezed to one space and the ends are trimmed.
    """
    detached_tokens = (".", "n't", "'ll", "'re", "'s", ":", ",", "?", "!")
    for token in detached_tokens:
        text = text.replace(" " + token, token)
    # split() without arguments already drops leading/trailing whitespace.
    words = text.split()
    return " ".join(words)

def _load_goatbench_labels(jsonl_path):
    """Read a GoatBench JSONL file into ``{img: {'label': int}}``; the first record per image wins."""
    labels = {}
    with open(jsonl_path, 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            one_data = json.loads(line)
            if one_data["img"] not in labels:
                labels[one_data["img"]] = {'label': int(one_data["label"])}
    return labels


def preprocess_MultiOFF(
    root="./",
    save_data_jsons=False,
    include_gt_captions=False,
    use_processed_img=False
):
    """Convert the MultiOFF CSV splits into per-split item lists and sorted image folders.

    For every row of ``./MultiOFF_Dataset/Split Dataset/<Split>_meme_dataset.csv``
    the function assigns a running string id, maps the label to 1 (``'offensive'``)
    or 0 (anything else), copies the image to
    ``./MultiOFF_Dataset/Labelled Images/<split>/<label>/<id>.png`` if it is not
    there yet, records the image size and stores the cleaned caption under
    ``'text'``. Items whose original image name occurs in the GoatBench
    offensiveness test set are additionally merged into the GoatBench records.

    Args:
        root: Base directory for the output JSON files, which go to
            ``<root>/data/<split>.json``. The dataset itself is always read
            from ``./MultiOFF_Dataset`` regardless of this value.
        save_data_jsons: When True, write one JSON file per split and, if any
            GoatBench records were loaded, the merged GoatBench ``test.json``.
        include_gt_captions: When True, add a ``'gt_description'`` field to every
            item, filled from ``./caption_annotations/<split>.json`` when that
            file exists and has an entry for the item id (empty string otherwise).
        use_processed_img: When True, create the processed-image folders (one per
            ``MAX_LEN`` entry) and pass every item through ``preprocess_resize_img``.

    Returns:
        None. Results are reported through files, copied images and printed statistics.
    """
    # ---------------------------- GoatBench -----------------------------
    gb_count = 0
    gb_folder = "../../data/GoatBench/offensiveness/"
    gb_save_to = "../../data/GoatBench/data/offensiveness/"
    # Defined unconditionally so later code never depends on the branch below.
    gb_statistics = {'offensive': 0, 'non-offensive': 0}
    GB_items = {}
    if os.path.exists(gb_folder):
        GB_items = _load_goatbench_labels(os.path.join(gb_folder, "test.jsonl"))
        os.makedirs(gb_save_to, exist_ok=True)
        print("Goatbench data loaded ...")

    # ------------------------------ paths -------------------------------
    save_to = os.path.join(root, 'data')
    # Kept separate from the `root` parameter, which only controls `save_to`.
    dataset_root = "./MultiOFF_Dataset"
    os.makedirs(save_to, exist_ok=True)
    image_folder_name = 'Labelled Images'
    image_path = os.path.join(dataset_root, image_folder_name)

    processed_image_paths = {}
    if use_processed_img:
        for max_l in MAX_LEN:
            p_folder = f"{image_folder_name}_{MIN_LEN}"
            if max_l < default_max_l:
                p_folder += f"_{max_l}"
            one_pp = os.path.join(dataset_root, p_folder)
            processed_image_paths[max_l] = (p_folder, one_pp)
            os.makedirs(one_pp, exist_ok=True)

    splits = {'test': 'Testing', 'dev': "Validation", 'train': "Training"}
    data_statistics = {sp: {'offensive': 0, 'non-offensive': 0} for sp in splits}
    # Image sizes per split, keyed by item id.
    ori_sizes = {sp: {'w': {}, 'h': {}} for sp in splits}
    after_sizes = {}
    if use_processed_img:
        after_sizes = {max_l: {sp: {'w': {}, 'h': {}} for sp in splits} for max_l in MAX_LEN}

    # Running id shared by all splits (named to avoid shadowing the builtin `id`).
    sample_id = 0
    for split, sp in splits.items():
        # Optional hand-written captions: {"12": {'img': ..., 'gt_caption': ...}}
        annotations = {}
        if include_gt_captions:
            anno_file = os.path.join("./caption_annotations", f"{split}.json")
            if os.path.exists(anno_file):
                # json.load needs a file object, not a path string.
                with open(anno_file) as f:
                    annotations = json.load(f)

        new_data = []
        file_path = os.path.join(dataset_root, 'Split Dataset', f"{sp}_meme_dataset.csv")
        split_df = pd.read_csv(file_path)
        # Round-trip through JSON to get plain Python records.
        ori_data = json.loads(split_df.to_json(orient="records"))

        for item in ori_data:
            sample_id += 1
            item['id'] = str(sample_id)
            ori_label = item.pop('label')
            if ori_label == 'offensive':
                item['label'] = 1
                data_statistics[split]['offensive'] += 1
            else:
                item['label'] = 0
                data_statistics[split]['non-offensive'] += 1

            if include_gt_captions:
                item['gt_description'] = ""
                if annotations and (item["id"] in annotations):
                    item['gt_description'] = annotations[item["id"]]["gt_caption"]

            # -------------------- Copy and move image --------------------
            img_name = item.pop('image_name')
            label_folder = str(item['label'])
            new_split_image_path = os.path.join(image_path, split, label_folder)
            os.makedirs(new_split_image_path, exist_ok=True)
            new_img_path = os.path.join(new_split_image_path, f"{sample_id}.png")
            if not os.path.exists(new_img_path):
                print("Copy and paste the image to the new folder ...")
                ori_img_path = os.path.join(image_path, img_name)
                assert os.path.exists(ori_img_path), f"Missing source image: {ori_img_path}"
                shutil.copy(ori_img_path, new_img_path)
            item['img'] = os.path.join('./data/MultiOFF', 'MultiOFF_Dataset', 'Labelled Images', split, label_folder, f"{sample_id}.png")

            # Record the original image size; the context manager closes the file handle.
            with Image.open(new_img_path) as img:
                width, height = img.size
            ori_sizes[split]['w'][item['id']] = width
            ori_sizes[split]['h'][item['id']] = height
            if use_processed_img:
                item, after_sizes = preprocess_resize_img(processed_image_paths, image_folder_name, split, new_split_image_path, new_img_path, item, after_sizes, MIN_LEN)
            # -------------------------- move image -------------------------- #
            caption = item.pop('sentence')
            item['text'] = clean_text(caption)
            new_data.append(item)

            # Direct dict membership test; an empty GB_items simply never matches.
            if img_name in GB_items:
                gb_count += 1
                one_gb = copy.deepcopy(item)
                one_gb.pop("label")
                one_gb["task"] = "multioff"
                ori_item = GB_items[img_name]
                # NOTE(review): dict(**a, **b) raises TypeError on overlapping keys,
                # which happens if the same image name is matched a second time.
                GB_items[img_name] = dict(**ori_item, **one_gb)
                lb = 'offensive' if ori_item['label'] == 1 else 'non-offensive'
                gb_statistics[lb] += 1

        if save_data_jsons:
            with open(os.path.join(save_to, f'{split}.json'), 'w') as f:
                json.dump(new_data, f, indent=4)
            print(f"{split} finished!")

        # ------------------------img size statistics---------------------------
        size_report = {'before': ori_sizes, 'after': after_sizes}
        view_img_size_statistics(size_report, split, len(new_data), "MultiOFF")

    print(pprint.pformat(data_statistics))
    if save_data_jsons and GB_items:
        print(f"#GOATBENCH Instances = {gb_count} = {len(GB_items)}")
        with open(os.path.join(gb_save_to, 'test.json'), 'w') as f:
            json.dump(list(GB_items.values()), f, indent=4)
    if GB_items:
        print("GoatBench_Offensiveness")
        print(pprint.pformat(gb_statistics))
# Run the MultiOFF preprocessing without writing the split JSON files;
# use_processed_img=True additionally routes every image through preprocess_resize_img.
preprocess_MultiOFF(save_data_jsons=False, use_processed_img=True)

Goatbench data loaded ...
Enlarged ./MultiOFF_Dataset/Labelled Images/test/1/1.png from 347x499 to 356x511.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/1/1.png from 347x499 to 356x511.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/1/3.png from 350x500 to 358x512.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/1/3.png from 350x500 to 358x512.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/0/5.png from 220x500 to 225x512.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/0/5.png from 220x500 to 225x512.
Compressed ./MultiOFF_Dataset/Labelled Images/test/0/6.png from 2208x1242 to 1280x720.
Compressed ./MultiOFF_Dataset/Labelled Images/test/1/15.png from 1478x1264 to 1280x1094.
Compressed ./MultiOFF_Dataset/Labelled Images/test/1/16.png from 2188x1435 to 1280x839.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/0/17.png from 480x480 to 512x512.
Enlarged ./MultiOFF_Dataset/Labelled Images/test/0/17.png from 480x480 to 512x512.
Compressed ./MultiOFF_Dataset/Labelled Images/test/1

In [10]:
def multioff_gen_annotation_sheet(source_paths):
    """Create or extend the per-split caption annotation sheets.

    Every source file is a JSON list of items carrying at least ``'id'`` and
    ``'img'``. The split of a source is the first path component after
    ``"multioff/"``. For each split, ``./caption_annotations/<split>.json`` is
    loaded if it exists, an entry with an empty ``'gt_caption'`` is added for
    every item id not yet present, and the sheet is written back. Entries that
    already exist are left untouched, so existing annotations survive re-runs.

    Args:
        source_paths: Iterable of paths to JSON files; each path must contain
            ``"multioff/<split>/"`` (an IndexError is raised otherwise).

    Returns:
        None. The number of entries per split is printed.
    """
    sep = "multioff/"
    save_to = "./caption_annotations"
    os.makedirs(save_to, exist_ok=True)

    def split_of(src):
        # First path component following the "multioff/" marker.
        return src.split(sep)[1].split("/")[0]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # processing and printing order is deterministic across runs.
    splits = list(dict.fromkeys(split_of(src) for src in source_paths))
    splits_save_to = {sp: os.path.join(save_to, f'{sp}.json') for sp in splits}

    data_to_annotate = {}
    for sp, path in splits_save_to.items():
        if os.path.exists(path):
            with open(path) as f:
                data_to_annotate[sp] = json.load(f)
        else:
            data_to_annotate[sp] = {}

    for src in source_paths:
        sheet = data_to_annotate[split_of(src)]
        with open(src) as f:
            items = json.load(f)
        for item in items:
            # JSON object keys are always strings after a save/load round trip;
            # normalising here keeps non-string ids from being re-added on the
            # next run and clobbering an existing annotation.
            item_id = str(item['id'])
            if item_id not in sheet:
                sheet[item_id] = {
                    'img': item['img'],
                    'gt_caption': ''
                }

    for sp, data in data_to_annotate.items():
        with open(splits_save_to[sp], 'w') as f:
            json.dump(data, f, indent=4)
        print(f"#data to be annotated in {sp}: {len(data)}")

# NOTE(review): absolute, machine-specific path -- this cell will not run on another
# machine; consider deriving it from a configurable results directory.
sources = ["/data/fengjun/projects/LLM/meme/HMC/results/multioff/test/seed-42/qwen2.5-14bf/D6_llava1.6-7bf_len-1024_GPU-2_20250321113908/round-1_Decision-v0-v0/incorrect.json"]
# Create/extend ./caption_annotations/<split>.json with an empty gt_caption entry
# for every item id found in the source files.
multioff_gen_annotation_sheet(sources)

#data to be annotated in test: 61
