Evaluation inspired by https://github.com/SiatMMLab/Awesome-Diffusion-Model-Based-Image-Editing-Methods/blob/main/EditEval_v1/Metric/LMM_Score_GPT4V_Prompt_Template.md


In [1]:
import base64
from openai import OpenAI
import os
import json 
import yaml

# Shared OpenAI API client; the evaluation loop further down calls
# client.chat.completions.create(...) on it.
# NOTE(review): no API key is passed here, so credentials presumably come from
# the environment — confirm before running.
client = OpenAI()


In [2]:
from PIL import Image
import io

def encode_image(path, size=(512, 512)):
    """Load an image file and return it as a base64-encoded PNG string.

    The image is converted to RGB, resized to ``size`` with bicubic
    resampling, serialized as PNG into memory, and base64-encoded.

    Args:
        path: Path of the image file to open.
        size: Target ``(width, height)`` passed to ``Image.resize``.

    Returns:
        The base64 text (``str``) of the resized PNG.
    """
    png_buffer = io.BytesIO()
    with Image.open(path) as opened:
        # Force three channels before resizing.
        rgb = opened.convert("RGB")
        resized = rgb.resize(size, Image.BICUBIC)
        resized.save(png_buffer, format="PNG")
        encoded = base64.b64encode(png_buffer.getvalue())
        return encoded.decode("utf-8")
# ---------------------------
# Load images
# ---------------------------
def load_images(folder):
    """Load and encode the source image and the edited images of one folder.

    The source image is ``A.png``. Edited images are the files whose name
    starts with ``"C"``; a file is kept only when the module marker
    (``'unet.'`` or ``'transformer.'``, chosen from the first sorted file
    name) occurs at most once in its name.

    Args:
        folder: Directory containing ``A.png`` and the ``C*`` images.

    Returns:
        A tuple ``(source, edited_imgs, layers_images)`` where ``source`` is
        the base64 source image, ``edited_imgs`` the base64 edited images and
        ``layers_images`` the matching file names with ``'C_skips_'`` and
        ``'.png'`` removed (same order as ``edited_imgs``).

    Raises:
        FileNotFoundError: If the folder contains no file starting with "C"
            (previously this surfaced as a bare ``IndexError``).
    """
    source = encode_image(os.path.join(folder, "A.png"))

    # List the directory once; both the marker detection and the final
    # selection work on this same sorted snapshot.
    candidates = sorted(f for f in os.listdir(folder) if f.startswith("C"))
    print(len(candidates))
    if not candidates:
        raise FileNotFoundError(
            f"No edited images (files starting with 'C') found in {folder!r}"
        )

    obj = 'unet.' if 'unet.' in candidates[0] else 'transformer.'
    # Keep names where the marker appears zero or one time
    # (equivalent to len(name.split(obj)) < 3).
    edited = [f for f in candidates if f.count(obj) < 2]
    print(obj, len(edited))

    edited_imgs = [encode_image(os.path.join(folder, e)) for e in edited]
    layers_images = [e.replace('C_skips_', '').replace('.png', '') for e in edited]
    return source, edited_imgs, layers_images

# --------------------------
# Helper: build the evaluation conversation
# --------------------------

def build_content(TASK, INSTRUCTION, source_img, edited_imgs, layers):
    """Build the chat messages for the image-editing evaluation prompt.

    The conversation consists of: an invitation and a canned assistant reply,
    Step 1 (source image, task indicator and instruction), Step 2 (the four
    scoring factors), one user turn per edited image, and a final request to
    score every image and return the result as JSON.

    Args:
        TASK: Task indicator text inserted into Step 1.
        INSTRUCTION: Editing instruction text inserted into Step 1.
        source_img: Base64-encoded PNG of the source image.
        edited_imgs: Base64-encoded PNGs of the edited images.
        layers: Label for each edited image, in the same order as
            ``edited_imgs``.

    Returns:
        A list of message dicts (``role`` / ``content``).

    Raises:
        ValueError: If ``edited_imgs`` and ``layers`` differ in length, since
            the prompt announces ``len(layers)`` images.
    """
    if len(edited_imgs) != len(layers):
        raise ValueError(
            f"Got {len(edited_imgs)} edited images but {len(layers)} layer labels"
        )

    def text_part(text):
        # Typed text part: every item of a content array is an object.
        return {"type": "text", "text": text}

    def image_part(b64_png):
        # The images are PNG-encoded, so the data URL must say image/png.
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{b64_png}"},
        }

    # --------------------------------------------
    # Build the full conversation up to scoring
    # --------------------------------------------
    messages = [
        {
            "role": "user",
            "content": (
                "I invite you to participate in an experiment on "
                "\"Evaluation of Image Editing Performance with Different Methods\". "
                "Are you ready?"
            )
        },
        {
            "role": "assistant",
            "content": "I'm ready and intrigued! Let's begin."
        },
        {
            "role": "user",
            "content": [
                text_part("Step 1: Here is the source image, task indicator, and editing instruction."),
                image_part(source_img),
                text_part(f"Task indicator: {TASK}."),
                text_part(f"Instruction: {INSTRUCTION}."),
                text_part("Please describe the source image and describe the editing process."),
            ],
        },
    ]

    # --------------------------------------------
    # Add Step 2 instructions
    # --------------------------------------------
    messages.append({
        "role": "user",
        "content": (
            f"Step 2: Now I will upload {len(layers)} edited images. "
            "For each: give detailed analysis on 4 factors. The scores must be from 0 to 10. "
            "(Editing Accuracy, Contextual Preservation, Visual Quality, Logical Realism). "
            """
            1. Editing Accuracy: Evaluate how closely the edited image adheres to the specified editing instruction, measuring the precision of the editing. 

            2. Contextual Preservation: Assess how well the edited image maintains the context and structure of the source image that should not be changed. 
            
            3. Visual Quality: Assess the overall visual quality of the edited image. 
            
            4. Logical Realism: Evaluate how logically realistic the edited image is in terms of adherence to natural physical laws. 
            
            """
            "Do NOT score yet. After all images, I will ask for scores."
        )
    })

    # Add each edited image as a separate conversational turn
    for label, img in zip(layers, edited_imgs):
        messages.append({
            "role": "user",
            "content": [
                text_part(f"Edited image {label}:"),
                image_part(img),
            ]
        })

    # --------------------------------------------
    # Step 3: Ask for scoring including S_all calc
    # --------------------------------------------
    messages.append({
        "role": "user",
        "content": (
            f"Now score all {len(layers)} images with four sub-scores (S_acc, S_pre, S_qua, S_real). "
            "Avoid identical scores. Then compute S_all = "
            "0.4*S_acc + 0.3*S_pre + 0.2*S_qua + 0.1*S_real for each. "
            "You absolutely must return the results as a json containing the layer names as keys, and the subkeys being the metrics with values the scores."
        )
    })
    return messages


def extract_json_from_text(text: str):
    """
    Return the first balanced ``{...}`` block found in *text*, as a string.

    The scan starts at the first ``{`` and stops at its matching ``}``.
    Braces inside double-quoted string values are ignored, so a value such as
    ``"}"`` does not end the block early. Trailing commas directly before a
    closing ``}`` or ``]`` (outside of strings) are removed from the result.

    The block is returned as text, not parsed; the caller decides how to
    decode it.

    Raises:
        ValueError: If *text* contains no ``{`` or the braces never balance.
    """
    # Function-scope import: the file only imports `re` in a later cell.
    import re

    start = text.find("{")
    if start == -1:
        raise ValueError("No '{' found in text")

    depth = 0
    in_string = False
    escaped = False
    end = -1

    for i in range(start, len(text)):
        ch = text[i]
        if in_string:
            # Inside a string only an unescaped quote is significant.
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
            continue
        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                end = i
                break

    if end == -1:
        raise ValueError("Unbalanced braces: no complete JSON object found in text")

    block = text[start:end + 1]

    # Drop trailing commas before } or ]. String literals are matched first
    # and passed through unchanged so their content is never edited.
    def _strip_trailing_comma(match):
        if match.group(1) is not None:
            return match.group(1)
        return match.group(2)

    return re.sub(
        r'("(?:\\.|[^"\\])*")|,\s*([}\]])',
        _strip_trailing_comma,
        block,
    )


In [4]:
from glob import glob
from PIL import Image
import re

# Score the edited images of every sample folder with the vision model and
# write one results.json per folder.
import ast  # safe fallback parser for replies written as Python literals

models = ["outputs_stable-diffusion-3.5-large-turbo_controlled"] #"outputs_FLUX.1-schnell_controlled", "outputs_PixArt-XL-2-1024-MS_controlled",
for model in models:
    # Directory of this model's outputs; kept separate from the per-sample
    # loop variable FOLDER below.
    MODEL_DIR = f"../outputs_per_model/{model}"
    folders = glob(MODEL_DIR + '/*')[:30]
    print(len(folders))
    TASK = "Object and Style Change"
    # When True, the first prompt in the folder name is treated as the edit
    # target (B) and the second as the source description (A).
    transf = True
    OUT_FOLDER = 'output_scores'

    for FOLDER in folders:
        try:
            source, edits, layers = load_images(FOLDER)

            # The folder name contains the marker twice, giving a prefix and
            # two prompt descriptions.
            _, first_prompt, second_prompt = FOLDER.split('/')[-1].split('A_high-resolution_image_of_a_')
            if transf:
                prompt_b, prompt_a = first_prompt, second_prompt
            else:
                prompt_a, prompt_b = first_prompt, second_prompt

            INSTRUCTION = f"Change an image described by {prompt_a.replace('_',' ')} to an image containing {prompt_b.replace('_',' ')}"
            print(INSTRUCTION)

            messages = build_content(TASK, INSTRUCTION, source, edits, layers)

            # --------------------------------------------
            # Call the API (choose a model)
            # --------------------------------------------
            response = client.chat.completions.create(
                model="gpt-5-mini-2025-08-07",   # or "gpt-4.1-mini", "gpt-4o-mini", "gpt-4.1-vision"
                messages=messages,
            )
            output = response.choices[0].message.content

            # Never eval() model output: decode it as data only. json.loads is
            # tried first; ast.literal_eval accepts Python-literal style
            # replies (single quotes, trailing commas) without executing code.
            scores = extract_json_from_text(output)
            if isinstance(scores, str):
                try:
                    scores = json.loads(scores)
                except json.JSONDecodeError:
                    scores = ast.literal_eval(scores)

            # Attach provenance to every per-layer score entry.
            for entry in scores.values():
                entry["folder"] = FOLDER
                entry['A'] = prompt_a
                entry['B'] = prompt_b

            # NOTE(review): FOLDER starts with "../", so this joined path
            # resolves outside OUT_FOLDER. Kept unchanged so existing result
            # locations stay valid — confirm whether that is intended.
            out_dir = os.path.join(OUT_FOLDER, FOLDER)
            os.makedirs(out_dir, exist_ok=True)

            with open(os.path.join(out_dir, "results.json"), "w") as f:
                json.dump(scores, f, indent=2)

        except Exception as e:
            # Best effort: report which folder failed and continue.
            print(f"[skipped] {FOLDER}: {type(e).__name__}: {e}")

    print("\n✅ All scores saved to results.json")


29
37
transformer. 37
Change an image described by gold pocket watch in the volcanic landscape, baroque painting style to an image containing hot air balloon in the desert, Ukiyo-e japanese woodblock style 
37
transformer. 37
Change an image described by katana sword in the moon surface base, noir comic book style to an image containing lunar telescope in the retro 80s arcade hall, baroque painting style 
37
transformer. 37
Change an image described by retro arcade machine in the marble palace interior, liquid metal reflective style to an image containing futuristic drone in the robot factory assembly line, chalkboard drawing 
37
transformer. 37
Change an image described by casette player in the volcanic landscape, digital matte painting to an image containing vintage camera in the cyberpunk neon district, AI-generated abstract fractal style 
37
transformer. 37
Change an image described by vintage bicycle in the Tokyo night street, biomechanical art to an image containing marble statue