In [1]:
from pydantic import BaseModel

# Structured-output schema with a grounded flag and a list of bounding-box ids.
# NOTE(review): presumably the response schema for the single-screenshot
# prompt; nothing visible in this notebook references it by name -- confirm.
class Output_one(BaseModel):
    is_grounded: bool  # per the prompt: True when the expression is grounded in the boxes
    ids: list[str]  # per the prompt: ids of the boxes involved; left empty when not grounded

# Structured-output schema that adds the chosen bounding-box granularity to the
# grounded flag and id list.
# NOTE(review): an identical `Output_three` class is defined again further down
# in this file; the later definition rebinds the name.
class Output_three(BaseModel):
    level: str  # expected to be one of: paragraph, line, word (not enforced by the type)
    is_grounded: bool  # True when the expression can be grounded at `level`
    ids: list[str]  # ids of the bounding boxes involved


# Prompt for the single-screenshot setting: one annotated screenshot, one
# id -> text dict and one referring expression; the model is asked for a
# grounded flag plus the ids of the bounding boxes involved.
# NOTE(review): `system_prompt` is assigned again later in this file, so this
# text is replaced once that later assignment has run.
system_prompt = """
You are given an input consisting of 1. a screenshot annotated with bounding boxes, each assigned a unique ID, 2. a dict where the key is the ID of the bounding box and the value is the extracted text from that bounding box and 3. a referring expression that specifies a target text span.

Your task is to first identify the target text span given the screenshot and the referring expression. Then, check if the referring expression is grounded in the bounding boxes provided in the screenshot. That is, the coordinates of the start and end positions of the target text span should be inferable directly from the available bounding boxes' coordinates. For instance, if a bounding box encompasses an entire paragraph containing multiple sentences, referencing only a single sentence within it is not valid as you cannot infer the start and end positions of the sentence from the coordinates of the bounding box. Besides, the bounding box might be not accurate or suitbale enough (e.g. the id of bounding box is hidden by other bounding boxes or the id is not visible) for grounding the referring expression. For example, the bounding box is too large or too small for the target text span. If so, you should treat the referring expression as not grounded.

If the referring expression is grounded, you should output True and the involved id(s) of the bounding box that can be used to infer the start and end positions of the target text span. If the referring expression is not grounded, you should output False and leave the ids empty.
"""

# TODO: use 3 models...
# TODO: the logic and the grammar both still need to be refined with GPT.

# Structured-output schema for the three-granularity setting: the chosen level,
# the grounded flag and the ids of the bounding boxes involved.
# NOTE(review): this repeats, field for field, the `Output_three` class defined
# earlier in this file; the duplicate definition simply rebinds the same name.
class Output_three(BaseModel):
    level: str  # expected to be one of: paragraph, line, word (not enforced by the type)
    is_grounded: bool  # True when the expression can be grounded at `level`
    ids: list[str]  # ids of the bounding boxes involved (may be more than one)



# Prompt for the three-granularity setting: three screenshots of the same image
# (paragraph-, line- and word-level boxes), three id -> text dicts and one
# referring expression; the model is asked for the best level, a grounded flag
# and the ids involved. This assignment replaces the `system_prompt` bound
# earlier in this file.
# NOTE(review): the text contains spelling slips ("granualrity", "suitbale") and
# two consecutive sentences that both end in "output False for the is_grounded
# field"; it is left untouched here because the string is sent to the model
# verbatim.
# NOTE(review): the visible `call_llm` attaches one image and
# `ground_expression` passes one dict, while this prompt describes three of
# each -- confirm which setting is intended.
system_prompt = """
You are given an input consisting of 1. three screenshots annotated with bounding boxes, each assigned a unique ID. They are from the same image but with different granularities of bounding boxes. The first one is marked in the paragraph level, the second one is marked in the line level, and the third one is marked in the word level. 2. three dicts where the key is the ID of the bounding box and the value is the extracted text from that bounding box. They are based on the three screenshots with different levels of bounding box granularities. 3. a referring expression that specifies a target text span.

Your task is to first identify the target text span given the screenshots and the referring expression. After that, you have to decide which level of bounding box granularity is the most suitable for grounding the referring expression given three screenshots and dicts.
You should then decide if the referring expression can be grounded by the bounding boxes of the most suitable level of granularity; that is, the coordinates of the start and end positions of the target text span should be inferable directly from the available bounding boxes' coordinates. For instance, if a bounding box encompasses an entire paragraph containing multiple sentences, referencing only a single sentence within it is not valid as you cannot infer the start and end positions of the sentence from the coordinates of the bounding box. Therefore, you should first identify the most suitable level of bounding box granularity for grounding the referring expression. If the start and end coordinates of the target text span can be inferred from different screenshots with different levels of granualrity, you should select the one with most tight bounding box. If the bounding box is not inferable directly from the available bounding boxes' coordinates, you should output False for the is_grounded field. However, if all of them are not suitable, you should output False for the is_grounded field. Besides, the bounding box might be not accurate or suitbale at all (e.g. the id of bounding box is hidden by other bounding boxes or the id is not visible) for grounding the referring expression. For example, the bounding box is too large or too small for the target text span. If so, you should treat the referring expression as not grounded.

If the referring expression can be grounded, you should output True for the is_grounded field, the level of the bounding box granularity you used for grounding and the involved id(s) of the bounding box that can be used to infer the start and end positions of the target text span (Note that the target text span can span across multiple bounding boxes). If the referring expression is not grounded, you should output False for the is_grounded field.
"""

In [2]:
from openai import OpenAI
import base64

client = OpenAI()

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a data URL.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


In [3]:
# chat completions API

def call_llm(model, system_prompt, input_text, image_path, **kwargs):
    """Send one text-plus-image request through the chat completions ``parse`` call.

    Args:
        model: Model name forwarded to the API.
        system_prompt: Instruction text, sent as the ``developer`` message.
        input_text: Text part of the user message.
        image_path: Path of the image file attached to the user message as a
            base64 data URL.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.beta.chat.completions.parse`` (for example
            ``response_format``).

    Returns:
        Whatever ``client.beta.chat.completions.parse`` returns.
    """
    # Local import: only needed here, for the MIME-type lookup below.
    import mimetypes

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG (this notebook passes .png screenshots). Fall back to the
    # previous hard-coded value when the extension is unknown or not an image.
    mime_type, _ = mimetypes.guess_type(image_path)
    if mime_type is None or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    developer_message = {
        "role": "developer",
        "content": [
            {
                "type": "text",
                "text": system_prompt,
            }
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": input_text,
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}",
                },
            },
        ],
    }

    # `parse` is used rather than `chat.completions.create`; extra options such
    # as `response_format` arrive through **kwargs.
    response = client.beta.chat.completions.parse(
        model=model,
        messages=[developer_message, user_message],
        **kwargs
    )
    return response

In [11]:
import json
import os

def ground_expression(
    image_name,
    paresed_mode,
    referring_expression_category,
    referring_expression_model,
    to_ground_model,
    base_dir="/home/t-zeyiliao/OmniParser",
):
    """Ask a model to ground every referring expression of one image and save the answers.

    Reads the id -> text dict and the referring expressions for ``image_name``
    from ``base_dir``, calls ``call_llm`` once per expression with the annotated
    screenshot attached, and writes the parsed answers to a JSON file under
    ``<base_dir>/referring_expressions_grounded/``.

    Args:
        image_name: Image identifier used in all input/output file names.
        paresed_mode: Parsing mode used in the screenshot and dict file names
            (the driver cell passes "paragraph").
        referring_expression_category: Category part of the expressions file name.
        referring_expression_model: Model part of the expressions file name.
        to_ground_model: Model name passed to ``call_llm``; also part of the
            output file name.
        base_dir: Root directory holding the ``parsed_text_images``,
            ``parsed_text_text`` and ``referring_expressions`` folders. Defaults
            to the location that used to be hard-coded.

    Returns:
        None. The results are written to disk as a JSON object keyed by the
        expression index; an entry is ``null`` when the response carried no
        parsed object.

    Raises:
        OSError: If an input file cannot be read or the output cannot be written.
    """
    image_with_bbox_path = os.path.join(
        base_dir, "parsed_text_images", f"name-{image_name}_mode-{paresed_mode}.png"
    )
    input_dict_path = os.path.join(
        base_dir, "parsed_text_text", f"name-{image_name}_mode-{paresed_mode}.json"
    )

    expressions_stem = (
        f"referring-expressions_name-{image_name}"
        f"_model-{referring_expression_model}"
        f"_category-{referring_expression_category}"
    )
    referring_expression_path = os.path.join(
        base_dir, "referring_expressions", f"{expressions_stem}.json"
    )
    # Built directly rather than by str.replace on the input path, so a
    # ".json" or "/referring_expressions/" fragment elsewhere in the path
    # cannot be rewritten by accident.
    grounded_expression_path = os.path.join(
        base_dir,
        "referring_expressions_grounded",
        f"{expressions_stem}_parsed-mode-{paresed_mode}_grounded-by-{to_ground_model}.json",
    )
    os.makedirs(os.path.dirname(grounded_expression_path), exist_ok=True)

    with open(input_dict_path, 'r', encoding='utf-8') as f:
        input_dict = json.load(f)

    with open(referring_expression_path, 'r', encoding='utf-8') as f:
        referring_expressions = json.load(f)

    # Identical for every expression, so it is serialised once.
    # TODO (from the original inline note): the dict passed here was flagged
    # as "needs to be changed".
    dict_section = (
        'The dict where the key is the id of the bounding box and the value is the text of the bounding box.'
        + '\n\n'
        + json.dumps(input_dict)
    )

    res_save_dict = {}
    for idx, referring_expression in enumerate(referring_expressions['expressions']['expressions']):
        input_text = dict_section + '\n\n' + 'The referring expression is ' + referring_expression

        # NOTE(review): `system_prompt` is read from notebook scope; the last
        # assignment in this file describes three screenshots while a single
        # screenshot is sent here -- confirm which prompt is intended.
        # NOTE(review): the code used to pass `Output`, which is not defined
        # in this notebook; `Output_one` is the schema without a `level`
        # field, matching a single screenshot -- confirm.
        res = call_llm(
            model=to_ground_model,
            input_text=input_text,
            image_path=image_with_bbox_path,
            system_prompt=system_prompt,
            response_format=Output_one,
        )

        # `parsed` can be None when no structured object was produced; record
        # that instead of raising and discarding the answers already collected.
        parsed = res.choices[0].message.parsed
        res_save_dict[idx] = dict(parsed) if parsed is not None else None

    with open(grounded_expression_path, 'w', encoding='utf-8') as f:
        json.dump(res_save_dict, f, indent=4)

In [13]:

# Which parsed screenshot / id -> text dict to ground against.
image_name = "test"
paresed_mode = "paragraph"

# Which set of referring expressions to load.
referring_expression_category = "positional"
referring_expression_model = "gpt-4o"

# Candidate grounding models. Each assignment overrides the one before it, so
# only the last value is used by the call below.
to_ground_model = "o4-mini-2025-04-16"
to_ground_model = "gpt-4o-mini"
to_ground_model = "gpt-4o"

ground_expression(
    image_name=image_name,
    paresed_mode=paresed_mode,
    referring_expression_category=referring_expression_category,
    referring_expression_model=referring_expression_model,
    to_ground_model=to_ground_model,
)