In [1]:
from pydantic import BaseModel

class Output(BaseModel):
    # Structured-output schema: this class is passed as `response_format`
    # when calling the model, and the parsed reply is read back field by field.
    # (Kept as `#` comments rather than a docstring so the generated schema is unchanged.)

    # Verdict requested by the system prompt: True when the referring
    # expression is grounded in the provided bounding boxes.
    is_grounded: bool
    # IDs of the bounding boxes involved; the system prompt asks for this to
    # be left empty when the expression is not grounded.
    ids: list[str]

# Instruction text for the grounding check. It describes the three inputs
# (annotated screenshot, id -> text dict, referring expression) and asks for a
# True/False verdict plus the ids of the involved bounding boxes.
# This is runtime prompt text sent to the model: edit wording deliberately.
system_prompt = """
You are given an input consisting of 1. a screenshot annotated with bounding boxes, each assigned a unique ID, 2. a dict where the key is the ID of the bounding box and the value is the extracted text from that bounding box and 3. a referring expression that specifies a target text span.

Your task is to first identify the target text span given the screenshot and the referring expression. Then, check if the referring expression is grounded in the bounding boxes provided in the screenshot. That is, the coordinates of the start and end positions of the target text span should be inferable directly from the available bounding boxes' coordinates. For instance, if a bounding box encompasses an entire paragraph containing multiple sentences, referencing only a single sentence within it is not valid as you cannot infer the start and end positions of the sentence from the coordinates of the bounding box.

If the referring expression is grounded, you should output True and the involved id(s) of the bounding box that can be used to infer the start and end positions of the target text span. If the referring expression is not grounded, you should output False and leave the ids empty.
"""

In [2]:
from openai import OpenAI
import base64

# Module-level OpenAI client shared by the request helper below.
# No arguments are passed, so credentials and endpoint come from the SDK
# defaults (e.g. the OPENAI_API_KEY environment variable), not from this notebook.
client = OpenAI()

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded to ``str`` using UTF-8.
    """
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")


In [5]:
# chat completions API

def call_llm(model, system_prompt, input_text, image_path, **kwargs):
    """Send one text + image prompt through the chat-completions ``parse`` helper.

    Args:
        model: Model name forwarded to the API.
        system_prompt: Instruction text, sent as the ``developer`` message.
        input_text: Text part of the ``user`` message.
        image_path: Path of the image file attached to the ``user`` message
            as a base64 data URL.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``client.beta.chat.completions.parse`` (e.g. ``response_format``).

    Returns:
        Whatever ``client.beta.chat.completions.parse`` returns.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    base64_image = encode_image(image_path)

    # Label the data URL with the file's actual image type instead of always
    # claiming JPEG (the notebook feeds PNG screenshots). Unknown or non-image
    # extensions fall back to the previous hard-coded "image/jpeg".
    guessed_type = mimetypes.guess_type(str(image_path))[0]
    if guessed_type is not None and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"

    developer_message = {
        "role": "developer",
        "content": [
            {
                "type": "text",
                "text": system_prompt,
            }
        ],
    }
    user_message = {
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": input_text,
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}",
                },
            },
        ],
    }

    response = client.beta.chat.completions.parse(
        model=model,
        messages=[developer_message, user_message],
        **kwargs
    )
    return response

In [11]:
import json
# Inputs for one grounding run: the annotated screenshot, the id -> text dict
# extracted from it, and the referring expressions to check.
# NOTE(review): these are absolute, machine-specific paths; consider deriving
# them from a configurable base directory so the notebook runs elsewhere.
image_with_bbox_path = "/home/t-zeyiliao/OmniParser/parsed_text_images/test_mode-paragraph.png"
input_dict_path = "/home/t-zeyiliao/OmniParser/parsed_text_text/test_mode-paragraph.json"
referring_expression_path = "/home/t-zeyiliao/OmniParser/referring_expressions/test_referring_expressions_model-o4-mini-2025-04-16_category-positional.json"

# Model used for the grounding check. Earlier assignments to
# "o4-mini-2025-04-16" and "gpt-4o-mini" were immediately overwritten, so only
# this value ever took effect.
model = "gpt-4o"

with open(input_dict_path, 'r', encoding='utf-8') as f:
    input_dict = json.load(f)

with open(referring_expression_path, 'r', encoding='utf-8') as f:
    referring_expressions = json.load(f)

# Part of the prompt that is identical for every expression; built once.
# The separator is a real blank line ('\n\n'); the previous '/n/n' was a typo
# that put the literal characters "/n/n" into the prompt.
prompt_prefix = (
    'The dict where the key is the id of the bounding box and the value is the text of the bounding box.'
    + '\n\n'
    + json.dumps(input_dict)
)

# Maps the expression index to the model's structured verdict.
res_save_dict = {}
for idx, referring_expression in enumerate(referring_expressions['expressions']):
    input_text = prompt_prefix + '\n\n' + 'The referring expression is ' + referring_expression

    res = call_llm(model=model, input_text=input_text, image_path=image_with_bbox_path, system_prompt=system_prompt, response_format=Output)

    # `parsed` is None when the model returns a refusal instead of structured
    # output; record null for that entry rather than crashing and losing the
    # results collected so far.
    parsed = res.choices[0].message.parsed
    res_save_dict[idx] = dict(parsed) if parsed is not None else None

# Results are written next to the expressions file with a "_grounded" suffix.
grounded_output_path = referring_expression_path.replace(".json", "_grounded.json")
with open(grounded_output_path, 'w', encoding='utf-8') as f:
    json.dump(res_save_dict, f, indent=4)

In [10]:
# Show the parsed structured output held in `res` as a plain dict.
dict(res.choices[0].message.parsed)

{'is_grounded': True, 'ids': ['5']}

In [17]:
# NOTE(review): the saved output below is a Responses-API `Response` object
# (id 'resp_...', `ResponseOutputMessage`), which call_llm's chat-completions
# request does not produce. Together with the out-of-order execution counts,
# this indicates stale kernel state; re-run after Restart Kernel -> Run All.
print(res)

Response(id='resp_6859f0853b9c81a094b16ff4a923c99f0a730c731d4721a3', created_at=1750724741.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-2024-08-06', object='response', output=[ResponseOutputMessage(id='msg_6859f086678881a08cf338c2c7fde9d30a730c731d4721a3', content=[ResponseOutputText(annotations=[], text='## Involved Id(s): 6\nReferring Expression: Select the text beginning with "Note 1: [\'ch_Sim\', \'en\'] is the list of languages..."\nCategory: Lexical\n\n## Involved Id(s): 13\nReferring Expression: Select the text about downloading model weights for the chosen language.\nCategory: Semantic\n\n## Involved Id(s): 14\nReferring Expression: Select the text about running the model in CPU-only mode if you don\'t have a GPU.\nCategory: Semantic', type='output_text', logprobs=None)], role='assistant', status='completed', type='message')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[], top_p=1.0, background=False, max_output