In [None]:
# Combined prompt: covers all four base categories (semantic, positional, lexical, visual)
# plus their compositions in a single request. Unlike the per-category prompts in this
# notebook, it expects a bounding-box-annotated screenshot together with an id -> text dict,
# and asks for the involved bounding-box id(s) alongside each expression.
# NOTE(review): this prompt is not registered in `system_prompt_map`, so it is not reachable
# through `generate_referring_expressions` as written.
# NOTE(review): the text contains a typo ("pargagraph(s)"); it is left untouched here because
# editing the string changes what is sent to the model.
system_prompt_all = """
You are given an input consisting of a screenshot annotated with bounding boxes, each assigned a unique ID and a dict where the key is the ID of the bounding box and the value is the extracted text from that bounding box. Your task consists of two subtasks: First, identify the regions where a user is likely to interact with the content by performing a mouse drag action, based on the screenshot and the dict; and Second, generate a natural language referring expression that specifies a target text span contained within the identified region(s), either entirely within a single bounding box or spanning multiple bounding boxes.

For each referring expression, you must classify it into one or more of the following four base categories or their compositions. Each category is defined below, along with illustrative examples.

1. Semantic:

##Definition##
Describes the target text based on its meaning, intent, or topical content.

##Examples##
a.Select the paragraph discussing how to download models.
b.Select the lines that infer the causes of failure.
c.Select the sentence about Kobe Bryant’s career.
d.Select consecutive words referring to the weight of the MacBook Pro.

2. Positional:

##Definition##
Refers to the location of the text—either in absolute terms (e.g., top, bottom of the page) or relative to other visual elements.

##Examples##
 a. Select the paragraph at the bottom of the page.
 b. Select the first three lines from the top.
 c. Select the sentence immediately below the chart title.
 d. Select the words on the left side of the login button.

3. Lexical:

##Definition##
Describes the text by referencing its literal or quoted content, including the starting words, key phrases, or exact match.

##Examples##
 a. Select the paragraph that begins with “To get started with Python…”.
 b. Select the lines ending with “before submission is due”.
 c. Select the sentence containing the phrase “AI is transforming industries”.
 d. Select the words that say “Monday, Tuesday, and so on”.

4. Visual:

##Definition##
Refers to distinctive visual features of the text, such as font color, size, emphasis, or highlighting.

##Examples##
 a. Select the paragraph written in bold italics.
 b. Select the lines highlighted in yellow.
 c. Select the sentence in red font.
 d. Select the words with the largest font size on the screen.

##Note##
The bounding box around the text will have color, but they are not the visual attributes of the text. Don’t confuse them!

5. Compositional:

##Definition##
Combines two or more of the base categories to generate a referring expression that draws from multiple dimensions (e.g., semantic and positional).

##Examples##

a. Select the paragraph at the bottom of the page that discusses how to download the model. (Positional + Semantic)
b. Select the sentence in red font starting with “Warning:”. (Visual + Lexical)
c. Select the bold text immediately above the login form that mentions error messages. (Visual + Positional + Semantic)
d. Select the words on the top-right that say “Subscribe now”. (Positional + Lexical)

Every generated expression must be grounded in the bounding boxes provided in the screenshot. That is, the start and end positions of the referenced text should be inferable directly from the involved bounding boxes’ coordinates. For instance, if a bounding box encompasses an entire paragraph, referencing only a sentence within it is not valid unless the bounding box also precisely matches that sentence.

The referring expression should be clear about the granularity of the text, i.e., clearly specify if they are pargagraph(s), line(s), sentence(s), word(s) without using ambiguous words like 'text', 'part'.

You should output the involved id(s) of the bounding box, the corresponding generated referring expressions, and the category (or categories) it belongs to. Prioritize generating expression that belongs to a single category.
"""

In [1]:
from pydantic import BaseModel

# Structured-output schema used as `response_format` for every single-category prompt
# (it is the value of each per-category entry in `output_map`).
# NOTE(review): documented with `#` comments on purpose — a class docstring on a pydantic
# model presumably ends up in the generated JSON schema sent to the API; confirm before
# converting these comments into a docstring.
class Output(BaseModel):
    # Whether any referring expression could be produced; the per-category prompts ask the
    # model to return True/False for this.
    available: bool
    # The generated referring expressions; the prompts ask for an empty list when none are available.
    expressions: list[str]

# Per-category prompt: SEMANTIC expressions, i.e. the target span is described by its
# meaning, intent, or topic. Takes a plain screenshot (no bounding boxes) and asks for a
# True/False flag plus a list of expressions, which lines up with the `available` /
# `expressions` fields of `Output`.
# NOTE(review): the text contains typos ("an screenshot", "pargagraph(s)"); they are left
# untouched here because editing the string changes what is sent to the model.
system_prompt_semantic = """
You are given an screenshot input. Your task is to generate natural language referring expressions which specify different target text spans contained within the screenshot that human tend to use mouse drag action to select. Ignore the parts that are not text, that are not selectable by mouse and that are not the places where human tend to use mouse drag action to select in daily life.

For the referring expression you generated, they must describe the target text span based on its meaning, intent, or topical content.

For example:
a.Select the paragraph discussing how to download models.
b.Select the lines that infer the causes of failure.
c.Select the sentence about Kobe Bryant's career.
d.Select consecutive words referring to the weight of the MacBook Pro.

The referring expression should be clear about the granularity of the text, i.e., clearly specify if they are pargagraph(s), line(s), sentence(s), words without using ambiguous words like 'text', 'part'. The target text span can be single or multiple paragraphs, lines, sentences. For words, it should be at least multiple words as selecting a single word usually does not require a mouse drag action.

If no feasible or available referring expression meeting the requirements can be generated, you should return False and an empty list.
If it does, you should return True and the generated referring expressions.
"""

In [19]:

# Per-category prompt: POSITIONAL expressions, i.e. the target span is described by where
# it sits on the page, either absolutely or relative to other visual elements. Takes a plain
# screenshot and asks for a True/False flag plus a list of expressions, which lines up with
# the `available` / `expressions` fields of `Output`.
# NOTE(review): the text contains typos ("an screenshot", "pargagraph(s)"); they are left
# untouched here because editing the string changes what is sent to the model.
system_prompt_positional = """
You are given an screenshot input. Your task is to generate natural language referring expressions which specify different target text spans contained within the screenshot that human tend to use mouse drag action to select. Ignore the parts that are not text, that are not selectable by mouse and that are not the places where human tend to use mouse drag action to select in daily life.

For the referring expression you generated, they must refer to the location of the text—either in absolute terms (e.g., top, bottom of the page) or relative to other visual elements.

For example:
 a. Select the paragraph at the bottom of the page.
 b. Select the first three lines from the top.
 c. Select the sentence immediately below the chart title.
 d. Select the words on the left side of the login button.

The referring expression should be clear about the granularity of the text, i.e., clearly specify if they are pargagraph(s), line(s), sentence(s), words without using ambiguous words like 'text', 'part'. The target text span can be single or multiple paragraphs, lines, sentences. For words, it should be at least multiple words as selecting a single word usually does not require a mouse drag action.

If no feasible or available referring expression meeting the requirements can be generated, you should return False and an empty list.
If it does, you should return True and the generated referring expressions.
"""

In [18]:

# Per-category prompt: LEXICAL expressions, i.e. the target span is described by quoting
# its literal content (starting words, key phrases, or an exact match). Takes a plain
# screenshot and asks for a True/False flag plus a list of expressions, which lines up with
# the `available` / `expressions` fields of `Output`.
# NOTE(review): the text contains typos ("an screenshot", "pargagraph(s)"); they are left
# untouched here because editing the string changes what is sent to the model.
system_prompt_lexical = """
You are given an screenshot input. Your task is to generate natural language referring expressions which specify different target text spans contained within the screenshot that human tend to use mouse drag action to select. Ignore the parts that are not text, that are not selectable by mouse and that are not the places where human tend to use mouse drag action to select in daily life.

For the referring expression you generated, they must describe the text by referencing its literal or quoted content, including the starting words, key phrases, or exact match.

For example:
 a. Select the paragraph that begins with “To get started with Python…”.
 b. Select the lines ending with “before submission is due”.
 c. Select the sentence containing the phrase “AI is transforming industries”.
 d. Select the words that say “Monday, Tuesday, and so on”.

The referring expression should be clear about the granularity of the text, i.e., clearly specify if they are pargagraph(s), line(s), sentence(s), words without using ambiguous words like 'text', 'part'. The target text span can be single or multiple paragraphs, lines, sentences. For words, it should be at least multiple words as selecting a single word usually does not require a mouse drag action.

If no feasible or available referring expression meeting the requirements can be generated, you should return False and an empty list.
If it does, you should return True and the generated referring expressions.
"""

In [17]:

# Per-category prompt: VISUAL expressions, i.e. the target span is described by how the
# text looks (font color, size, emphasis, highlighting). Takes a plain screenshot and asks
# for a True/False flag plus a list of expressions, which lines up with the `available` /
# `expressions` fields of `Output`.
# NOTE(review): the text contains typos ("an screenshot", "pargagraph(s)"); they are left
# untouched here because editing the string changes what is sent to the model.
system_prompt_visual = """
You are given an screenshot input. Your task is to generate natural language referring expressions which specify different target text spans contained within the screenshot that human tend to use mouse drag action to select. Ignore the parts that are not text, that are not selectable by mouse and that are not the places where human tend to use mouse drag action to select in daily life.

For the referring expression you generated, they must refer to distinctive visual features of the text, such as font color, size, emphasis, or highlighting.

For example:
 a. Select the paragraph written in bold italics.
 b. Select the lines highlighted in yellow.
 c. Select the sentence in red font.
 d. Select the words with the largest font size on the screen.

The referring expression should be clear about the granularity of the text, i.e., clearly specify if they are pargagraph(s), line(s), sentence(s), words without using ambiguous words like 'text', 'part'. The target text span can be single or multiple paragraphs, lines, sentences. For words, it should be at least multiple words as selecting a single word usually does not require a mouse drag action.

If no feasible or available referring expression meeting the requirements can be generated, you should return False and an empty list.
If it does, you should return True and the generated referring expressions.
"""

In [21]:
# Category name -> system prompt sent to the model for that category.
system_prompt_map = dict(
    semantic=system_prompt_semantic,
    positional=system_prompt_positional,
    lexical=system_prompt_lexical,
    visual=system_prompt_visual,
)

# Category name -> structured-output model handed to the API as `response_format`.
# "all" carries no schema (None); every single-category prompt shares `Output`.
# Insertion order matches the prompt map: all, semantic, positional, lexical, visual.
output_map = {"all": None}
output_map.update(dict.fromkeys(system_prompt_map, Output))

In [22]:
from openai import OpenAI
import base64

# Module-level API client shared by `call_llm`.
# NOTE(review): constructed without arguments, so credentials are presumably picked up from
# the environment (e.g. OPENAI_API_KEY) — confirm for the deployment in use.
client = OpenAI()

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as a base64 text string."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


In [10]:
# chat completions API

def call_llm(model, system_prompt, input_text, image_path, **kwargs):
    """Send one image plus a text message to a chat model and return the raw response.

    Args:
        model: Model name forwarded to the API.
        system_prompt: Instruction text, sent as the ``developer`` message.
        input_text: Text part of the ``user`` message.
        image_path: Path of the image file; it is base64-encoded and embedded
            in the ``user`` message as a data URL.
        **kwargs: Forwarded unchanged to ``client.beta.chat.completions.parse``
            (e.g. ``response_format``).

    Returns:
        Whatever ``client.beta.chat.completions.parse`` returns.
    """
    # Local import so this cell does not depend on another cell's import list.
    import mimetypes

    # Label the data URL with the image's real media type instead of always
    # claiming JPEG (the notebook feeds PNG screenshots). Unknown or non-image
    # extensions fall back to the previously hard-coded "image/jpeg".
    guessed_type, _ = mimetypes.guess_type(image_path)
    if guessed_type and guessed_type.startswith("image/"):
        mime_type = guessed_type
    else:
        mime_type = "image/jpeg"
    image_url = f"data:{mime_type};base64,{encode_image(image_path)}"

    developer_message = {
        "role": "developer",
        "content": [{"type": "text", "text": system_prompt}],
    }
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": input_text},
            {"type": "image_url", "image_url": {"url": image_url}},
        ],
    }

    # `parse` (rather than `create`) is used so a `response_format` model passed
    # via kwargs is returned already parsed on the message.
    return client.beta.chat.completions.parse(
        model=model,
        messages=[developer_message, user_message],
        **kwargs,
    )

In [35]:
import os
import json

def generate_referring_expressions(
    image_path,
    model,
    category,
    save_dir = "/home/t-zeyiliao/OmniParser/referring_expressions",
):
    """Generate referring expressions for one screenshot and save them as JSON.

    The prompt and the response schema are looked up by ``category`` in
    ``system_prompt_map`` and ``output_map``. The result is written to
    ``<save_dir>/<image stem>_referring_expressions_model-<model>_category-<category>.json``
    as ``{"expressions": [...]}``; ``save_dir`` is created if missing.

    Args:
        image_path: Path of the screenshot to send to the model.
        model: Model name forwarded to ``call_llm``.
        category: Key into ``system_prompt_map`` / ``output_map``.
        save_dir: Directory that receives the JSON file.

    Returns:
        The list of generated expressions (the same list that was written to disk).

    Raises:
        KeyError: ``category`` has no entry in ``system_prompt_map``.
        ValueError: the response carries no parsed output (e.g. the model refused).
    """
    # Fail early with a readable message; "all" for instance has an output_map
    # entry but no prompt registered.
    if category not in system_prompt_map:
        raise KeyError(
            f"Unknown category {category!r}; expected one of {sorted(system_prompt_map)}"
        )

    res = call_llm(
        model=model,
        system_prompt=system_prompt_map[category],
        input_text="Here is the screenshot.",
        image_path=image_path,
        response_format=output_map[category],
    )

    message = res.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # No structured output came back; surface the refusal text (if any)
        # instead of crashing on `None.expressions`.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model {model!r} returned no parsed output for category {category!r}"
            + (f": {refusal}" if refusal else "")
        )

    expressions = parsed.expressions
    image_name = os.path.splitext(os.path.basename(image_path))[0]

    os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(
        save_dir,
        f'{image_name}_referring_expressions_model-{model}_category-{category}.json',
    )
    # Only the expressions are persisted; the `available` flag is not stored.
    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump({"expressions": expressions}, f, indent=4, ensure_ascii=False)

    return expressions


In [36]:
# Inputs for a single run: one screenshot, one expression category, one model.
image_path = "/home/t-zeyiliao/OmniParser/lzy_images/test.png"
category = "positional"

# Model to query. Alternatives kept for reference: "gpt-4o-mini", "gpt-4o".
model = "o4-mini-2025-04-16"

generate_referring_expressions(
    image_path=image_path,
    model=model,
    category=category,
)
