In [1]:
%pip install google-generativeai
%pip install pillow

import google.generativeai as genai

Collecting google-generativeai
  Downloading google_generativeai-0.8.3-py3-none-any.whl.metadata (3.9 kB)
Collecting google-ai-generativelanguage==0.6.10 (from google-generativeai)
  Downloading google_ai_generativelanguage-0.6.10-py3-none-any.whl.metadata (5.6 kB)
Collecting google-api-core (from google-generativeai)
  Downloading google_api_core-2.23.0-py3-none-any.whl.metadata (3.0 kB)
Collecting google-api-python-client (from google-generativeai)
  Downloading google_api_python_client-2.153.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting google-auth>=2.15.0 (from google-generativeai)
  Downloading google_auth-2.36.0-py2.py3-none-any.whl.metadata (4.7 kB)
Collecting protobuf (from google-generativeai)
  Downloading protobuf-5.28.3-cp38-abi3-macosx_10_9_universal2.whl.metadata (592 bytes)
Collecting pydantic (from google-generativeai)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Collecting tqdm (from google-generativeai)
  Downloading tqdm-4.67.0-py3-none-any.w

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Schema describing one application visible in a screenshot.
# Only the name, description and type are mandatory; `is_active` and
# `program_info` are optional because they are not listed in `required`.
programs = genai.protos.Schema(
    type=genai.protos.Type.OBJECT,
    required=['program_name', 'program_description', 'program_type'],
    properties=dict(
        program_name=genai.protos.Schema(type=genai.protos.Type.STRING),
        program_description=genai.protos.Schema(
            description="Description of the program",
            type=genai.protos.Type.STRING,
        ),
        is_active=genai.protos.Schema(
            description="Whether the program is currently active",
            type=genai.protos.Type.BOOLEAN,
        ),
        # Restricted to a fixed set of categories via `enum`.
        program_type=genai.protos.Schema(
            description="Type of the program",
            type=genai.protos.Type.STRING,
            enum=["browser", "email", "chat", "calendar", "other"],
        ),
        # Free-form string that the description asks the model to fill with
        # JSON whose keys depend on `program_type`.
        program_info=genai.protos.Schema(
            description=(
                "valid json information about the program based on the program type, "
                "e.g email client with email showing should have subject, content, sender, etc. "
                "if it is a browser it will be the url, page title, etc. "
                "if it is a calendar it will be the date, time, highlighted, conflict"
            ),
            type=genai.protos.Type.STRING,
        ),
    ),
)

In [4]:
# Top-level schema for the structured description of a screenshot: a scene
# summary, the list of visible programs (each shaped like `programs`), and the
# current mouse action. All three fields are required.
screenshot_data_schema = genai.protos.Schema(
    type=genai.protos.Type.OBJECT,
    required=['general_scene_description', 'programs_in_scene', 'mouse_action'],
    properties=dict(
        general_scene_description=genai.protos.Schema(
            description="Description of screenshot and user activity in the scene",
            type=genai.protos.Type.STRING,
        ),
        # Array whose items reuse the per-program schema defined earlier.
        programs_in_scene=genai.protos.Schema(
            description="An array of all programs running in the screenshot it may only be one",
            type=genai.protos.Type.ARRAY,
            items=programs,
        ),
        mouse_action=genai.protos.Schema(
            description="what the user is doing with the mouse, hovering a button, clicking a button, etc.",
            type=genai.protos.Type.STRING,
        ),
    ),
)

In [5]:
# Function declaration handed to the model as a tool; its parameters follow
# `screenshot_data_schema`, so a function call carries the structured
# screenshot description as its arguments.
screenshot_tools = genai.protos.FunctionDeclaration(
    parameters=screenshot_data_schema,
    description="Get data about the screenshot and what the user is doing with the mouse",
    name="screenshot_data",
)


In [6]:
# Load the Gemini API key at runtime instead of embedding it in the notebook,
# so the secret never ends up in version control or shared copies.
# NOTE: the key that was previously hardcoded in this cell should be treated as
# compromised and revoked/rotated.
import getpass
import os

# Prefer the GOOGLE_API_KEY environment variable; fall back to an interactive
# prompt (input is not echoed) when it is unset or empty.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass.getpass("Google API key: ")
if not api_key:
    raise RuntimeError(
        "No API key provided: set the GOOGLE_API_KEY environment variable "
        "or enter the key when prompted."
    )
genai.configure(api_key=api_key)


In [7]:
# Model client with the screenshot function declaration registered as its
# only tool.
model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    tools=[screenshot_tools],
)


In [61]:
import os
from PIL import Image


def _screenshot_index(filename):
    """Return the integer ordering key encoded in a screenshot file name.

    The key is the text before the first dot with every lowercase 's'
    removed (so a name like 's12.png' yields 12). Returns None when that
    text is not an integer, e.g. for '.DS_Store' or other stray files.
    """
    try:
        return int(filename.replace('s', '').split('.')[0])
    except ValueError:
        return None


# Screenshots are read from a "test_images" folder under the kernel's current
# working directory (os.getcwd()), which is not necessarily where the notebook
# file itself lives.
current_dir = os.getcwd()
image_dir = os.path.join(current_dir, "test_images")

# Keep only entries whose name encodes an index, then order them numerically.
# Without the filter a single unrelated file in the folder makes the sort key
# raise ValueError before any image is processed.
files = sorted(
    (name for name in os.listdir(image_dir) if _screenshot_index(name) is not None),
    key=_screenshot_index,
)

# One model response per screenshot, in index order.
results = []
for file in files:
    # The context manager closes the underlying file handle once the request
    # has been made, so handles do not accumulate across the loop.
    with Image.open(os.path.join(image_dir, file)) as image:
        result = model.generate_content(
            ["describe the screenshot and what the user is doing with the mouse", image]
        )
    results.append(result)

In [64]:
# Extract the actions
actions = "# Action History\n\n"
for i, result in enumerate(results):
    for candidate in result.candidates:
        part = candidate.content.parts[0]
        actions += f"## Action {i+1}\n{str(part.function_call)}\n"


In [66]:
# Show the assembled action history as this cell's output (rendered as the
# string's repr, so newlines appear as "\n").
actions

'# Action History\n\n## Action 1\nname: "screenshot_data"\nargs {\n  fields {\n    key: "programs_in_scene"\n    value {\n      list_value {\n        values {\n          struct_value {\n            fields {\n              key: "program_type"\n              value {\n                string_value: "email"\n              }\n            }\n            fields {\n              key: "program_name"\n              value {\n                string_value: "Email Client"\n              }\n            }\n            fields {\n              key: "program_info"\n              value {\n                string_value: "{\\"status\\": \\"Syncing\\"}"\n              }\n            }\n            fields {\n              key: "program_description"\n              value {\n                string_value: "An email client application."\n              }\n            }\n            fields {\n              key: "is_active"\n              value {\n                bool_value: true\n              }\n            }\n      