In [None]:
%pip install google-generativeai
%pip install pillow

import google.generativeai as genai

In [7]:
# Schema describing a single program visible in a screenshot.
# Short aliases keep the field definitions below readable.
_Schema = genai.protos.Schema
_Type = genai.protos.Type

# Allowed values for the 'program_type' field.
_program_type_choices = ["browser", "email", "chat", "calendar", "other"]

_program_fields = {
    'program_name': _Schema(type=_Type.STRING),
    'program_description': _Schema(
        description="Description of the program",
        type=_Type.STRING,
    ),
    'is_active': _Schema(
        description="Whether the program is currently active",
        type=_Type.BOOLEAN,
    ),
    'program_type': _Schema(
        description="Type of the program",
        type=_Type.STRING,
        enum=_program_type_choices,
    ),
    # Free-form details are carried as a JSON-encoded string rather than a nested object.
    'program_info': _Schema(
        description="valid json information about the program based on the program type, e.g email client with email showing should have subject, content, sender, etc. if it is a browser it will be the url, page title, etc. if it is a calendar it will be the date, time, highlighted, conflict",
        type=_Type.STRING,
    ),
}

# 'is_active' and 'program_info' are optional; the other three fields are required.
programs = _Schema(
    type=_Type.OBJECT,
    properties=_program_fields,
    required=['program_name', 'program_description', 'program_type'],
)

In [8]:
# Top-level schema for the structured description of one screenshot.
# Each field is built separately, then assembled into the object schema.
_scene_description_field = genai.protos.Schema(
    description="Description of screenshot and user activity in the scene",
    type=genai.protos.Type.STRING,
)

# Array whose items follow the per-program schema held in `programs`.
_programs_in_scene_field = genai.protos.Schema(
    description="An array of all programs running in the screenshot it may only be one",
    type=genai.protos.Type.ARRAY,
    items=programs,
)

_mouse_action_field = genai.protos.Schema(
    description="what the user is doing with the mouse, hovering a button, clicking a button, etc.",
    type=genai.protos.Type.STRING,
)

# All three fields are required.
screenshot_data_schema = genai.protos.Schema(
    type=genai.protos.Type.OBJECT,
    properties={
        'general_scene_description': _scene_description_field,
        'programs_in_scene': _programs_in_scene_field,
        'mouse_action': _mouse_action_field,
    },
    required=['general_scene_description', 'programs_in_scene', 'mouse_action'],
)

In [9]:
# Function declaration offered to the model as a tool; its parameters are
# the screenshot schema, so a function call carries the structured description.
_tool_name = "screenshot_data"
_tool_description = "Get data about the screenshot and what the user is doing with the mouse"

screenshot_tools = genai.protos.FunctionDeclaration(
    parameters=screenshot_data_schema,
    description=_tool_description,
    name=_tool_name,
)


In [10]:
# Load the Gemini API key at runtime instead of embedding it in the notebook.
# A key that has been saved in source should be treated as compromised and revoked.
import os
from getpass import getpass

# Prefer the GOOGLE_API_KEY environment variable; fall back to an interactive
# prompt so the secret never ends up in a saved cell.
api_key = os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
if not api_key:
    raise RuntimeError(
        "No API key provided: set the GOOGLE_API_KEY environment variable "
        "or enter a key when prompted."
    )

genai.configure(api_key=api_key)


In [11]:
# Gemini model instance with the screenshot function declaration attached as a tool.
_model_tools = [screenshot_tools]

model = genai.GenerativeModel(
    model_name="gemini-1.5-pro-latest",
    tools=_model_tools,
)


In [13]:
import os
from PIL import Image


def _screenshot_index(filename):
    """Return the integer index encoded in a screenshot filename, or None.

    Removes every 's' character and parses whatever precedes the first '.'
    as an int (e.g. 's12.png' -> 12). Names that do not parse this way
    (such as '.DS_Store') yield None so the caller can skip them instead
    of crashing the sort.
    """
    try:
        return int(filename.replace('s', '').split('.')[0])
    except ValueError:
        return None


# Screenshots are looked up relative to the notebook's working directory.
current_dir = os.getcwd()
image_dir = os.path.join(current_dir, "test_images")

# Keep only regular files whose names carry a numeric index, ordered by that
# index so that e.g. s2.png comes before s10.png.
files = sorted(
    (
        name
        for name in os.listdir(image_dir)
        if _screenshot_index(name) is not None
        and os.path.isfile(os.path.join(image_dir, name))
    ),
    key=_screenshot_index,
)

for file in files:
    # The context manager closes the image's file handle once the request is
    # done, so handles do not accumulate across the loop.
    with Image.open(os.path.join(image_dir, file)) as image:
        result = model.generate_content(
            ["describe the screenshot and what the user is doing with the mouse", image]
        )
    print("file:", file)
    print(result)


file: s0.png
response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "function_call": {
                  "name": "screenshot_data",
                  "args": {
                    "programs_in_scene": [
                      {
                        "is_active": true,
                        "program_info": "{\"status\": \"Syncing\"}",
                        "program_description": "Email client application",
                        "program_type": "email",
                        "program_name": "Email Client"
                      }
                    ],
                    "general_scene_description": "The screenshot shows an email client. The inbox is empty and displays a congratulatory message: \"What a productive day! You\\'ve accomplished a lot\". The client is currently syncing.",
                    "mouse_act