# imports

In [1]:
%pip install google-generativeai
%pip install pillow
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import google.generativeai as genai
from dotenv import load_dotenv
import os
from PIL import Image
import json

# Load environment variables via python-dotenv (presumably from a local .env
# file, so the API key does not have to be hardcoded in the notebook).
load_dotenv() 
# Indexing os.environ raises KeyError right here if GOOGLE_API_KEY is not set,
# instead of failing later on the first API call.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Schema for a single program visible in a screenshot. It is used as the item
# type of an array field in the screenshot schema. Only the name, description
# and type are mandatory; `is_active` and `program_info` are optional.
programs = genai.protos.Schema(
    type=genai.protos.Type.OBJECT,
    required=["program_name", "program_description", "program_type"],
    properties=dict(
        program_name=genai.protos.Schema(type=genai.protos.Type.STRING),
        program_description=genai.protos.Schema(
            description="Description of the program",
            type=genai.protos.Type.STRING,
        ),
        is_active=genai.protos.Schema(
            description="Whether the program is currently active",
            type=genai.protos.Type.BOOLEAN,
        ),
        # Closed set of program categories the model may choose from.
        program_type=genai.protos.Schema(
            description="Type of the program",
            type=genai.protos.Type.STRING,
            enum=["browser", "email", "chat", "calendar"],
        ),
        # Free-form details, requested as a JSON string whose content depends
        # on the program type.
        program_info=genai.protos.Schema(
            description="valid json information about the program based on the program type, e.g email client with email showing should have subject, body text, content, sender, etc. if it is a browser it will be the url, page title, etc. if it is a calendar it will be the date, time, highlighted, conflict",
            type=genai.protos.Type.STRING,
        ),
    ),
)

In [4]:
# Top-level schema of the structured answer requested for each screenshot:
# an overall scene description, the list of visible programs, and what the
# mouse is doing. All three fields are required.
screenshot_data_schema = genai.protos.Schema(
    type=genai.protos.Type.OBJECT,
    required=["general_scene_description", "programs_in_scene", "mouse_action"],
    properties=dict(
        general_scene_description=genai.protos.Schema(
            description="Description of screenshot and user activity in the scene. Including text in Email body",
            type=genai.protos.Type.STRING,
        ),
        # Each array element follows the `programs` schema defined earlier.
        programs_in_scene=genai.protos.Schema(
            description="An array of all programs running in the screenshot it may only be one",
            type=genai.protos.Type.ARRAY,
            items=programs,
        ),
        mouse_action=genai.protos.Schema(
            description="what the user is doing with the mouse, hovering a button, clicking a button, etc.",
            type=genai.protos.Type.STRING,
        ),
    ),
)

In [5]:
# Function declaration exposed to the model as a tool; its parameters are the
# screenshot schema, so a function call carries the structured description.
screenshot_tools = genai.protos.FunctionDeclaration(
    parameters=screenshot_data_schema,
    description="Get data about the screenshot and what the user is doing with the mouse",
    name="screenshot_data",
)

In [6]:
# Model used for the screenshot pass; the screenshot tool is registered so the
# model can answer with a `screenshot_data` function call.
model = genai.GenerativeModel("gemini-1.5-pro-latest", tools=[screenshot_tools])

# Images to text

In [10]:

current_dir = os.getcwd()
image_dir = os.path.join(current_dir, "final_images")


def _frame_number(filename):
    """Return the integer N of a name shaped like 'output_N.<ext>', or None.

    None marks entries that do not follow the naming scheme (for example
    hidden files such as '.DS_Store'), so they can be skipped instead of
    crashing the sort with a ValueError.
    """
    try:
        return int(filename.replace('output_', '').split('.')[0])
    except ValueError:
        return None


# Process screenshots in numeric (chronological) order, ignoring anything in
# the folder that is not a numbered frame.
files = sorted(
    (name for name in os.listdir(image_dir) if _frame_number(name) is not None),
    key=_frame_number,
)

results = []
for file in files:  # slice `files` here to re-run a subset while debugging
    print(file)
    # The context manager closes the image file handle once the request has
    # been sent; previously every handle stayed open for the kernel lifetime.
    with Image.open(os.path.join(image_dir, file)) as image:
        result = model.generate_content(
            ["describe the screenshot and what the user is doing with the mouse", image]
        )
    results.append(result)

output_01.jpg
output_02.jpg
output_03.jpg
output_04.jpg
output_05.jpg
output_06.jpg
output_07.jpg
output_08.jpg
output_09.jpg
output_10.jpg
output_11.jpg
output_12.jpg
output_13.jpg
output_14.jpg


In [11]:
def get_text_from_result(result):
    """Flatten the function-call arguments of a model response into a dict.

    Reads the arguments of the function call found in the first part of the
    first candidate and keeps:

    * every value exposing a ``text_value`` attribute (that attribute is kept),
    * every plain string value,
    * for ``programs_in_scene``: the ``program_name``, ``program_description``
      and ``program_type`` of the FIRST program only, each defaulting to
      ``'Unknown'`` when missing.

    Values of any other kind are dropped.

    Args:
        result: Response object exposing
            ``candidates[0].content.parts[0].function_call.args`` as a mapping.

    Returns:
        dict: Flat mapping of field name to string value.
    """
    # Convert MapComposite to regular dictionary
    data_dict = dict(result.candidates[0].content.parts[0].function_call.args)

    program_keys = ('program_name', 'program_description', 'program_type')
    text_data = {}
    for key, value in data_dict.items():
        if hasattr(value, 'text_value'):
            text_data[key] = value.text_value
        elif isinstance(value, str):
            text_data[key] = value
        elif key == 'programs_in_scene':
            # An empty program list used to raise IndexError on value[0];
            # fall back to 'Unknown' for every program field instead.
            first_program = dict(value[0]) if len(value) > 0 else {}
            for k in program_keys:
                text_data[k] = first_program.get(k, 'Unknown')

    return text_data

# Wrap each flattened model response under an "action" key, one record per
# screenshot, in the same order as `results`.
json_data_from_images = [
    {"action": get_text_from_result(response)}
    for response in results
]


In [12]:
# Create the output folder once, up front: the call is loop-invariant, and the
# folder must exist even when there are no records so that the later
# os.listdir('json_data_from_images') does not fail.
os.makedirs('json_data_from_images', exist_ok=True)

# Write one zero-padded, 1-based JSON file per screenshot record
# (01.json, 02.json, ...), echoing each record as it is saved.
for record_number, record in enumerate(json_data_from_images, start=1):
    print(f"{record_number}: {record}")
    with open(f'json_data_from_images/{record_number:02d}.json', 'w') as f:
        json.dump(record, f)

1: {'action': {'mouse_action': 'The mouse cursor is hovering over the "+" button to create a new message.', 'general_scene_description': 'The screenshot shows an empty inbox in the Outlook email client. The message "All done for the day. Enjoy your empty inbox." is displayed.', 'program_name': 'Outlook', 'program_description': 'The Outlook email client is open in a browser window.', 'program_type': 'email'}}
2: {'action': {'program_name': 'Outlook', 'program_description': 'The Outlook web application is open in the browser window.', 'program_type': 'email', 'general_scene_description': 'The screenshot shows an empty inbox in the Outlook web application. The message "All done for the day\\nEnjoy your empty inbox." is displayed.', 'mouse_action': 'The mouse cursor is hovering over the minimize button of the browser window.'}}
3: {'action': {'mouse_action': 'The mouse cursor is hovering over the email subject.', 'program_name': 'Outlook', 'program_description': 'The user is viewing their 

# Knowledge base: generate the "what" and "why" database

In [13]:
# Text-only model instance (no tools registered), used for the reasoning steps
# that follow.
llm = genai.GenerativeModel("gemini-1.5-pro-latest")

In [14]:
# Read all numbered JSON files back from json_data_from_images. Only *.json
# entries are counted so stray files (e.g. .DS_Store, checkpoints) do not
# inflate the count and trigger a FileNotFoundError on a non-existent number.
file_count_in_folder = len(
    [name for name in os.listdir('json_data_from_images') if name.endswith('.json')]
)
json_data_from_images = []
for file_number in range(1, file_count_in_folder + 1):
    # `with` closes each file; json.load(open(...)) left every handle open.
    with open(f'json_data_from_images/{file_number:02d}.json') as f:
        json_data_from_images.append(json.load(f))

# For each pair of consecutive screenshots, ask the model what the user did
# and why, and keep the answer text with any Markdown code fence removed.
knowledge_base = []
# NOTE(review): range(1, n-2) skips the first pair (records 0 and 1) and the
# last pair (records n-2 and n-1). Bounds are kept unchanged here; if every
# consecutive pair is wanted this should be range(len(...) - 1). Confirm intent.
for i in range(1, len(json_data_from_images) - 2):
    data = json_data_from_images[i:i+2]
    response = llm.generate_content([f"Describe in details what the user is doing and why the user took the last action. Return the results in json format whith two keys: 'what' and 'why'. Here is the situation data: {data}"])
    knowledge_base.append(response.candidates[0].content.parts[0].text.replace("```json","").replace("```",""))
    print(f"{i}: data {data}: {knowledge_base[-1]}")

1: data [{'action': {'program_name': 'Outlook', 'program_description': 'The Outlook web application is open in the browser window.', 'program_type': 'email', 'general_scene_description': 'The screenshot shows an empty inbox in the Outlook web application. The message "All done for the day\\nEnjoy your empty inbox." is displayed.', 'mouse_action': 'The mouse cursor is hovering over the minimize button of the browser window.'}}, {'action': {'mouse_action': 'The mouse cursor is hovering over the email subject.', 'program_name': 'Outlook', 'program_description': 'The user is viewing their Outlook email inbox in a browser.', 'program_type': 'email', 'general_scene_description': 'The screenshot shows an email inbox in Outlook. The user is viewing an email with the subject "Box size change request" from "isseki". The email body starts with "Hi, The boxes you use for the ship...".'}}]: 
{
  "what": "The user was viewing an email with the subject \"Box size change request\" from \"isseki\" in t

In [15]:
# Persist the collected what/why answers as a single JSON array on disk.
os.makedirs('knowledge_base', exist_ok=True)
with open('knowledge_base/knowledge_base.json', 'w') as kb_file:
    json.dump(knowledge_base, kb_file)


# Ask Joshu

In [18]:
email="""
email from isseki: I’m checking that everything is OK for the next shipment.
 
I hope there won’t be any problems with box sizes.
"""

# Instruction preamble: embeds the whole knowledge base (the Python list is
# interpolated via its string representation) as the record of past events.
system_prompt = f"""Your objective as an assistant is to utilize the lessons learnt and information provided in the past.
The following is the information about what happened in the past. {knowledge_base}. """

# Full prompt = preamble + incoming email + the actual question.
user_prompt = f"""

{system_prompt}

{email}
what box size shoud I use for the next shipment?"""

response = llm.generate_content([user_prompt])
print(response.candidates[0].content.parts[0].text)

The user will likely reply to Isseki's email about box sizes.  Based on the past interactions, the user previously agreed to use smaller boxes due to handling difficulties.  Therefore, the user will likely reply confirming the use of the smaller box size for the next shipment.  They might also reiterate the reason for the change (easier handling) to ensure Isseki understands and remembers the agreement.

