In [1]:
import os
import subprocess

# Enable IPython's autoreload extension in mode 2 so that edits to imported
# .py modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Set the project's root directory as the notebook's working directory.
# Alternative kept for reference (asks git for the repository root instead):
# git_root = subprocess.run(['git', 'rev-parse', '--show-toplevel'], capture_output=True, text=True).stdout.strip()
# Step up one level only when the current directory path ends with "notebooks";
# presumably that folder sits directly below the project root -- TODO confirm.
if os.getcwd().endswith("notebooks"): os.chdir("..")
# Bare last expression: shows the resulting working directory as the cell output.
os.getcwd()


'/Users/nilsgandlau/code/browser-automation'

In [1]:
from typing import Any
import requests

def get_openai_response(
    api_key: str,
    prompt: str,
    image: Any, # base64 encoded image data
    max_tokens: int = 300
) -> dict:
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    payload = {
        "model": "gpt-4o",
        "messages": [
            {
            "role": "user",
            "content": [
                {
                "type": "text",
                "text": prompt,
                },
                {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{image}"
                }
                }
            ]
            }
        ],
        "max_tokens": max_tokens
    }
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    return response.json()

In [7]:
from pathlib import Path
from src.utils import encode_image
import requests


def create_user_message(prompt: str | None, image_paths: list[Path] | None) -> dict:
    content = []
    if prompt: content += [{"type": "text", "text": prompt}]
    if image_paths: content += [{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encode_image(img_path)}"}} for img_path in image_paths]
    return {"role": "user", "content": content}

def create_payload(user_message: dict, max_tokens: int = 300) -> dict:
    """Wrap a single user message in a chat completions request body.

    Args:
        user_message: The message dict to send; it becomes the only entry of
            the ``messages`` list.
        max_tokens: Upper bound on the number of tokens in the reply.

    Returns:
        A dict with the keys ``model`` (always ``"gpt-4o"``), ``messages`` and
        ``max_tokens``.
    """
    payload = dict(model="gpt-4o", messages=[user_message], max_tokens=max_tokens)
    return payload

def get_openai_response(
    api_key: str,
    payload: dict,
) -> dict:
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
    return response.json()

def get_response_text(openai_response: dict) -> str:
    """Return the message content of the first choice in an API response.

    Args:
        openai_response: Response dict; must contain
            ``["choices"][0]["message"]["content"]``.

    Returns:
        The value stored under ``content`` of the first choice's message.

    Raises:
        KeyError / IndexError: If the expected structure is missing.
    """
    first_choice = openai_response["choices"][0]
    message = first_choice["message"]
    return message["content"]

In [15]:
import os
from src.utils import encode_image

# Read the API key from the environment; raises KeyError when OPENAI_API_KEY is unset.
api_key = os.environ["OPENAI_API_KEY"]
prompt = "describe the images. how do they differ?"
# Relative paths: resolved against the current working directory.
image_paths = [Path("screenshots/02.jpg"), Path("screenshots/fullpage.jpg")]

# Build the request from the prompt and both images, send it, and print the reply text.
user_message = create_user_message(prompt=prompt, image_paths=image_paths)
# No max_tokens argument is passed, so create_payload's default applies.
payload = create_payload(user_message)
response = get_openai_response(api_key, payload)
response_text = get_response_text(response)
print(response_text)

# TODO: can I pass two images to OpenAI?
# TODO: how small can an image be?

The two images provided are screenshots from the same website for Sportclub SAFO Frankfurt, but they showcase different parts of the site. Here are the key differences between the two images:

1. **Content Focus:**
   - **First Image:** The screenshot appears to be the top section of the home page featuring an introduction to Sportclub SAFO Frankfurt. It includes the club name, logo, a welcome message, and a photograph of the sports facilities.
   - **Second Image:** This screenshot is more comprehensive, displaying a detailed view of the website's homepage or another content-rich page. It includes sections on various topics such as tennis summer training, athletic training, the SAFO Gym, ball school, and several other club-related details like membership, booking, and payment information.

2. **Layout:**
   - **First Image:** The layout is more simplistic with a large central space dedicated to a welcome message and a single image showing the club’s tennis courts.
   - **Second Image:

In [1]:
def parse_actor_response(text: str) -> tuple[str, str]:
    """Extract the single action requested in an actor model reply.

    The reply is searched for calls of the form ``NAME("argument")`` where
    NAME is one of ANSWER, CLICK, INPUT, SCROLL or ANALYZE_TABLE. The argument
    may be any non-empty text that contains no double quote, so free-form
    sentences with spaces, digits and punctuation are accepted.

    Args:
        text: Raw reply text, e.g. ``'Thought: ...\\nAction: CLICK("AB")'``.

    Returns:
        ``(action_name, argument)``. When different action kinds occur in the
        same reply, the first present in the order ANSWER, SCROLL, CLICK,
        INPUT, ANALYZE_TABLE is returned.

    Raises:
        ValueError: If one action kind occurs more than once, or if no action
            is found at all.
    """
    import re  # local import: no cell of this notebook imports `re`

    # Order in which duplicate actions are reported.
    check_order = ("ANSWER", "CLICK", "INPUT", "SCROLL", "ANALYZE_TABLE")
    # Precedence used to pick the result when several kinds are present.
    return_priority = ("ANSWER", "SCROLL", "CLICK", "INPUT", "ANALYZE_TABLE")

    # One pattern per action; the capture group is the quoted argument.
    found = {
        name: re.findall(rf'{name}\("([^"]+)"\)', text)
        for name in check_order
    }

    for name in check_order:
        if len(found[name]) > 1:
            raise ValueError(f"Found multiple {name} actions in response!")

    for name in return_priority:
        if found[name]:
            return (name, found[name][0])

    raise ValueError("No Action found in response!")

# Sample actor reply in "Thought: ... / Action: ..." form, containing one
# ANALYZE_TABLE(...) action. Note: this rebinds `response`, which an earlier
# cell uses for the dict returned by get_openai_response.
response = """
Thought: To determine which outside tennis courts are free for 1 hour between 17:00 and 19:00 today, I need to analyze the schedule table for the specified time range and identify the available courts.

Action: ANALYZE_TABLE("Identify which courts (Platz 1 to Platz 17) are marked as 'B' between 17:00 and 19:00 on Friday, 14.06.2024.")
"""

