In [1]:
import os
import re

import numpy as np
import pandas as pd

# from urllib.parse import urlparse, urljoin
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

from langchain_openai import OpenAIEmbeddings
from langchain_openai.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage

# Shared clients for the rest of the notebook: `embedding_model` is passed to the
# semantic search, `llm` to the chat-model helpers.
# NOTE(review): no API key is set here; presumably it is read from the environment - confirm.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-large")
llm = ChatOpenAI(model="gpt-4-turbo")

creating semantic search

In [2]:
def search(query, df, column, model):
    """Rank the rows of ``df`` by embedding similarity between ``df[column]`` and ``query``.

    Similarity is the dot product between each row's embedding and the query
    embedding.

    Args:
        query: Text to search for.
        df: DataFrame to search. It is modified in place: the column
            ``"embeddings: <column>"`` is added the first time ``column`` is
            searched (and reused afterwards), and ``"similarity"`` is
            overwritten on every call.
        column: Name of the text column to compare against the query.
        model: Embedding client exposing ``embed_query(text)`` and
            ``embed_documents(list_of_texts)``.

    Returns:
        A new DataFrame with the rows of ``df`` sorted by ``"similarity"``,
        highest first.
    """
    embedding_column = f"embeddings: {column}"

    # Nothing to rank: skip both embedding requests but still provide the
    # columns that callers expect on the result.
    if len(df) == 0:
        if embedding_column not in df.columns:
            df[embedding_column] = []
        df["similarity"] = np.empty(0, dtype=float)
        return df.sort_values("similarity", ascending=False)

    query_embedding = np.asarray(model.embed_query(query), dtype=float)

    # Document embeddings are computed once per frame and cached on the frame.
    if embedding_column not in df.columns:
        df[embedding_column] = model.embed_documents(list(df[column]))

    # One matrix-vector product instead of a Python-level dot product per row.
    document_matrix = np.asarray(df[embedding_column].tolist(), dtype=float)
    df["similarity"] = document_matrix @ query_embedding

    return df.sort_values("similarity", ascending=False)

getting and filtering elements to feed to the LLM

In [3]:
def get_relevant_elements(done_already, phrase, embedding_model, driver):
    """Collect the page elements most relevant to ``phrase``.

    Parses the driver's current page source, keeps visible links, buttons and
    inputs that have usable text, prefixes each with the heading above it,
    ranks them against ``phrase`` and keeps the 30 best before filtering.

    Two CSV snapshots are written for debugging: ``output/all_elements.csv``
    and ``output/relevant_elements.csv``.

    Args:
        done_already: Collection of ``"tag[text]"`` strings for elements that
            were already interacted with; matching rows are removed.
        phrase: Text describing the current subtask, used as the search query.
        embedding_model: Embedding client forwarded to ``search``.
        driver: Selenium WebDriver positioned on the page to inspect.

    Returns:
        DataFrame of candidate elements, including the ``element``, ``xpath``,
        ``text``, ``tag`` and ``tagandtext`` columns.
    """
    # to_csv does not create missing directories.
    os.makedirs("output", exist_ok=True)

    tree = etree.ElementTree(etree.HTML(driver.page_source))

    elements = pd.DataFrame()
    elements["element"] = tree.xpath("//a | //button | //h1 | //h2 | //input")
    elements["xpath"] = elements["element"].apply(tree.getpath)

    # Cast to bool so the mask is a real boolean indexer even for an empty
    # frame, and copy after filtering so later column assignments write to an
    # independent frame rather than to a slice.
    visible = elements["xpath"].apply(is_visible, args=(driver,)).astype(bool)
    elements = elements[visible].copy()

    elements["text"] = elements["element"].apply(get_element_text, args=(tree,))
    elements = elements[elements["text"] != ""].copy()
    elements["tag"] = elements["element"].apply(lambda x: x.tag)

    elements = apply_headings(elements)

    elements.to_csv("output/all_elements.csv", index=False)

    elements = search(phrase, elements, "text", embedding_model).head(30).copy()
    elements["tagandtext"] = [
        f"{tag}[{text}]" for tag, text in zip(elements["tag"], elements["text"])
    ]
    # removing elements that have already been interacted with
    elements = elements[~elements["tagandtext"].isin(done_already)]
    # temporarily removing footers which are often irrelevant
    in_footer = elements["text"].str.contains("footer", case=False, na=False)
    elements = elements[~in_footer.astype(bool)]
    elements = elements.drop_duplicates(subset=["tagandtext"])
    elements["tagandtext"].to_csv("output/relevant_elements.csv", index=False)

    return elements


def apply_headings(elements):
    """Prefix every non-heading row's text with the nearest preceding heading.

    Rows are visited in order. An ``h1``/``h2`` row with non-empty text becomes
    the active heading; every later non-heading row gets its text rewritten to
    ``"<heading> / <text>"``. All heading rows are then removed.

    The frame is modified in place and the same object is returned.
    """
    heading_tags = ("h1", "h2")
    active_heading = None
    heading_rows = []

    for label, entry in elements.iterrows():
        if entry["tag"] in heading_tags:
            heading_rows.append(label)
            # An empty heading is dropped but does not reset the active one.
            if entry["text"] != "":
                active_heading = entry["text"]
            continue

        if active_heading is None:
            continue

        prefix = active_heading + " / "
        elements.at[label, "text"] = prefix + elements.at[label, "text"]

    elements.drop(heading_rows, inplace=True)
    return elements


# TODO: sometimes, the best text is in a child element, e.g. a heading inside a link (see https://jobs.ashbyhq.com/The%20Browser%20Company)
def get_element_text(element, tree):
    """Return the best short description of ``element``, or ``""`` if none is found.

    Sources are tried in order and the first non-blank one wins:

    1. the element's own text;
    2. the ``aria-label``, ``placeholder``, ``title`` and ``href`` attributes;
    3. for ``input`` elements with an ``id``, the ``form-selector-title`` span
       inside the first ``<label for=id>`` found in ``tree``.

    Args:
        element: Parsed element exposing ``text``, ``attrib`` and ``tag``.
        tree: Parsed document used to look up labels via ``xpath``.

    Returns:
        The stripped text, or an empty string when the element has no usable text.
    """
    own_text = (element.text or "").strip()
    if own_text:
        return own_text

    # A present-but-blank attribute (e.g. title="") must not hide a useful
    # attribute further down the list, so blank values are skipped.
    for attr in ("aria-label", "placeholder", "title", "href"):
        value = element.attrib.get(attr, "").strip()
        if value:
            return value

    if element.tag == "input":
        input_id = element.attrib.get("id", "")
        if input_id:
            # The id is bound as an XPath variable instead of being pasted into
            # the expression, so ids containing quotes cannot break the query.
            labels = tree.xpath("//label[@for=$input_id]", input_id=input_id)
            if labels:
                # Assumption: There's only one label for each input if found
                titles = labels[0].xpath(
                    ".//span[@class='form-selector-title']/text()"
                )
                if titles:
                    return titles[0].strip()

    return ""


def is_visible(xpath, driver):
    """Report whether the element at ``xpath`` should be treated as visible.

    An element counts as visible when the driver says it is displayed, or when
    it is an ``h1``/``h2`` with non-blank text. Any error while locating or
    inspecting the element yields ``False``.

    TODO: fix - some headings return false for is_displayed, but are
    displayed; the heading exception below is a workaround.
    """
    try:
        candidate = driver.find_element(By.XPATH, xpath)
        displayed = candidate.is_displayed()
        if displayed:
            return displayed
        if candidate.tag_name not in ["h1", "h2"]:
            return False
        return candidate.text.strip() != ""
    except Exception:
        # element not found (or no longer attached): treat as not visible
        return False

creating message to feed to the LLM

In [4]:
def create_messages(goal, done_already, next_task, current_url, relevant_elements):
    """Build the chat messages that ask the model which element to interact with.

    The user prompt is also written to ``output/prompt.txt`` for inspection.

    Args:
        goal: Overall task the user wants to accomplish.
        done_already: List of ``"tag[text]"`` strings already interacted with.
        next_task: Subtask the chosen element must complete.
        current_url: URL of the page being inspected.
        relevant_elements: DataFrame with a ``"tagandtext"`` column listing the
            candidate elements.

    Returns:
        Tuple ``(messages, token_estimate)`` where ``messages`` is the
        ``[SystemMessage, HumanMessage]`` list and ``token_estimate`` is a rough
        size estimate of both messages (character count divided by four).
    """
    system_instructions = """You are an AI that navigates webpages to complete tasks.

As input, you will receive information about the task and a list of elements from the webpage. Many of the elements' text will have a slash (/). The text before the slash is the heading that the element is below; it is useful for context but cannot be used to determine which element to click. The the part after the slash is the text on the element itself, which is indicative of what the element does. Pay attention to whether the element does an action or is for navigation. For example, if the subtask is to add an item to the shopping cart, an element with text "shopping cart" will not work; you must find an element with text like "ADD to shopping cart." If the part after the slash does not help complete the task, do not consider that entire element.

In multiple seperate lines, respond with an ordered list, numbered 1 to 3, of the three best elements to interact with to accomplish the task, with the first line being the best.

In a new line, write a short phrase that describes what interacting with the element above achieved.

In the last line, write a short phrase that specifies the user's next subtask."""

    if len(done_already) == 0:
        done_already_str = "None yet"
    else:
        done_already_str = ", ".join(done_already)

    prompt = """The task is {goal}.
    The following elements have already been interacted with {done_already}.
    The next subtask is {next_task}. The element you choose to interact with must complete this subtask.
    The url of the current page is {current_url}.
    List of relevant, interactable elements:
    {relevant_elements}

    Which element should the user interact with and what is the next subtask?""".format(
        goal=goal,
        done_already=done_already_str,
        next_task=next_task,
        current_url=current_url,
        relevant_elements="\n".join(relevant_elements["tagandtext"]),
    )

    # save prompt to file; create the directory first and pin the encoding so
    # non-ASCII page text cannot fail under a platform default codec
    os.makedirs("output", exist_ok=True)
    with open("output/prompt.txt", "w", encoding="utf-8") as file:
        file.write(prompt)

    messages = [
        SystemMessage(content=system_instructions),
        HumanMessage(content=prompt),
    ]

    # Rough heuristic: about four characters per token.
    system_tokens = len(system_instructions) // 4
    message_tokens = len(prompt) // 4
    print(f"System tokens: {system_tokens}\nInput message tokens: {message_tokens}")

    return messages, message_tokens + system_tokens

querying LLM

In [5]:
def get_first_phrase(goal, url, llm):
    """Ask the chat model for a short phrase describing the first step towards ``goal``.

    Args:
        goal: What the user is trying to do.
        url: Page the user is already on.
        llm: Chat model forwarded to ``query_llm``.

    Returns:
        The model's reply as returned by ``query_llm``.
    """
    prompt = (
        f"I am already on the webpage {url}. "
        f"I am trying to {goal}. "
        "What is a short phrase that describes the first thing I need to do? "
        "Respond with the phrase only."
    )
    return query_llm([HumanMessage(content=prompt)], llm)

In [6]:
def query_llm(messages, llm):
    """Send ``messages`` to the chat model and return its reply text.

    Args:
        messages: List of chat messages to send.
        llm: Chat model exposing ``invoke(messages, max_tokens=...)`` and
            returning a message object with a ``content`` attribute.

    Returns:
        The reply content with surrounding whitespace removed.
    """
    reply = llm.invoke(messages, max_tokens=512)
    # Read the content attribute directly instead of serialising the whole
    # message through the deprecated ``.dict()`` just to pick one field.
    return reply.content.strip()

In [7]:
def parse_response(response):
    """Split the chat model's reply into candidate elements and the next subtask.

    Blank lines are ignored. Of the remaining lines, everything except the
    last two is a candidate element, the second-to-last line (the description
    of what was achieved) is discarded, and the last line is the next subtask.

    Leading list numbering ("1.", "2)") is removed from candidates and a
    leading "Next subtask:" label is removed from the last line, so that only
    the meaningful text is used for matching and for the next prompt.

    Args:
        response: Raw text returned by the chat model.

    Returns:
        Tuple ``(todo, next_task)``: the list of candidate element strings
        (empty when the reply has fewer than three non-blank lines) and the
        next subtask phrase.

    Raises:
        ValueError: If the reply contains no non-blank lines.
    """
    lines = [line.strip() for line in response.splitlines() if line.strip()]
    if not lines:
        raise ValueError("the chat model returned an empty response")

    todo = []
    for line in lines[:-2]:
        without_number = re.sub(r"^\d+\s*[.)]\s*", "", line, count=1)
        # Keep the raw line if stripping the numbering leaves nothing.
        todo.append(without_number or line)

    next_task = re.sub(
        r"^next\s+subtask\s*:\s*", "", lines[-1], count=1, flags=re.IGNORECASE
    )

    return todo, next_task or lines[-1]

interacting with the page

In [8]:
# fallback when we haven't implemented the action for a tag
def get_interact_code(html, goal, url, llm):
    """Ask the chat model for Selenium code that interacts with an element.

    Args:
        html: Outer HTML of the element to interact with.
        goal: What the interaction is meant to achieve.
        url: URL of the page the driver is on.
        llm: Chat model forwarded to ``query_llm``.

    Returns:
        The source code found inside the first fenced code block of the reply.

    Raises:
        ValueError: If the reply contains no complete fenced code block.
    """
    prompt = f"My Python Selenium Webdriver is on the webpage {url}. I want to interact with the following HTML element: {html}. My goal in interacting with this element is {goal}. It is currently selected with ```python\nselected_element = driver.find_element(By.XPATH, xpath)``` What is the Python code to interact with the element? Respond with the code only, without any import statements, in a markdown code block. This should take less than 5 statements."
    messages = [HumanMessage(content=prompt)]

    response = query_llm(messages, llm)
    print(f"executing response: {response}")

    # Accept "```python", "```py" and bare "```" fences; models do not always
    # tag the block even when asked for Python.
    fenced = re.search(
        r"```(?:python|py)?[ \t]*\r?\n?(.*?)```",
        response,
        flags=re.DOTALL | re.IGNORECASE,
    )
    if fenced is None:
        raise ValueError("no fenced code block found in the chat model response")

    return fenced.group(1).strip("\r\n")

In [9]:
def interact_element(element, xpath, driver, task, url, goal):
    """Interact with the page element located at ``xpath``.

    Links and buttons are clicked directly. For any other tag the element's
    outer HTML is sent to ``get_interact_code`` and the code it returns is
    executed.

    Args:
        element: Parsed element; only its ``tag`` is read here.
        xpath: XPath used to locate the live element through ``driver``.
        driver: WebDriver used to find the element.
        task: Not used inside this function.
        url: URL of the current page, forwarded to ``get_interact_code``.
        goal: Overall goal, forwarded to ``get_interact_code``.
    """
    selected_element = driver.find_element(By.XPATH, xpath)
    if element.tag in ["a", "button"]:
        selected_element.click()
        return
    # get html code
    html = selected_element.get_attribute("outerHTML")
    # NOTE(review): `llm` is not a parameter of this function; it is read from
    # the notebook-level `llm` variable.
    # NOTE(review): exec() runs model-generated code. The prompt built in
    # get_interact_code refers to `selected_element` and `driver` by name, so
    # these local names must stay as they are. Treat the generated code as
    # untrusted.
    exec(get_interact_code(html, goal, url, llm))
    return

In [10]:
def interact_page(goal, next_task, done_already, driver, url, embedding_model, llm):
    """Perform one step towards ``goal`` on the current page.

    Collects the elements relevant to ``next_task``, asks the chat model which
    ones to use, and tries the suggestions in order until one interaction
    succeeds.

    Args:
        goal: Overall task the user wants to accomplish.
        next_task: Subtask to complete on this page.
        done_already: List of ``"tag[text]"`` strings already interacted with.
            Every element that is attempted is appended to it, whether or not
            the interaction succeeds.
        driver: Selenium WebDriver positioned on the page.
        url: URL of the current page.
        embedding_model: Embedding client used for semantic search.
        llm: Chat model used to choose the element.

    Returns:
        Tuple ``(next_task, done_already)``: the subtask proposed by the model
        for the following step and the updated history list.

    Raises:
        ValueError: If no candidate elements were found, or if the prompt is
            estimated to exceed the input token limit.
        RuntimeError: If none of the suggested elements could be interacted with.
    """
    relevant_elements = get_relevant_elements(
        done_already, next_task, embedding_model, driver
    )
    # Without candidates the model has nothing to choose from and the lookup
    # below could not return a row; fail before spending a model call.
    if relevant_elements.empty:
        raise ValueError(f"no relevant elements found for subtask {next_task!r}")

    messages, input_tokens = create_messages(
        goal, done_already, next_task, url, relevant_elements
    )
    input_token_limit = 2048
    if input_tokens >= input_token_limit:
        raise ValueError(
            f"too many tokens\nthe limit is {input_token_limit}\nthe input has {input_tokens} tokens"
        )
    response = query_llm(messages, llm)
    print(f"response:\n{response}")

    todo, following_task = parse_response(response)

    for todo_item in todo:
        # find the element that matches the todo item using search
        selected_element = search(
            todo_item, relevant_elements, "tagandtext", embedding_model
        ).iloc[0]
        done_already.append(selected_element["tagandtext"])
        try:
            interact_element(
                selected_element["element"],
                selected_element["xpath"],
                driver,
                todo_item,
                url,
                goal,
            )
        except Exception as error:
            # Best effort: report why this element failed and try the next suggestion.
            print(
                f"failed interaction; trying next element (xpath: {selected_element['xpath']}): {error!r}"
            )
            continue
        return following_task, done_already

    # Previously the function fell through and returned None here, which
    # surfaced as an unrelated unpacking error in the calling cell.
    raise RuntimeError(
        f"none of the {len(todo)} suggested elements could be interacted with "
        f"for subtask {next_task!r}"
    )

In [11]:
# Overall objective and the page the run starts from.
goal = "add a macbook pro to my cart"
url = "https://apple.com/"
# Ask the chat model for the first subtask; interact_page uses it as the search phrase.
next_task = get_first_phrase(goal, url, llm)
print(f"first search phrase: {next_task}")
# History of "tag[text]" strings for elements already tried; interact_page appends to it.
done_already = []
# Start a Chrome WebDriver session and open the starting page.
driver = webdriver.Chrome()
driver.get(url)

first search phrase: Click on "Mac"


In [17]:
# repeat this cell to perform multiple actions
# Each run performs at most one successful interaction, then stores the returned
# subtask and history back into `next_task` / `done_already` for the next run.
# The URL is read from the driver so it reflects any navigation from earlier runs.
next_task, done_already = interact_page(
    goal=goal,
    next_task=next_task,
    done_already=done_already,
    driver=driver,
    url=driver.current_url,
    embedding_model=embedding_model,
    llm=llm,
)

System tokens: 294
Input message tokens: 284
response:
1. button[Pre-Installed Software / Add to Bag]
2. a[Shopping Bag]
3. button[Pre-Installed Software / Add to Saved Items]

Added the configured MacBook Pro to the cart.

Next subtask: Proceed to checkout.
