In [57]:
import os

from dotenv import load_dotenv
from huggingface_hub import login


# Read credentials from the local .env file; override=True lets values from
# .env replace variables already present in the process environment.
load_dotenv(override=True)

# Each constant is None when the corresponding variable is not set.
HF_TOKEN, SERPAPI_API_KEY, OPENAI_API_KEY, ANTHROPIC_API_KEY = (
    os.environ.get(env_name)
    for env_name in (
        "HUGGINGFACE_API_KEY",
        "SERPAPI_API_KEY",
        "OPENAI_API_KEY",
        "ANTHROPIC_API_KEY",
    )
)

# Authenticate this session with the Hugging Face Hub.
login(HF_TOKEN)

In [41]:
import argparse
import os
import threading

from dotenv import load_dotenv
from huggingface_hub import login
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SearchInformationTool,
    SimpleTextBrowser,
    VisitTool,
)
from scripts.visual_qa import visualizer

from smolagents import (
    CodeAgent,
    # HfApiModel,
    LiteLLMModel,
    ToolCallingAgent,
)


# Extra modules handed to the manager CodeAgent through its
# `additional_authorized_imports` argument. Order is kept as originally listed.
AUTHORIZED_IMPORTS = [
    "requests", "zipfile", "os", "pandas", "numpy", "sympy",
    "json", "bs4", "pubchempy", "xml", "yahoo_finance", "Bio",
    "sklearn", "scipy", "pydub", "io", "PIL", "chess",
    "PyPDF2", "pptx", "torch", "datetime", "fractions", "csv",
]
# Reload .env and log in to the Hugging Face Hub again. HF_TOKEN is not
# assigned in this cell, so it must already exist in the kernel namespace.
load_dotenv(override=True)
login(HF_TOKEN)

# NOTE(review): this lock is not referenced in any other visible cell;
# presumably left over from a multi-threaded script version — confirm.
append_answer_lock = threading.Lock()


def parse_args(argv=None):
    """Parse the command-line options for a single agent run.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, exactly as before.
            Pass ``[]`` when calling from a notebook to get the defaults:
            a Jupyter kernel's ``sys.argv`` carries launcher arguments that
            this parser does not define, which would make argparse exit.

    Returns:
        argparse.Namespace with the attributes ``model_id``, ``api_base``
        and ``question``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-id", type=str, default="o1")
    parser.add_argument("--api-base", type=str, default=None)
    parser.add_argument(
        "--question", type=str, default="How many studio albums did Mercedes Sosa release before 2007?"
    )
    return parser.parse_args(argv)


# Role renaming table; in the visible cells it is only referenced by a
# commented-out LiteLLMModel configuration.
custom_role_conversions = {
    "tool-call": "assistant",
    "tool-response": "user",
}

# User-Agent header sent with every request made by the text browser.
user_agent = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
)

# Keyword arguments later unpacked into SimpleTextBrowser(**BROWSER_CONFIG).
BROWSER_CONFIG = {
    "viewport_size": 5 * 1024,
    "downloads_folder": "downloads_folder",
    "request_kwargs": {"headers": {"User-Agent": user_agent}, "timeout": 300},
    "serpapi_key": SERPAPI_API_KEY,
}

# Create the downloads folder up front; an existing folder is not an error.
os.makedirs("./" + BROWSER_CONFIG["downloads_folder"], exist_ok=True)

In [42]:
from smolagents import OpenAIServerModel

# Limit handed to every TextInspectorTool as its second argument.
text_limit = 100000

# The notebook runs on gpt-4o through the OpenAI endpoint. Earlier experiments
# used LiteLLMModel configurations (including anthropic/claude-3-5-sonnet-latest).
model = OpenAIServerModel(
    api_key=OPENAI_API_KEY,
    api_base="https://api.openai.com/v1",
    model_id="gpt-4o",
)

# Inspector instance given directly to the manager agent.
document_inspection_tool = TextInspectorTool(model, text_limit)

# One browser object is shared by all navigation tools below.
browser = SimpleTextBrowser(**BROWSER_CONFIG)

# Browser-backed tools first, in their original order, then a separate
# TextInspectorTool instance for the search agent.
WEB_TOOLS = [
    tool_cls(browser)
    for tool_cls in (
        SearchInformationTool,
        VisitTool,
        PageUpTool,
        PageDownTool,
        FinderTool,
        FindNextTool,
        ArchiveSearchTool,
    )
]
WEB_TOOLS.append(TextInspectorTool(model, text_limit))

In [43]:
# Description of the search agent, passed as `description` below.
search_agent_description = """A team member that will search the internet to answer your question.
    Ask him for all your questions that require browsing the web.
    Provide him as much context as possible, in particular if you need to search on a specific timeframe!
    And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
    Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
    """

# Extra guidance appended to the search agent's managed-agent task template.
search_task_addendum = """You can navigate to .txt online files.
If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""

# Tool-calling agent that browses the web with WEB_TOOLS.
text_webbrowser_agent = ToolCallingAgent(
    tools=WEB_TOOLS,
    model=model,
    name="search_agent",
    description=search_agent_description,
    max_steps=20,
    planning_interval=4,
    verbosity_level=2,
    provide_run_summary=True,
)
managed_agent_templates = text_webbrowser_agent.prompt_templates["managed_agent"]
managed_agent_templates["task"] = managed_agent_templates["task"] + search_task_addendum

# Code-writing manager that delegates web research to the search agent.
manager_agent = CodeAgent(
    tools=[visualizer, document_inspection_tool],
    model=model,
    managed_agents=[text_webbrowser_agent],
    additional_authorized_imports=AUTHORIZED_IMPORTS,
    max_steps=12,
    planning_interval=4,
    verbosity_level=2,
)

In [44]:
# Run the text-browsing manager agent on a sample question and show its answer.
feature_list_task = "return a comprehensive feature list of hex.tech"
answer = manager_agent.run(feature_list_task)

print(f"Got this answer: {answer}")

Got this answer: 
Hex.tech is a comprehensive, collaborative workspace designed for data science and analytics that supports SQL, Python, and no-code environments. It offers features like secure data connections, collaboration tools, AI assist, and the ability to build interactive data apps and reports.

### Detailed Features:
- **Secure data connections**: Users can directly connect to data warehouses such as Snowflake, Redshift, and BigQuery, or cloud storage solutions like Amazon S3.
- **Multi-language support**: It supports SQL, Python, and no-code environments, allowing users to choose the best tool for their tasks.
- **No-code cells**: Includes built-in charts, input parameters, and pivot tables for ease of use without coding.
- **AI assist**: Hex Magic helps users generate, edit, debug, or document code and charts, offering intelligent suggestions and automation.
- **Collaboration features**: Real-time collaboration through multiplayer mode, commenting, versioning, and sharing &

In [54]:
from scripts.visual_qa import VisualQATool

from smolagents import CodeAgent, DuckDuckGoSearchTool, LiteLLMModel
from smolagents.vision_web_browser import (
    close_popups,
    go_back,
    helium_instructions,
    initialize_agent,
    save_screenshot,
    search_item_ctrl_f,
)

import helium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Chrome command-line flags for the browser window used by the vision agent.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in (
    "--force-device-scale-factor=1",
    "--window-size=1000,1350",
    "--disable-pdf-viewer",
    "--window-position=0,0",
):
    chrome_options.add_argument(chrome_flag)

# Start a visible (non-headless) Chrome session through helium.
driver = helium.start_chrome(headless=False, options=chrome_options)

# proprietary_model = LiteLLMModel("gpt-4o")
# Agent built by the smolagents helper, using the model defined in an earlier cell.
vision_browser_agent = initialize_agent(model)
### BUILD AGENTS & TOOLS

# CodeAgent(
#     tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
#     model=proprietary_model,
#     additional_authorized_imports=["helium"],
#     step_callbacks=[save_screenshot],
#     max_steps=20,
#     verbosity_level=2,
# )

# results_vision = answer_questions(
#     eval_ds,
#     vision_browser_agent,
#     "code_gpt4o_27-01_vision",
#     reformulation_model=proprietary_model,
#     output_folder="output_browsers",
#     visual_inspection_tool=VisualQAGPT4Tool(),
#     text_inspector_tool=TextInspectorTool(proprietary_model, 40000),
#     postprompt=helium_instructions
#     + "Any web browser controls won't work on .pdf urls, rather use the tool 'inspect_file_as_text' to read them",
# )

# text_webbrowser_agent = ToolCallingAgent(
#         model=model,
#         tools=WEB_TOOLS,
#         max_steps=20,
#         verbosity_level=2,
#         planning_interval=4,
#         name="search_agent",
#         description="""A team member that will search the internet to answer your question.
#     Ask him for all your questions that require browsing the web.
#     Provide him as much context as possible, in particular if you need to search on a specific timeframe!
#     And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
#     Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
#     """,
#         provide_run_summary=True,
#     )
# text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
# If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
# Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""

# Vision-browsing agent. This rebinds `manager_agent`, replacing the
# text-browsing manager created in the earlier cell.
manager_agent = CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=model,
    # save_screenshot is registered as a per-step callback.
    step_callbacks=[save_screenshot],
    # Generated code is allowed to import helium to drive the browser.
    additional_authorized_imports=["helium"],
    planning_interval=4,
    max_steps=12,
    verbosity_level=2,
)

# Prompt text describing the helium calls the agent may use; it is appended
# to the task string when the agent is run further down in this cell.
# NOTE(review): this assignment rebinds `helium_instructions`, replacing the
# object imported from smolagents.vision_web_browser at the top of this cell.
helium_instructions = """
You can use helium to access websites. Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
```py
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
```py
click("Top products")
```<end_code>

If it's a link:
Code:
```py
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200) # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>
"""

# Run the star-import inside the agent's own executor, matching what the
# instructions string tells the model has already been done.
manager_agent.python_executor("from helium import *", manager_agent.state)

# Same question as the text-agent run, with the helium instructions appended.
vision_task = "return a comprehensive feature list of hex.tech" + helium_instructions
answer = manager_agent.run(vision_task)

print(f"Got this answer: {answer}")

Captured a browser screenshot: (1000, 827) pixels


Captured a browser screenshot: (1000, 827) pixels


Captured a browser screenshot: (1000, 827) pixels


Captured a browser screenshot: (1000, 827) pixels


Got this answer: {'Community': ['Unlimited queries', 'Up to 5 projects', 'Up to 100 Magic actions per Editor/month', 'Embed visualizations or code cells anywhere', 'Try the latest techniques with templates and tutorials'], 'Professional': ['Includes features of Community plan, plus', 'Configure workspace data connections', 'Unlimited projects', 'Unlimited Magic actions', 'Save personal components (reusable groups of cells)', 'Restore versions up to 30 days'], 'Team': ['Includes features of Professional plan, plus', 'Approve changes with Reviews', 'Analyze deeper with Explore and View data', 'Define user groups to manage permissions', 'Unlimited collections', 'Share components (reusable groups of cells)'], 'Enterprise': ['Includes features of Team plan, plus', 'Integrate with OIDC SSO providers', 'Single tenant and HIPAA compliance', 'Purchase through AWS Marketplace', 'Configure audit logs']}


In [None]:
!pip install browser-use lxml_html_clean -q
!playwright install

In [None]:
import asyncio

import nest_asyncio


# Patch asyncio so run_until_complete (used by BrowserUseAgent.run below) can
# be called while an event loop is already running, as it is in a notebook kernel.
nest_asyncio.apply()

from browser_use import Agent
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI


# Load variables from .env; unlike the earlier cells, override=True is not passed,
# so variables already set in the environment keep their current values.
load_dotenv()


class BrowserUseAgent:
    """Thin adapter that runs a browser-use ``Agent`` behind a ``run`` method.

    Attributes:
        logs: Per-instance list. Nothing in this class writes to it.
        results: Object returned by the most recent ``Agent.run()`` call,
            or ``None`` if ``run`` has not completed yet.
    """

    # Class-level defaults are kept so attribute lookups on the class itself
    # still resolve; __init__ gives every instance its own state so that one
    # agent's logs or results are never shared with another.
    logs = []
    results = None

    def __init__(self):
        self.logs = []
        self.results = None

    def write_inner_memory_from_logs(self, summary_mode):
        """Return the stored result of the last run (``None`` before any run).

        Args:
            summary_mode: Accepted but not used.
        """
        return self.results

    def run(self, task, **kwargs):
        """Execute ``task`` with a fresh browser-use agent and return its final text.

        Args:
            task: Task description passed to the browser-use ``Agent``.
            **kwargs: Accepted but not used.

        Returns:
            The ``extracted_content`` of the first result in the last history
            entry of the run.
        """
        agent = Agent(
            task=task,
            llm=ChatOpenAI(model="gpt-4o"),
        )
        # agent.run() is a coroutine; drive it to completion on the current loop.
        self.results = asyncio.get_event_loop().run_until_complete(agent.run())
        return self.results.history[-1].result[0].extracted_content


browser_use_agent = BrowserUseAgent()

# NOTE(review): `answer_questions`, `eval_ds`, `proprietary_model` and
# `VisualQAGPT4Tool` are not assigned or imported in any cell visible in this
# notebook (`proprietary_model` only appears in commented-out code), so this
# call presumably relies on state defined elsewhere — confirm before re-running.
results_browseruse = answer_questions(
    eval_ds,
    browser_use_agent,
    "gpt-4o_27-01_browseruse",
    reformulation_model=proprietary_model,
    output_folder="output_browsers",
    visual_inspection_tool=VisualQAGPT4Tool(),
    text_inspector_tool=TextInspectorTool(proprietary_model, 40000),
    postprompt="",
    run_simple=True,
)

### Get results

In [None]:
import pandas as pd
from scripts.gaia_scorer import question_scorer


def _is_correct(row):
    """Score one result row's prediction against its reference answer."""
    return question_scorer(row["prediction"], row["true_answer"])


# Convert the three raw result collections to DataFrames (all are built
# before any name is rebound).
results_vision, results_text, results_browseruse = (
    pd.DataFrame(raw_results) for raw_results in (results_vision, results_text, results_browseruse)
)

# Add the per-row correctness flag to each table, in the original order.
for results_frame in (results_vision, results_text, results_browseruse):
    results_frame["is_correct"] = results_frame.apply(_is_correct, axis=1)

In [None]:
# Stack the three result tables, then show the mean of `is_correct` per agent.
all_result_frames = [results_vision, results_text, results_browseruse]
results = pd.concat(all_result_frames)
results.groupby("agent_name")["is_correct"].mean()

In [None]:
# Vision-agent rows whose prediction was scored as correct.
vision_correct_mask = results_vision["is_correct"]
correct_vision_results = results_vision[vision_correct_mask]
correct_vision_results

In [None]:
# Text-agent rows whose prediction was scored as incorrect.
text_incorrect_mask = ~results_text["is_correct"]
false_text_results = results_text[text_incorrect_mask]
false_text_results