From 27dbccaa297f9c524ba79462653fe318da57d8c2 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Wed, 18 Jun 2025 20:57:02 +0100 Subject: [PATCH 01/13] ts-cua sample added --- README.md | 5 + index.ts | 15 +- templates/typescript/cua-sample/.gitignore | 2 + .../typescript/cua-sample/.prettierignore | 1 + templates/typescript/cua-sample/.prettierrc | 4 + templates/typescript/cua-sample/README.md | 8 + templates/typescript/cua-sample/index.ts | 110 +++++++++ templates/typescript/cua-sample/lib/agent.ts | 223 ++++++++++++++++++ .../typescript/cua-sample/lib/computers.ts | 25 ++ .../cua-sample/lib/playwright/base.ts | 199 ++++++++++++++++ .../cua-sample/lib/playwright/kernel.ts | 76 ++++++ .../cua-sample/lib/playwright/local.ts | 75 ++++++ .../typescript/cua-sample/lib/toolset.ts | 20 ++ templates/typescript/cua-sample/lib/utils.ts | 89 +++++++ templates/typescript/cua-sample/package.json | 14 ++ templates/typescript/cua-sample/tsconfig.json | 30 +++ 16 files changed, 895 insertions(+), 1 deletion(-) create mode 100644 templates/typescript/cua-sample/.gitignore create mode 100644 templates/typescript/cua-sample/.prettierignore create mode 100644 templates/typescript/cua-sample/.prettierrc create mode 100644 templates/typescript/cua-sample/README.md create mode 100644 templates/typescript/cua-sample/index.ts create mode 100644 templates/typescript/cua-sample/lib/agent.ts create mode 100644 templates/typescript/cua-sample/lib/computers.ts create mode 100644 templates/typescript/cua-sample/lib/playwright/base.ts create mode 100644 templates/typescript/cua-sample/lib/playwright/kernel.ts create mode 100644 templates/typescript/cua-sample/lib/playwright/local.ts create mode 100644 templates/typescript/cua-sample/lib/toolset.ts create mode 100644 templates/typescript/cua-sample/lib/utils.ts create mode 100644 templates/typescript/cua-sample/package.json create mode 100644 templates/typescript/cua-sample/tsconfig.json diff --git a/README.md b/README.md index 652fe3c..c3627c1 
100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ create-kernel-app [app-name] [options] - `stagehand`: Template with Stagehand SDK (Typescript only) - `advanced-sample`: Implements sample apps using advanced Kernel configs - `computer-use`: Implements a prompt loop using Anthropic Computer Use + - `cua-sample`: Implements a Computer Use Agent (OpenAI CUA) sample (Typescript only) ### Examples @@ -121,6 +122,9 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google # Python + Browser Use kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}' + +# Typescript + CUA Sample +kernel invoke ts-cua agent-run --payload '{"query": "open hackernews and get the top 5 articles"}' ``` ## Sample apps reference @@ -134,6 +138,7 @@ These are the sample apps currently available when you run `npx @onkernel/create | **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` | | **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a | | **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` | +| **cua-sample** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ query }` | ## Documentation diff --git a/index.ts b/index.ts index 8111e91..95ace36 100644 --- a/index.ts +++ b/index.ts @@ -18,7 +18,8 @@ type TemplateKey = | "browser-use" | "stagehand" | "advanced-sample" - | "computer-use"; + | "computer-use" + | "cua-sample"; type LanguageInfo = { name: string; shorthand: string }; type TemplateInfo = { name: string; @@ -34,6 +35,7 @@ const TEMPLATE_BROWSER_USE = "browser-use"; const TEMPLATE_STAGEHAND = "stagehand"; const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample"; const TEMPLATE_COMPUTER_USE = "computer-use"; +const TEMPLATE_CUA_SAMPLE = "cua-sample"; const LANGUAGE_SHORTHAND_TS = "ts"; const LANGUAGE_SHORTHAND_PY = "py"; @@ -73,6 +75,11 @@ const TEMPLATES: Record = { description: "Implements the 
Anthropic Computer Use SDK", languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON], }, + [TEMPLATE_CUA_SAMPLE]: { + name: "CUA Sample", + description: "Implements a Computer Use Agent (OpenAI CUA) sample", + languages: [LANGUAGE_TYPESCRIPT], + }, }; const INVOKE_SAMPLES: Record< @@ -88,6 +95,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke ts-advanced test-captcha-solver', [TEMPLATE_COMPUTER_USE]: 'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', + [TEMPLATE_CUA_SAMPLE]: + 'kernel invoke ts-cua cua-task --payload \'{"query": "open hackernews and get the top 5 articles"}\'', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -114,6 +123,8 @@ const REGISTERED_APP_NAMES: Record< 'ts-advanced', [TEMPLATE_COMPUTER_USE]: 'ts-cu', + [TEMPLATE_CUA_SAMPLE]: + 'ts-cua', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -354,6 +365,8 @@ function printNextSteps( ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE ? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX" + : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA_SAMPLE + ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE) ? 
"kernel deploy main.py" : language === LANGUAGE_PYTHON && template === TEMPLATE_BROWSER_USE diff --git a/templates/typescript/cua-sample/.gitignore b/templates/typescript/cua-sample/.gitignore new file mode 100644 index 0000000..bbacd7f --- /dev/null +++ b/templates/typescript/cua-sample/.gitignore @@ -0,0 +1,2 @@ +node_modules +bun.lockb \ No newline at end of file diff --git a/templates/typescript/cua-sample/.prettierignore b/templates/typescript/cua-sample/.prettierignore new file mode 100644 index 0000000..b512c09 --- /dev/null +++ b/templates/typescript/cua-sample/.prettierignore @@ -0,0 +1 @@ +node_modules \ No newline at end of file diff --git a/templates/typescript/cua-sample/.prettierrc b/templates/typescript/cua-sample/.prettierrc new file mode 100644 index 0000000..79a1682 --- /dev/null +++ b/templates/typescript/cua-sample/.prettierrc @@ -0,0 +1,4 @@ +{ + "tabWidth": 1, + "useTabs": true +} diff --git a/templates/typescript/cua-sample/README.md b/templates/typescript/cua-sample/README.md new file mode 100644 index 0000000..83d52f3 --- /dev/null +++ b/templates/typescript/cua-sample/README.md @@ -0,0 +1,8 @@ +# Kernel Typescript Sample App - CUA + +This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI. + +It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. +Also makes use of the latest OpenAI SDK format. + +See the [docs](https://docs.onkernel.com/quickstart) for information. 
\ No newline at end of file diff --git a/templates/typescript/cua-sample/index.ts b/templates/typescript/cua-sample/index.ts new file mode 100644 index 0000000..f516932 --- /dev/null +++ b/templates/typescript/cua-sample/index.ts @@ -0,0 +1,110 @@ +// @ts-nocheck + +import "dotenv/config"; +import { Kernel, type KernelContext } from "@onkernel/sdk"; +import { chromium } from "playwright"; +import { Agent } from "./lib/agent"; +import computers from "./lib/computers"; + +const kernel = new Kernel(); +const app = kernel.app("ts-cua"); + +// LLM API Keys are set in the environment during `kernel deploy -e OPENAI_API_KEY=XXX` +// See https://docs.onkernel.com/launch/deploy#environment-variables +if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); + +/** + * Example app that runs an agent using OpenAI CUA + * Args: + * ctx: Kernel context containing invocation information + * payload: An object with a `query` property + * Returns: + * An answer to the query, elapsed time and optionally the messages stack + * Invoke this via CLI: + * export KERNEL_API_KEY= + * kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force + * kernel invoke ts-cua agent-run -p "{\"query\":\"current market price range for a used dreamcast\"}" + * kernel logs ts-cua -f # Open in separate tab + */ + +interface CuaInput { + query: string; +} + +interface CuaOutput { + elapsed: number; + response: Array; + answer: object; +} + +app.action( + "agent-run", + async (ctx: KernelContext, payload?: CuaInput): Promise => { + const startTime = Date.now(); + const kernelBrowser = await kernel.browsers.create({ + invocation_id: ctx.invocation_id, + }); + console.log( + "> Kernel browser live view url: ", + kernelBrowser.browser_live_view_url, + ); + + try { + + // kernel browser + const { computer } = await computers.create({ + type: "kernel", + cdp_ws_url: kernelBrowser.cdp_ws_url, + }); + + // setup agent + const agent = new Agent( + "computer-use-preview", + computer, + [], // 
additional tools + (message: string) => { + console.log(`> safety check: ${message}`); + return true; // Auto-acknowledge all safety checks for testing + }, + ); + + // start agent run + const response = await agent.runFullTurn( + [ + { + role: "system", + content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString("en-US", { weekday: "long" })})`, + }, + { + type: "message", + role: "user", + content: [ + { + type: "input_text", + text: payload.query, + // text: "go to https://news.ycombinator.com , open top article , describe the target website design (in yaml format)" + }, + ], + }, + ], + true, // print_steps + true, // debug + false, // show_images + ); + + console.log("> agent run done"); + + const endTime = Date.now(); + const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + + return { + // response, // full messages stack trace + elapsed: parseFloat(timeElapsed.toFixed(2)), + answer: response?.slice(-1)?.[0]?.content?.[0]?.text ?? 
null, + }; + } finally { + // Note: KernelPlaywrightComputer handles browser cleanup internally + // No need to manually close browser here + } + }, +); diff --git a/templates/typescript/cua-sample/lib/agent.ts b/templates/typescript/cua-sample/lib/agent.ts new file mode 100644 index 0000000..d5e2e11 --- /dev/null +++ b/templates/typescript/cua-sample/lib/agent.ts @@ -0,0 +1,223 @@ +// @ts-nocheck + +import utils from "./utils"; +import computers from "./computers"; +import toolset from "./toolset"; + +import type { BasePlaywrightComputer } from "./playwright/base"; + +interface Tool { + type: string; + display_width?: number; + display_height?: number; + environment?: string; + [key: string]: any; +} + +interface SafetyCheck { + message: string; + [key: string]: any; +} + +interface Item { + type: string; + content?: Array<{ type: string; text: string }>; + name?: string; + arguments?: string; + call_id?: string; + action?: { + type: string; + [key: string]: any; + }; + pending_safety_checks?: SafetyCheck[]; + role?: string; + [key: string]: any; +} + +interface ComputerCallOutput { + type: string; + call_id: string; + acknowledged_safety_checks: SafetyCheck[]; + output: { + type: string; + image_url: string; + current_url?: string; + }; +} + +type AcknowledgeSafetyCheckCallback = (message: string) => boolean; + +/** + * A sample agent class that can be used to interact with a computer. 
+ */ +export class Agent { + private model: string; + private computer: BasePlaywrightComputer | null; + private tools: Tool[]; + private print_steps: boolean; + private debug: boolean; + private show_images: boolean; + private acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback; + + constructor( + model: string = "computer-use-preview", + computer: BasePlaywrightComputer | null = null, + tools: Tool[], + acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback = () => + true, + ) { + this.model = model; + this.computer = computer; + this.tools = [...toolset.shared, ...tools]; + this.print_steps = true; + this.debug = false; + this.show_images = false; + this.acknowledge_safety_check_callback = acknowledge_safety_check_callback; + + if (computer) { + const dimensions = computer.getDimensions(); + this.tools.push({ + type: "computer-preview", + display_width: dimensions[0], + display_height: dimensions[1], + environment: computer.getEnvironment(), + }); + } + } + + private debugPrint(...args: any[]): void { + if (this.debug) { + console.warn("--- debug:agent:debugPrint"); + console.dir(...args, { depth: null }); + } + } + + private async handleItem(item: Item): Promise { + /**Handle each item; may cause a computer action + screenshot.*/ + if (item.type === "message") { + if (this.print_steps) { + console.log(item.content![0].text); + } + } + + if (item.type === "function_call") { + const name = item.name!; + const args = JSON.parse(item.arguments!); + if (this.print_steps) { + console.log(`${name}(${JSON.stringify(args)})`); + } + + if (this.computer && (this.computer as any)[name]) { + const method = (this.computer as any)[name]; + await method.call(this.computer, ...Object.values(args)); + } + return [ + { + type: "function_call_output", + call_id: item.call_id!, + output: "success", // hard-coded output for demo + }, + ]; + } + + if (item.type === "computer_call") { + const action = item.action!; + const action_type = action.type; + const 
action_args = Object.fromEntries( + Object.entries(action).filter(([k]) => k !== "type"), + ); + if (this.print_steps) { + console.log(`${action_type}(${JSON.stringify(action_args)})`); + } + + if (this.computer) { + const method = (this.computer as any)[action_type]; + await method.call(this.computer, ...Object.values(action_args)); + + const screenshot_base64 = await this.computer.screenshot(); + // console.dir({ debug: { screenshot_base64 }}) + + // if user doesn't ack all safety checks exit with error + const pending_checks = item.pending_safety_checks || []; + for (const check of pending_checks) { + const message = check.message; + if (!this.acknowledge_safety_check_callback(message)) { + throw new Error( + `Safety check failed: ${message}. Cannot continue with unacknowledged safety checks.`, + ); + } + } + + const call_output: ComputerCallOutput = { + type: "computer_call_output", + call_id: item.call_id!, + acknowledged_safety_checks: pending_checks, + output: { + type: "input_image", + image_url: `data:image/webp;base64,${screenshot_base64}`, + }, + }; + + // additional URL safety checks for browser environments + if (this.computer.getEnvironment() === "browser") { + const current_url = this.computer.getCurrentUrl(); + utils.checkBlocklistedUrl(current_url); + call_output.output.current_url = current_url; + } + + return [call_output]; + } + } + return []; + } + + async runFullTurn( + input_items: Item[], + print_steps: boolean = true, + debug: boolean = false, + show_images: boolean = false, + ): Promise { + this.print_steps = print_steps; + this.debug = debug; + this.show_images = show_images; + const new_items: Item[] = []; + + // keep looping until we get a final response + while ( + new_items.length === 0 || + new_items[new_items.length - 1].role !== "assistant" + ) { + this.debugPrint( + input_items.concat(new_items).map((msg) => utils.sanitizeMessage(msg)), + ); + + const response = await utils.createResponse({ + model: this.model, + input: 
input_items.concat(new_items), + tools: this.tools, + truncation: "auto", + }); + this.debugPrint(response); + + if (!response.output && this.debug) { + console.log(response); + throw new Error("No output from model"); + } else { + new_items.push(...response.output); + for (const item of response.output) { + const handled_items = await this.handleItem(item); + new_items.push(...handled_items); + } + } + } + + // Return sanitized messages if show_images is false + if (!show_images) { + return new_items.map((msg) => utils.sanitizeMessage(msg)); + } + + return new_items; + } +} + +export default { Agent }; diff --git a/templates/typescript/cua-sample/lib/computers.ts b/templates/typescript/cua-sample/lib/computers.ts new file mode 100644 index 0000000..3c8aa47 --- /dev/null +++ b/templates/typescript/cua-sample/lib/computers.ts @@ -0,0 +1,25 @@ +import { KernelPlaywrightComputer } from "./playwright/kernel.ts"; +import { LocalPlaywrightComputer } from "./playwright/local.ts"; + +interface ComputerConfig { + type: "local" | "kernel"; + [key: string]: any; +} + +const computers = { + async create({ type, ...args }: ComputerConfig) { + if (type === "kernel") { + const computer = new KernelPlaywrightComputer(args.cdp_ws_url); + await computer.enter(); + return { computer }; + } else if (type === "local") { + const computer = new LocalPlaywrightComputer(args.headless); + await computer.enter(); + return { computer }; + } else { + throw new Error(`Unknown computer type: ${type}`); + } + }, +}; + +export default computers; diff --git a/templates/typescript/cua-sample/lib/playwright/base.ts b/templates/typescript/cua-sample/lib/playwright/base.ts new file mode 100644 index 0000000..fad3d79 --- /dev/null +++ b/templates/typescript/cua-sample/lib/playwright/base.ts @@ -0,0 +1,199 @@ +// @ts-nocheck + +import utils from "../utils.ts"; +import sharp from "sharp"; +import type { Browser, Page, Route, Request } from "playwright"; + +// Optional: key mapping if your model uses "CUA" 
style keys +const CUA_KEY_TO_PLAYWRIGHT_KEY: Record = { + "/": "/", + "\\": "\\", + alt: "Alt", + arrowdown: "ArrowDown", + arrowleft: "ArrowLeft", + arrowright: "ArrowRight", + arrowup: "ArrowUp", + backspace: "Backspace", + capslock: "CapsLock", + cmd: "Meta", + ctrl: "Control", + delete: "Delete", + end: "End", + enter: "Enter", + esc: "Escape", + home: "Home", + insert: "Insert", + option: "Alt", + pagedown: "PageDown", + pageup: "PageUp", + shift: "Shift", + space: " ", + super: "Meta", + tab: "Tab", + win: "Meta", +}; + +interface Point { + x: number; + y: number; +} + +/** + * Abstract base for Playwright-based computers: + * + * - Subclasses override `_getBrowserAndPage()` to do local or remote connection, + * returning [Browser, Page]. + * - This base class handles context creation (`enter()`/`exit()`), + * plus standard "Computer" actions like click, scroll, etc. + * - We also have extra browser actions: `goto(url)` and `back()`. + */ +export class BasePlaywrightComputer { + protected _browser: Browser | null = null; + protected _page: Page | null = null; + + constructor() { + this._browser = null; + this._page = null; + } + + getEnvironment(): string { + return "browser"; + } + + getDimensions(): [number, number] { + return [1024, 768]; + } + + async enter(): Promise { + // Call the subclass hook for getting browser/page + [this._browser, this._page] = await this._getBrowserAndPage(); + + // Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS + const handleRoute = (route: Route, request: Request): void => { + const url = request.url(); + if (utils.checkBlocklistedUrl(url)) { + console.log(`Flagging blocked domain: ${url}`); + route.abort(); + } else { + route.continue(); + } + }; + + await this._page!.route("**/*", handleRoute); + return this; + } + + async exit(): Promise { + if (this._browser) { + await this._browser.close(); + } + } + + getCurrentUrl(): string { + return this._page!.url(); + } + + // Common "Computer" actions 
+ async screenshot(): Promise { + // Capture only the viewport (not full_page) + const screenshotBuffer = await this._page!.screenshot({ fullPage: false }); + const webpBuffer = await sharp(screenshotBuffer).webp().toBuffer(); + return webpBuffer.toString("base64"); + } + + async click(button: string = "left", x: number, y: number): Promise { + // console.dir({ debug:{base:{click:{x,y,button}}} },{depth:null}) + switch (button) { + case "back": + await this.back(); + break; + case "forward": + await this.forward(); + break; + case "wheel": + await this._page!.mouse.wheel(x, y); + break; + default: + const buttonMapping: Record = { + left: "left", + right: "right", + }; + const buttonType = + buttonMapping[button as keyof typeof buttonMapping] || "left"; + await this._page!.mouse.click(x, y, { button: buttonType }); + } + } + + async doubleClick(x: number, y: number): Promise { + await this._page!.mouse.dblclick(x, y); + } + + async scroll( + x: number, + y: number, + scrollX: number, + scrollY: number, + ): Promise { + await this._page!.mouse.move(x, y); + await this._page!.evaluate(`window.scrollBy(${scrollX}, ${scrollY})`); + } + + async type(text: string): Promise { + await this._page!.keyboard.type(text); + } + + async keypress(keys: string[]): Promise { + const mappedKeys = keys.map( + (key) => CUA_KEY_TO_PLAYWRIGHT_KEY[key.toLowerCase()] || key, + ); + for (const key of mappedKeys) { + await this._page!.keyboard.down(key); + } + for (const key of mappedKeys.reverse()) { + await this._page!.keyboard.up(key); + } + } + + async wait(ms: number = 1000): Promise { + await new Promise((resolve) => setTimeout(resolve, ms)); + } + + async move(x: number, y: number): Promise { + await this._page!.mouse.move(x, y); + } + + async drag(path: Point[]): Promise { + if (!path.length) { + return; + } + await this._page!.mouse.move(path[0].x, path[0].y); + await this._page!.mouse.down(); + for (const point of path.slice(1)) { + await this._page!.mouse.move(point.x, point.y); 
+ } + await this._page!.mouse.up(); + } + + // Extra browser-oriented actions + async goto(url: string): Promise { + try { + return await this._page!.goto(url); + } catch (e) { + console.log(`Error navigating to ${url}: ${e}`); + } + } + + async back(): Promise { + return await this._page!.goBack(); + } + + async forward(): Promise { + return await this._page!.goForward(); + } + + // Subclass hook + async _getBrowserAndPage(): Promise<[Browser, Page]> { + // Subclasses must implement, returning [Browser, Page] + throw new Error("Subclasses must implement _getBrowserAndPage()"); + } +} diff --git a/templates/typescript/cua-sample/lib/playwright/kernel.ts b/templates/typescript/cua-sample/lib/playwright/kernel.ts new file mode 100644 index 0000000..f945f9f --- /dev/null +++ b/templates/typescript/cua-sample/lib/playwright/kernel.ts @@ -0,0 +1,76 @@ +// @ts-nocheck + +import { chromium, type Browser, type Page } from "playwright"; +import { BasePlaywrightComputer } from "./base"; + +/** + * KernelPlaywrightComputer connects to a remote browser instance via CDP WebSocket URL. + * Similar to LocalPlaywrightComputer but uses an existing browser instance instead of launching one. 
+ */ +export class KernelPlaywrightComputer extends BasePlaywrightComputer { + private cdp_ws_url: string; + + constructor(cdp_ws_url: string) { + super(); + this.cdp_ws_url = cdp_ws_url; + } + + async _getBrowserAndPage(): Promise<[Browser, Page]> { + const [width, height] = this.getDimensions(); + + // Connect to existing browser instance via CDP + const browser = await chromium.connectOverCDP(this.cdp_ws_url); + + // Get existing context or create new one + let context = browser.contexts()[0]; + if (!context) { + context = await browser.newContext(); + } + + // Add event listeners for page creation and closure + context.on("page", this._handleNewPage.bind(this)); + + // Get existing page or create new one + let page = context.pages()[0]; + if (!page) { + page = await context.newPage(); + } + + // Set viewport size + await page.setViewportSize({ width, height }); + page.on("close", this._handlePageClose.bind(this)); + + return [browser, page]; + } + + private _handleNewPage(page: Page): void { + /** Handle the creation of a new page. */ + console.log("New page created"); + this._page = page; + page.on("close", this._handlePageClose.bind(this)); + } + + private _handlePageClose(page: Page): void { + /** Handle the closure of a page. 
*/ + console.log("Page closed"); + if (this._page === page) { + // Check if browser and contexts exist before accessing + if ( + this._browser && + this._browser.contexts && + this._browser.contexts.length > 0 + ) { + const context = this._browser.contexts[0]; + if (context.pages && context.pages.length > 0) { + this._page = context.pages[context.pages.length - 1]; + } else { + console.log("Warning: All pages have been closed."); + this._page = null; + } + } else { + console.log("Warning: Browser or context not available."); + this._page = null; + } + } + } +} diff --git a/templates/typescript/cua-sample/lib/playwright/local.ts b/templates/typescript/cua-sample/lib/playwright/local.ts new file mode 100644 index 0000000..3556049 --- /dev/null +++ b/templates/typescript/cua-sample/lib/playwright/local.ts @@ -0,0 +1,75 @@ +// @ts-nocheck + +import { chromium, type Browser, type Page } from "playwright"; +import { BasePlaywrightComputer } from "./base"; + +/** + * Launches a local Chromium instance using Playwright. 
+ */ +export class LocalPlaywrightComputer extends BasePlaywrightComputer { + private headless: boolean; + + constructor(headless: boolean = false) { + super(); + this.headless = headless; + } + + async _getBrowserAndPage(): Promise<[Browser, Page]> { + const [width, height] = this.getDimensions(); + const launchArgs = [ + `--window-size=${width},${height}`, + "--disable-extensions", + "--disable-file-system", + ]; + + const browser = await chromium.launch({ + headless: this.headless, + args: launchArgs, + env: { DISPLAY: ":0" }, + }); + + const context = await browser.newContext(); + + // Add event listeners for page creation and closure + context.on("page", this._handleNewPage.bind(this)); + + const page = await context.newPage(); + await page.setViewportSize({ width, height }); + page.on("close", this._handlePageClose.bind(this)); + + await page.goto("https://bing.com"); + + return [browser, page]; + } + + private _handleNewPage(page: Page): void { + /** Handle the creation of a new page. */ + console.log("New page created"); + this._page = page; + page.on("close", this._handlePageClose.bind(this)); + } + + private _handlePageClose(page: Page): void { + /** Handle the closure of a page. 
*/ + console.log("Page closed"); + if (this._page === page) { + // Check if browser and contexts exist before accessing + if ( + this._browser && + this._browser.contexts && + this._browser.contexts.length > 0 + ) { + const context = this._browser.contexts[0]; + if (context.pages && context.pages.length > 0) { + this._page = context.pages[context.pages.length - 1]; + } else { + console.log("Warning: All pages have been closed."); + this._page = null; + } + } else { + console.log("Warning: Browser or context not available."); + this._page = null; + } + } + } +} diff --git a/templates/typescript/cua-sample/lib/toolset.ts b/templates/typescript/cua-sample/lib/toolset.ts new file mode 100644 index 0000000..592f421 --- /dev/null +++ b/templates/typescript/cua-sample/lib/toolset.ts @@ -0,0 +1,20 @@ +const shared = [ + { + type: "function", + name: "goto", + description: "Go to a specific URL.", + parameters: { + type: "object", + properties: { + url: { + type: "string", + description: "Fully qualified URL to navigate to.", + }, + }, + additionalProperties: false, + required: ["url"], + }, + }, +]; + +export default { shared }; diff --git a/templates/typescript/cua-sample/lib/utils.ts b/templates/typescript/cua-sample/lib/utils.ts new file mode 100644 index 0000000..15673ef --- /dev/null +++ b/templates/typescript/cua-sample/lib/utils.ts @@ -0,0 +1,89 @@ +// @ts-nocheck + +import "dotenv/config"; +import sharp from "sharp"; +import OpenAI from "openai"; + +const BLOCKED_DOMAINS: readonly string[] = [ + "maliciousbook.com", + "evilvideos.com", + "darkwebforum.com", + "shadytok.com", + "suspiciouspins.com", + "ilanbigio.com", +] as const; + +interface ImageDimensions { + width: number; + height: number; +} + +interface ComputerCallOutput { + type: "computer_call_output"; + output?: { + image_url?: string; + [key: string]: any; + }; + [key: string]: any; +} + +interface Message { + type: string; + output?: any; + [key: string]: any; +} + +async function 
calculateImageDimensions( + base64Image: string, +): Promise { + const imageBuffer = Buffer.from(base64Image, "base64"); + const metadata = await sharp(imageBuffer).metadata(); + return { width: metadata.width!, height: metadata.height! }; +} + +function sanitizeMessage(msg: Message): Message { + /** Return a copy of the message with image_url omitted for computer_call_output messages. */ + if (msg.type === "computer_call_output") { + const output = msg.output || {}; + if (typeof output === "object") { + const sanitized = { ...msg }; + sanitized.output = { ...output, image_url: "[omitted]" }; + return sanitized; + } + } + return msg; +} + +async function createResponse(kwargs: any): Promise { + const openai = new OpenAI(); + /* + console.error("--- debug:utils:createResponse"); + console.dir( + { createResponse: { ...kwargs, input: kwargs.input?.map(sanitizeMessage) } }, + { depth: null }, + ); + */ + + try { + const response = await openai.responses.create(kwargs); + return response; + } catch (error: any) { + console.error(`Error: ${error.status} ${error.message}`); + throw error; + } +} + +function checkBlocklistedUrl(url: string): boolean { + /** Return true if the given URL (including subdomains) is in the blocklist. 
*/ + const hostname = new URL(url).hostname || ""; + return BLOCKED_DOMAINS.some( + (blocked) => hostname === blocked || hostname.endsWith(`.${blocked}`), + ); +} + +export default { + calculateImageDimensions, + sanitizeMessage, + createResponse, + checkBlocklistedUrl, +}; diff --git a/templates/typescript/cua-sample/package.json b/templates/typescript/cua-sample/package.json new file mode 100644 index 0000000..d334289 --- /dev/null +++ b/templates/typescript/cua-sample/package.json @@ -0,0 +1,14 @@ +{ + "type": "module", + "private": true, + "dependencies": { + "@onkernel/sdk": "^0.5.0", + "dotenv": "^16.5.0", + "openai": "^5.5.1", + "playwright": "^1.53.0", + "sharp": "^0.34.2" + }, + "peerDependencies": { + "typescript": "^5.8.3" + } +} diff --git a/templates/typescript/cua-sample/tsconfig.json b/templates/typescript/cua-sample/tsconfig.json new file mode 100644 index 0000000..f5c1fe2 --- /dev/null +++ b/templates/typescript/cua-sample/tsconfig.json @@ -0,0 +1,30 @@ +{ + "compilerOptions": { + // Environment setup & latest features + "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleDetection": "force", + "jsx": "react-jsx", + "allowJs": true, + + // Bundler mode + "moduleResolution": "bundler", + "allowImportingTsExtensions": true, + "verbatimModuleSyntax": true, + "noEmit": true, + + // Best practices + "strict": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true, + + // Some stricter flags (disabled by default) + "noUnusedLocals": false, + "noUnusedParameters": false, + "noPropertyAccessFromIndexSignature": false + }, + "include": ["./**/*.ts", "./**/*.tsx"], + "exclude": ["node_modules", "dist"] +} From 7ff9af970d43ffb322bd068d52b01f8031dc8b34 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 03:35:47 +0100 Subject: [PATCH 02/13] python/browser-use : fixed viewport & window resize --- templates/python/browser-use/main.py | 127 +++++++++++++++++++++- 
templates/typescript/cua-sample/README.md | 2 +- 2 files changed, 126 insertions(+), 3 deletions(-) diff --git a/templates/python/browser-use/main.py b/templates/python/browser-use/main.py index 5dfc7cc..5e1f0c3 100644 --- a/templates/python/browser-use/main.py +++ b/templates/python/browser-use/main.py @@ -13,7 +13,130 @@ class TaskInput(TypedDict): # LLM API Keys are set in the environment during `kernel deploy -e OPENAI_API_KEY=XXX` # See https://docs.onkernel.com/launch/deploy#environment-variables -llm = ChatOpenAI(model="gpt-4o") +llm = ChatOpenAI(model="gpt-4o-mini") + + +# Define a subclass of BrowserSession that overrides _setup_viewports (which mishandles resizing when connecting via CDP) +class BrowserSessionCustomResize(BrowserSession): + async def _setup_viewports(self) -> None: + """Resize any existing page viewports to match the configured size, set up storage_state, permissions, geolocation, etc.""" + + assert self.browser_context, 'BrowserSession.browser_context must already be set up before calling _setup_viewports()' + + self.browser_profile.window_size = {"width": 1024, "height": 786} + self.browser_profile.viewport = {"width": 1024, "height": 786} + self.browser_profile.screen = {"width": 1024, "height": 786} + self.browser_profile.device_scale_factor = 1.0 + + # log the viewport settings to terminal + viewport = self.browser_profile.viewport + print( + '📐 Setting up viewport: ' + + f'headless={self.browser_profile.headless} ' + + ( + f'window={self.browser_profile.window_size["width"]}x{self.browser_profile.window_size["height"]}px ' + if self.browser_profile.window_size + else '(no window) ' + ) + + ( + f'screen={self.browser_profile.screen["width"]}x{self.browser_profile.screen["height"]}px ' + if self.browser_profile.screen + else '' + ) + + (f'viewport={viewport["width"]}x{viewport["height"]}px ' if viewport else '(no viewport) ') + + f'device_scale_factor={self.browser_profile.device_scale_factor or 1.0} ' + +
f'is_mobile={self.browser_profile.is_mobile} ' + + (f'color_scheme={self.browser_profile.color_scheme.value} ' if self.browser_profile.color_scheme else '') + + (f'locale={self.browser_profile.locale} ' if self.browser_profile.locale else '') + + (f'timezone_id={self.browser_profile.timezone_id} ' if self.browser_profile.timezone_id else '') + + (f'geolocation={self.browser_profile.geolocation} ' if self.browser_profile.geolocation else '') + + (f'permissions={",".join(self.browser_profile.permissions or [""])} ') + ) + + # if we have any viewport settings in the profile, make sure to apply them to the entire browser_context as defaults + if self.browser_profile.permissions: + try: + await self.browser_context.grant_permissions(self.browser_profile.permissions) + except Exception as e: + self.logger.warning( + f'⚠️ Failed to grant browser permissions {self.browser_profile.permissions}: {type(e).__name__}: {e}' + ) + try: + if self.browser_profile.default_timeout: + self.browser_context.set_default_timeout(self.browser_profile.default_timeout) + if self.browser_profile.default_navigation_timeout: + self.browser_context.set_default_navigation_timeout(self.browser_profile.default_navigation_timeout) + except Exception as e: + self.logger.warning( + f'⚠️ Failed to set playwright timeout settings ' + f'cdp_api={self.browser_profile.default_timeout} ' + f'navigation={self.browser_profile.default_navigation_timeout}: {type(e).__name__}: {e}' + ) + try: + if self.browser_profile.extra_http_headers: + self.browser_context.set_extra_http_headers(self.browser_profile.extra_http_headers) + except Exception as e: + self.logger.warning( + f'⚠️ Failed to setup playwright extra_http_headers: {type(e).__name__}: {e}' + ) # dont print the secret header contents in the logs! 
+ + try: + if self.browser_profile.geolocation: + await self.browser_context.set_geolocation(self.browser_profile.geolocation) + except Exception as e: + self.logger.warning( + f'⚠️ Failed to update browser geolocation {self.browser_profile.geolocation}: {type(e).__name__}: {e}' + ) + + await self.load_storage_state() + + page = None + + for page in self.browser_context.pages: + # apply viewport size settings to any existing pages + if viewport: + await page.set_viewport_size(viewport) + + # show browser-use dvd screensaver-style bouncing loading animation on any about:blank pages + if page.url == 'about:blank': + await self._show_dvd_screensaver_loading_animation(page) + + page = page or (await self.browser_context.new_page()) + + if (not viewport) and (self.browser_profile.window_size is not None) and not self.browser_profile.headless: + # attempt to resize the actual browser window + + # cdp api: https://chromedevtools.github.io/devtools-protocol/tot/Browser/#method-setWindowBounds + try: + cdp_session = await page.context.new_cdp_session(page) + window_id_result = await cdp_session.send('Browser.getWindowForTarget') + await cdp_session.send( + 'Browser.setWindowBounds', + { + 'windowId': window_id_result['windowId'], + 'bounds': { + **self.browser_profile.window_size, + 'windowState': 'normal', # Ensure window is not minimized/maximized + }, + }, + ) + await cdp_session.detach() + except Exception as e: + _log_size = lambda size: f'{size["width"]}x{size["height"]}px' + try: + # fallback to javascript resize if cdp setWindowBounds fails + await page.evaluate( + """(width, height) => {window.resizeTo(width, height)}""", + **self.browser_profile.window_size, + ) + return + except Exception: # no 'as e': rebinding would unbind the outer 'e' used by the warning below + pass + + self.logger.warning( + f'⚠️ Failed to resize browser window to {_log_size(self.browser_profile.window_size)} using CDP setWindowBounds: {type(e).__name__}: {e}' + ) + + @app.action("bu-task") async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput): @@
-37,7 +160,7 @@ async def bu_task(ctx: kernel.KernelContext, input_data: TaskInput): #task="Compare the price of gpt-4o and DeepSeek-V3", task=input_data["task"], llm=llm, - browser_session=BrowserSession(cdp_url=kernel_browser.cdp_ws_url) + browser_session=BrowserSessionCustomResize(cdp_url=kernel_browser.cdp_ws_url) ) result = await agent.run() if result.final_result() is not None: diff --git a/templates/typescript/cua-sample/README.md b/templates/typescript/cua-sample/README.md index 83d52f3..0cb2dfe 100644 --- a/templates/typescript/cua-sample/README.md +++ b/templates/typescript/cua-sample/README.md @@ -3,6 +3,6 @@ This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI. It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. -Also makes use of the latest OpenAI SDK format. +Also makes use of the latest OpenAI SDK format, and provides local equivalents of the Kernel methods so you can test locally before deploying to Kernel. See the [docs](https://docs.onkernel.com/quickstart) for information.
\ No newline at end of file From b3dfef64844f4a2d62d4abab3f978c07bbfea3e0 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 19:11:49 +0100 Subject: [PATCH 03/13] ts-cua updates+ --- README.md | 4 ++-- templates/typescript/cua/index.ts | 26 +++++++++++++------------- templates/typescript/cua/lib/agent.ts | 18 +++++++++++------- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index 1f7dc9f..b348fd8 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}' # Typescript + CUA Sample -kernel invoke ts-cua cua-task --payload '{"query": "open hackernews and get the top 5 articles"}' +kernel invoke ts-cua cua-task --payload '{"task": "open hackernews and get the top 5 articles"}' ``` ## Sample apps reference @@ -138,7 +138,7 @@ These are the sample apps currently available when you run `npx @onkernel/create | **stagehand** | Returns the first result of a specified Google search | Stagehand | `{ query }` | | **advanced-sample** | Implements sample apps using advanced Kernel configs | n/a | | **computer-use** | Implements a prompt loop | Anthropic Computer Use API | `{ query }` | -| **cua** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ query }` | +| **cua** | Implements the OpenAI Computer Using Agent (CUA) | OpenAI CUA | `{ task }` | ## Documentation diff --git a/templates/typescript/cua/index.ts b/templates/typescript/cua/index.ts index cd8c16a..135f32c 100644 --- a/templates/typescript/cua/index.ts +++ b/templates/typescript/cua/index.ts @@ -15,18 +15,18 @@ if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); * Example app that run an agent using openai CUA * Args: * ctx: Kernel context containing invocation information - * payload: An object with a `query` property + * payload: An object 
with a `task` property * Returns: - * An answer to the query, elapsed time and optionally the messages stack + * An answer to the task, elapsed time and optionally the messages stack * Invoke this via CLI: * export KERNEL_API_KEY= * kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force - * kernel invoke ts-cua cua-task -p "{\"query\":\"current market price range for a used dreamcast\"}" + * kernel invoke ts-cua cua-task -p "{\"task\":\"current market price range for a used dreamcast\"}" * kernel logs ts-cua -f # Open in separate tab */ interface CuaInput { - query: string; + task: string; } interface CuaOutput { @@ -47,28 +47,28 @@ app.action( kernelBrowser.browser_live_view_url, ); - if (!payload?.query){ - throw new Error('query is required'); + if (!payload?.task){ + throw new Error('task is required'); } try { // kernel browser const { computer } = await computers.create({ - type: "kernel", + type: "kernel", // for local testing before deploying to Kernel, you can use type: "local" cdp_ws_url: kernelBrowser.cdp_ws_url, }); // setup agent - const agent = new Agent( - "computer-use-preview", + const agent = new Agent({ + model: "computer-use-preview", computer, - [], // additional tools - (message: string) => { + tools: [], // additional function_call tools to provide to the llm + acknowledge_safety_check_callback: (message: string) => { console.log(`> safety check: ${message}`); return true; // Auto-acknowledge all safety checks for testing }, - ); + }); // start agent run const response = await agent.runFullTurn({ @@ -83,7 +83,7 @@ app.action( content: [ { type: "input_text", - text: payload.query, + text: payload.task, // text: "go to https://news.ycombinator.com , open top article , describe the target website design (in yaml format)" }, ], diff --git a/templates/typescript/cua/lib/agent.ts b/templates/typescript/cua/lib/agent.ts index 0c47559..f28b298 100644 --- a/templates/typescript/cua/lib/agent.ts +++ b/templates/typescript/cua/lib/agent.ts @@ -47,13 
+47,17 @@ export class Agent { private show_images: boolean; private acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback; - constructor( - model: string = "computer-use-preview", - computer: BasePlaywrightComputer | null = null, - tools: Tool[], - acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback = () => - true, - ) { + constructor({ + model = "computer-use-preview", + computer = null, + tools = [], + acknowledge_safety_check_callback = () => true, + }: { + model?: string; + computer?: BasePlaywrightComputer | null; + tools?: Tool[]; + acknowledge_safety_check_callback?: AcknowledgeSafetyCheckCallback; + }) { this.model = model; this.computer = computer; this.tools = [...toolset.shared, ...tools]; From b77c2f07d948bb1d31137f88f435e56bb4f01e51 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 19:14:13 +0100 Subject: [PATCH 04/13] kernel sdk version in package --- templates/typescript/cua/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/typescript/cua/package.json b/templates/typescript/cua/package.json index cbe2983..1ff128c 100644 --- a/templates/typescript/cua/package.json +++ b/templates/typescript/cua/package.json @@ -2,7 +2,7 @@ "type": "module", "private": true, "dependencies": { - "@onkernel/sdk": "^0.5.0", + "@onkernel/sdk": ">=0.6.0", "@types/node": "^24.0.3", "dotenv": "^16.5.0", "openai": "^5.5.1", From d9499d99e407947d59332973b3d546c30fe04906 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 19:16:42 +0100 Subject: [PATCH 05/13] kernel sdk versions * --- templates/typescript/advanced-sample/package.json | 2 +- templates/typescript/computer-use/package.json | 2 +- templates/typescript/cua/package.json | 2 +- templates/typescript/sample-app/package.json | 2 +- templates/typescript/stagehand/package.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/templates/typescript/advanced-sample/package.json 
b/templates/typescript/advanced-sample/package.json index 504cccc..e94eaa2 100644 --- a/templates/typescript/advanced-sample/package.json +++ b/templates/typescript/advanced-sample/package.json @@ -7,7 +7,7 @@ "typescript": "^5" }, "dependencies": { - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "playwright": "^1.52.0" } } diff --git a/templates/typescript/computer-use/package.json b/templates/typescript/computer-use/package.json index 61a3f5a..fa8b15e 100644 --- a/templates/typescript/computer-use/package.json +++ b/templates/typescript/computer-use/package.json @@ -7,7 +7,7 @@ "typescript": "^5" }, "dependencies": { - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "playwright": "^1.52.0", "@anthropic-ai/sdk": "0.52.0", "luxon": "3.6.0" diff --git a/templates/typescript/cua/package.json b/templates/typescript/cua/package.json index 1ff128c..70b296c 100644 --- a/templates/typescript/cua/package.json +++ b/templates/typescript/cua/package.json @@ -2,7 +2,7 @@ "type": "module", "private": true, "dependencies": { - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "@types/node": "^24.0.3", "dotenv": "^16.5.0", "openai": "^5.5.1", diff --git a/templates/typescript/sample-app/package.json b/templates/typescript/sample-app/package.json index 504cccc..e94eaa2 100644 --- a/templates/typescript/sample-app/package.json +++ b/templates/typescript/sample-app/package.json @@ -7,7 +7,7 @@ "typescript": "^5" }, "dependencies": { - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "playwright": "^1.52.0" } } diff --git a/templates/typescript/stagehand/package.json b/templates/typescript/stagehand/package.json index 92fa508..81a3af5 100644 --- a/templates/typescript/stagehand/package.json +++ b/templates/typescript/stagehand/package.json @@ -8,7 +8,7 @@ }, "dependencies": { "@browserbasehq/stagehand": "^2.2.1", - "@onkernel/sdk": ">=0.6.0", + "@onkernel/sdk": "^0.6.0", "zod": "^3.25.7" } } From b0504ef994fb94708d041d588df4ec8e4bc870bb Mon Sep 17 
00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 19:18:28 +0100 Subject: [PATCH 06/13] cleanup --- templates/typescript/cua/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/templates/typescript/cua/index.ts b/templates/typescript/cua/index.ts index 135f32c..e32a090 100644 --- a/templates/typescript/cua/index.ts +++ b/templates/typescript/cua/index.ts @@ -1,6 +1,5 @@ import "dotenv/config"; import { Kernel, type KernelContext } from "@onkernel/sdk"; -import { chromium } from "playwright"; import { Agent } from "./lib/agent"; import computers from "./lib/computers"; From e68bf6d579a19904f72488e320978f1f6477f836 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Thu, 19 Jun 2025 20:40:46 +0100 Subject: [PATCH 07/13] added python-cua + updated readme & cli --- README.md | 7 +- index.ts | 20 ++- templates/python/cua/README.md | 7 + templates/python/cua/__init__.py | 0 templates/python/cua/_gitignore | 4 + templates/python/cua/agent/__init__.py | 1 + templates/python/cua/agent/agent.py | 170 ++++++++++++++++++ templates/python/cua/computers/__init__.py | 11 ++ templates/python/cua/computers/computer.py | 29 +++ templates/python/cua/computers/config.py | 7 + .../python/cua/computers/contrib/__init__.py | 0 .../python/cua/computers/default/__init__.py | 2 + .../python/cua/computers/default/kernel.py | 55 ++++++ .../cua/computers/default/local_playwright.py | 53 ++++++ .../python/cua/computers/shared/__init__.py | 0 .../cua/computers/shared/base_playwright.py | 154 ++++++++++++++++ templates/python/cua/main.py | 94 ++++++++++ templates/python/cua/pyproject.toml | 29 +++ templates/python/cua/utils.py | 76 ++++++++ 19 files changed, 710 insertions(+), 9 deletions(-) create mode 100644 templates/python/cua/README.md create mode 100644 templates/python/cua/__init__.py create mode 100644 templates/python/cua/_gitignore create mode 100644 templates/python/cua/agent/__init__.py create mode 100644 templates/python/cua/agent/agent.py create mode 100644 
templates/python/cua/computers/__init__.py create mode 100644 templates/python/cua/computers/computer.py create mode 100644 templates/python/cua/computers/config.py create mode 100644 templates/python/cua/computers/contrib/__init__.py create mode 100644 templates/python/cua/computers/default/__init__.py create mode 100644 templates/python/cua/computers/default/kernel.py create mode 100644 templates/python/cua/computers/default/local_playwright.py create mode 100644 templates/python/cua/computers/shared/__init__.py create mode 100644 templates/python/cua/computers/shared/base_playwright.py create mode 100644 templates/python/cua/main.py create mode 100644 templates/python/cua/pyproject.toml create mode 100644 templates/python/cua/utils.py diff --git a/README.md b/README.md index b348fd8..f4d42e6 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ create-kernel-app [app-name] [options] - `stagehand`: Template with Stagehand SDK (Typescript only) - `advanced-sample`: Implements sample apps using advanced Kernel configs - `computer-use`: Implements a prompt loop using Anthropic Computer Use - - `cua`: Implements a Computer Use Agent (OpenAI CUA) sample (Typescript only) + - `cua`: Implements a Computer Use Agent (OpenAI CUA) sample ### Examples @@ -124,7 +124,10 @@ kernel invoke python-basic get-page-title --payload '{"url": "https://www.google kernel invoke python-bu bu-task --payload '{"task": "Compare the price of gpt-4o and DeepSeek-V3"}' # Typescript + CUA Sample -kernel invoke ts-cua cua-task --payload '{"task": "open hackernews and get the top 5 articles"}' +kernel invoke ts-cua cua-task --payload '{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}' + +# Python + CUA Sample +kernel invoke python-cua cua-task --payload '{"task": "Get current market price range for an unboxed Dreamcast"}' ``` ## Sample apps reference diff --git a/index.ts b/index.ts index 80850e1..1eaed85 100644 --- a/index.ts +++ b/index.ts @@ -35,7 +35,7 @@ const 
TEMPLATE_BROWSER_USE = "browser-use"; const TEMPLATE_STAGEHAND = "stagehand"; const TEMPLATE_ADVANCED_SAMPLE = "advanced-sample"; const TEMPLATE_COMPUTER_USE = "computer-use"; -const TEMPLATE_CUA_SAMPLE = "cua"; +const TEMPLATE_CUA = "cua"; const LANGUAGE_SHORTHAND_TS = "ts"; const LANGUAGE_SHORTHAND_PY = "py"; @@ -75,10 +75,10 @@ const TEMPLATES: Record = { description: "Implements the Anthropic Computer Use SDK", languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON], }, - [TEMPLATE_CUA_SAMPLE]: { + [TEMPLATE_CUA]: { name: "CUA Sample", description: "Implements a Computer Use Agent (OpenAI CUA) sample", - languages: [LANGUAGE_TYPESCRIPT], + languages: [LANGUAGE_TYPESCRIPT, LANGUAGE_PYTHON], }, }; @@ -95,8 +95,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke ts-advanced test-captcha-solver', [TEMPLATE_COMPUTER_USE]: 'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', - [TEMPLATE_CUA_SAMPLE]: - 'kernel invoke ts-cua cua-task --payload \'{"query": "open hackernews and get the top 5 articles"}\'', + [TEMPLATE_CUA]: + 'kernel invoke ts-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -107,6 +107,8 @@ const INVOKE_SAMPLES: Record< 'kernel invoke python-advanced test-captcha-solver', [TEMPLATE_COMPUTER_USE]: 'kernel invoke python-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', + [TEMPLATE_CUA]: + 'kernel invoke python-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', }, }; @@ -123,7 +125,7 @@ const REGISTERED_APP_NAMES: Record< 'ts-advanced', [TEMPLATE_COMPUTER_USE]: 'ts-cu', - [TEMPLATE_CUA_SAMPLE]: + [TEMPLATE_CUA]: 'ts-cua', }, [LANGUAGE_PYTHON]: { @@ -135,6 +137,8 @@ const REGISTERED_APP_NAMES: Record< 'python-advanced', [TEMPLATE_COMPUTER_USE]: 'python-cu', +
[TEMPLATE_CUA]: + 'python-cua', }, }; @@ -365,7 +369,7 @@ function printNextSteps( ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_COMPUTER_USE ? "kernel deploy index.ts --env ANTHROPIC_API_KEY=XXX" - : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA_SAMPLE + : language === LANGUAGE_TYPESCRIPT && template === TEMPLATE_CUA ? "kernel deploy index.ts --env OPENAI_API_KEY=XXX" : language === LANGUAGE_PYTHON && (template === TEMPLATE_SAMPLE_APP || template === TEMPLATE_ADVANCED_SAMPLE) ? "kernel deploy main.py" @@ -373,6 +377,8 @@ function printNextSteps( ? "kernel deploy main.py --env OPENAI_API_KEY=XXX" : language === LANGUAGE_PYTHON && template === TEMPLATE_COMPUTER_USE ? "kernel deploy main.py --env ANTHROPIC_API_KEY=XXX" + : language === LANGUAGE_PYTHON && template === TEMPLATE_CUA + ? "kernel deploy main.py --env OPENAI_API_KEY=XXX" : ""; console.log( diff --git a/templates/python/cua/README.md b/templates/python/cua/README.md new file mode 100644 index 0000000..03d2dd7 --- /dev/null +++ b/templates/python/cua/README.md @@ -0,0 +1,7 @@ +# Kernel Python Sample App - CUA + +This is a Kernel application that demonstrates using the Computer Using Agent (CUA) from OpenAI. + +It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. + +See the [docs](https://docs.onkernel.com/quickstart) for more information. 
\ No newline at end of file diff --git a/templates/python/cua/__init__.py b/templates/python/cua/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/templates/python/cua/_gitignore b/templates/python/cua/_gitignore new file mode 100644 index 0000000..0ab378f --- /dev/null +++ b/templates/python/cua/_gitignore @@ -0,0 +1,4 @@ +__pycache__/ +.env +.venv/ +env/ \ No newline at end of file diff --git a/templates/python/cua/agent/__init__.py b/templates/python/cua/agent/__init__.py new file mode 100644 index 0000000..d2361b7 --- /dev/null +++ b/templates/python/cua/agent/__init__.py @@ -0,0 +1 @@ +from .agent import Agent diff --git a/templates/python/cua/agent/agent.py b/templates/python/cua/agent/agent.py new file mode 100644 index 0000000..d7f4267 --- /dev/null +++ b/templates/python/cua/agent/agent.py @@ -0,0 +1,170 @@ +from computers import Computer +from utils import ( + create_response, + show_image, + pp, + sanitize_message, + check_blocklisted_url, +) +import json +from typing import Callable + + +class Agent: + """ + A sample agent class that can be used to interact with a computer. + + (See simple_cua_loop.py for a simple example without an agent.) 
+ """ + + def __init__( + self, + model="computer-use-preview", + computer: Computer = None, + tools: list[dict] = [], + acknowledge_safety_check_callback: Callable = lambda message: False, + ): + self.model = model + self.computer = computer + self.tools = list(tools) # copy: the list is extended below, and the default [] is shared across instances + self.print_steps = True + self.debug = False + self.show_images = False + self.acknowledge_safety_check_callback = acknowledge_safety_check_callback + + if computer: + dimensions = computer.get_dimensions() + self.tools += [ + { + "type": "computer-preview", + "display_width": dimensions[0], + "display_height": dimensions[1], + "environment": computer.get_environment(), + }, + { + "type": "function", + "name": "back", + "description": "Go back to the previous page.", + "parameters": {}, + }, + { + "type": "function", + "name": "goto", + "description": "Go to a specific URL.", + "parameters": { + "type": "object", + "properties": { + "url": { + "type": "string", + "description": "Fully qualified URL to navigate to.", + }, + }, + "additionalProperties": False, + "required": ["url"], + }, + }, + { + "type": "function", + "name": "forward", + "description": "Go forward to the next page.", + "parameters": {}, + }, + ] + + def debug_print(self, *args): + if self.debug: + pp(*args) + + def handle_item(self, item): + """Handle each item; may cause a computer action + screenshot.""" + if item["type"] == "message": + if self.print_steps: + print(item["content"][0]["text"]) + + if item["type"] == "function_call": + name, args = item["name"], json.loads(item["arguments"]) + if self.print_steps: + print(f"{name}({args})") + + if hasattr(self.computer, name): # if function exists on computer, call it + method = getattr(self.computer, name) + method(**args) + return [ + { + "type": "function_call_output", + "call_id": item["call_id"], + "output": "success", # hard-coded output for demo + } + ] + + if item["type"] == "computer_call": + action = item["action"] + action_type = action["type"] + action_args = {k: v for k, v
in action.items() if k != "type"} + if self.print_steps: + print(f"{action_type}({action_args})") + + method = getattr(self.computer, action_type) + method(**action_args) + + screenshot_base64 = self.computer.screenshot() + if self.show_images: + show_image(screenshot_base64) + + # if user doesn't ack all safety checks exit with error + pending_checks = item.get("pending_safety_checks", []) + for check in pending_checks: + message = check["message"] + if not self.acknowledge_safety_check_callback(message): + raise ValueError( + f"Safety check failed: {message}. Cannot continue with unacknowledged safety checks." + ) + + call_output = { + "type": "computer_call_output", + "call_id": item["call_id"], + "acknowledged_safety_checks": pending_checks, + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + }, + } + + # additional URL safety checks for browser environments + if self.computer.get_environment() == "browser": + current_url = self.computer.get_current_url() + check_blocklisted_url(current_url) + call_output["output"]["current_url"] = current_url + + return [call_output] + return [] + + def run_full_turn( + self, input_items, print_steps=True, debug=False, show_images=False + ): + self.print_steps = print_steps + self.debug = debug + self.show_images = show_images + new_items = [] + + # keep looping until we get a final response + while new_items[-1].get("role") != "assistant" if new_items else True: + self.debug_print([sanitize_message(msg) for msg in input_items + new_items]) + + response = create_response( + model=self.model, + input=input_items + new_items, + tools=self.tools, + truncation="auto", + ) + self.debug_print(response) + + if "output" not in response and self.debug: + print(response) + raise ValueError("No output from model") + else: + new_items += response["output"] + for item in response["output"]: + new_items += self.handle_item(item) + + return new_items diff --git 
a/templates/python/cua/computers/__init__.py b/templates/python/cua/computers/__init__.py new file mode 100644 index 0000000..0e8c132 --- /dev/null +++ b/templates/python/cua/computers/__init__.py @@ -0,0 +1,11 @@ +from . import default +from . import contrib +from .computer import Computer +from .config import computers_config + +__all__ = [ + "default", + "contrib", + "Computer", + "computers_config", +] diff --git a/templates/python/cua/computers/computer.py b/templates/python/cua/computers/computer.py new file mode 100644 index 0000000..8098650 --- /dev/null +++ b/templates/python/cua/computers/computer.py @@ -0,0 +1,29 @@ +from typing import Protocol, List, Literal, Dict + + +class Computer(Protocol): + """Defines the 'shape' (methods/properties) our loop expects.""" + + def get_environment(self) -> Literal["windows", "mac", "linux", "browser"]: ... + + def get_dimensions(self) -> tuple[int, int]: ... + + def screenshot(self) -> str: ... + + def click(self, x: int, y: int, button: str = "left") -> None: ... + + def double_click(self, x: int, y: int) -> None: ... + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: ... + + def type(self, text: str) -> None: ... + + def wait(self, ms: int = 1000) -> None: ... + + def move(self, x: int, y: int) -> None: ... + + def keypress(self, keys: List[str]) -> None: ... + + def drag(self, path: List[Dict[str, int]]) -> None: ... + + def get_current_url(self) -> str: ... # instance method like the rest of the protocol; 'self' was missing
diff --git a/templates/python/cua/computers/config.py b/templates/python/cua/computers/config.py new file mode 100644 index 0000000..4bf314c --- /dev/null +++ b/templates/python/cua/computers/config.py @@ -0,0 +1,7 @@ +from .default import * +from .contrib import * + +computers_config = { + "local-playwright": LocalPlaywrightBrowser, + "kernel": KernelPlaywrightBrowser, +} diff --git a/templates/python/cua/computers/contrib/__init__.py b/templates/python/cua/computers/contrib/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/templates/python/cua/computers/default/__init__.py b/templates/python/cua/computers/default/__init__.py new file mode 100644 index 0000000..5e168f7 --- /dev/null +++ b/templates/python/cua/computers/default/__init__.py @@ -0,0 +1,2 @@ +from .local_playwright import LocalPlaywrightBrowser +from .kernel import KernelPlaywrightBrowser diff --git a/templates/python/cua/computers/default/kernel.py b/templates/python/cua/computers/default/kernel.py new file mode 100644 index 0000000..cf28022 --- /dev/null +++ b/templates/python/cua/computers/default/kernel.py @@ -0,0 +1,55 @@ +from playwright.sync_api import Browser, Page +from ..shared.base_playwright import BasePlaywrightComputer + +class KernelPlaywrightBrowser(BasePlaywrightComputer): + """ + Connects to a remote Chromium instance using a provided CDP URL. + Expects a dict as input: {'cdp_ws_url': ..., 'width': ..., 'height': ...} + Width and height are optional, defaulting to 1024x768. 
+ """ + + def __init__(self, config: dict): + super().__init__() + self.cdp_ws_url = config.get("cdp_ws_url") + if not self.cdp_ws_url: + raise ValueError("cdp_ws_url must be provided in config dict") + self.width = config.get("width", 1024) + self.height = config.get("height", 768) + self.dimensions = (self.width, self.height) + + def get_dimensions(self): + return self.dimensions + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + # Connect to the remote browser using the CDP URL + browser = self._playwright.chromium.connect_over_cdp(self.cdp_ws_url) + # Use the first context or create one if none exists + if browser.contexts: + context = browser.contexts[0] + else: + context = browser.new_context() + # Add event listeners for page creation and closure + context.on("page", self._handle_new_page) + # Create a new page and set viewport + page = context.pages[0] if context.pages else context.new_page() + page.set_viewport_size({"width": self.width, "height": self.height}) + page.on("close", self._handle_page_close) + # Optionally, navigate to a default page + # page.goto("about:blank") + return browser, page + + def _handle_new_page(self, page: Page): + """Handle the creation of a new page.""" + print("New page created") + self._page = page + page.on("close", self._handle_page_close) + + def _handle_page_close(self, page: Page): + """Handle the closure of a page.""" + print("Page closed") + if hasattr(self, "_browser") and self._page == page: + if self._browser.contexts[0].pages: + self._page = self._browser.contexts[0].pages[-1] + else: + print("Warning: All pages have been closed.") + self._page = None diff --git a/templates/python/cua/computers/default/local_playwright.py b/templates/python/cua/computers/default/local_playwright.py new file mode 100644 index 0000000..aab3355 --- /dev/null +++ b/templates/python/cua/computers/default/local_playwright.py @@ -0,0 +1,53 @@ +from playwright.sync_api import Browser, Page +from ..shared.base_playwright import 
BasePlaywrightComputer + + +class LocalPlaywrightBrowser(BasePlaywrightComputer): + """Launches a local Chromium instance using Playwright.""" + + def __init__(self, headless: bool = False): + super().__init__() + self.headless = headless + + def _get_browser_and_page(self) -> tuple[Browser, Page]: + width, height = self.get_dimensions() + launch_args = [ + f"--window-size={width},{height}", + "--disable-extensions", + "--disable-file-system", + ] + browser = self._playwright.chromium.launch( + chromium_sandbox=True, + headless=self.headless, + args=launch_args, + env={"DISPLAY": ":0"}, + ) + + context = browser.new_context() + + # Add event listeners for page creation and closure + context.on("page", self._handle_new_page) + + page = context.new_page() + page.set_viewport_size({"width": width, "height": height}) + page.on("close", self._handle_page_close) + + # page.goto("about:blank") + + return browser, page + + def _handle_new_page(self, page: Page): + """Handle the creation of a new page.""" + print("New page created") + self._page = page + page.on("close", self._handle_page_close) + + def _handle_page_close(self, page: Page): + """Handle the closure of a page.""" + print("Page closed") + if self._page == page: + if self._browser.contexts[0].pages: + self._page = self._browser.contexts[0].pages[-1] + else: + print("Warning: All pages have been closed.") + self._page = None diff --git a/templates/python/cua/computers/shared/__init__.py b/templates/python/cua/computers/shared/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/templates/python/cua/computers/shared/base_playwright.py b/templates/python/cua/computers/shared/base_playwright.py new file mode 100644 index 0000000..0c38e24 --- /dev/null +++ b/templates/python/cua/computers/shared/base_playwright.py @@ -0,0 +1,154 @@ +import time +import base64 +from typing import List, Dict, Literal +from playwright.sync_api import sync_playwright, Browser, Page +from utils import 
check_blocklisted_url + +# Optional: key mapping if your model uses "CUA" style keys +CUA_KEY_TO_PLAYWRIGHT_KEY = { + "/": "Divide", + "\\": "Backslash", + "alt": "Alt", + "arrowdown": "ArrowDown", + "arrowleft": "ArrowLeft", + "arrowright": "ArrowRight", + "arrowup": "ArrowUp", + "backspace": "Backspace", + "capslock": "CapsLock", + "cmd": "Meta", + "ctrl": "Control", + "delete": "Delete", + "end": "End", + "enter": "Enter", + "esc": "Escape", + "home": "Home", + "insert": "Insert", + "option": "Alt", + "pagedown": "PageDown", + "pageup": "PageUp", + "shift": "Shift", + "space": " ", + "super": "Meta", + "tab": "Tab", + "win": "Meta", +} + + +class BasePlaywrightComputer: + """ + Abstract base for Playwright-based computers: + + - Subclasses override `_get_browser_and_page()` to do local or remote connection, + returning (Browser, Page). + - This base class handles context creation (`__enter__`/`__exit__`), + plus standard "Computer" actions like click, scroll, etc. + - We also have extra browser actions: `goto(url)` and `back()`. 
+ """ + + def get_environment(self): + return "browser" + + def get_dimensions(self): + return (1024, 768) + + def __init__(self): + self._playwright = None + self._browser: Browser | None = None + self._page: Page | None = None + + def __enter__(self): + # Start Playwright and call the subclass hook for getting browser/page + self._playwright = sync_playwright().start() + self._browser, self._page = self._get_browser_and_page() + + # Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS + def handle_route(route, request): + + url = request.url + if check_blocklisted_url(url): + print(f"Flagging blocked domain: {url}") + route.abort() + else: + route.continue_() + + self._page.route("**/*", handle_route) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self._browser: + self._browser.close() + if self._playwright: + self._playwright.stop() + + def get_current_url(self) -> str: + return self._page.url + + # --- Common "Computer" actions --- + def screenshot(self) -> str: + """Capture only the viewport (not full_page).""" + png_bytes = self._page.screenshot(full_page=False) + return base64.b64encode(png_bytes).decode("utf-8") + + def click(self, x: int, y: int, button: str = "left") -> None: + match button: + case "back": + self.back() + case "forward": + self.forward() + case "wheel": + self._page.mouse.wheel(x, y) + case _: + button_mapping = {"left": "left", "right": "right"} + button_type = button_mapping.get(button, "left") + self._page.mouse.click(x, y, button=button_type) + + def double_click(self, x: int, y: int) -> None: + self._page.mouse.dblclick(x, y) + + def scroll(self, x: int, y: int, scroll_x: int, scroll_y: int) -> None: + self._page.mouse.move(x, y) + self._page.evaluate(f"window.scrollBy({scroll_x}, {scroll_y})") + + def type(self, text: str) -> None: + self._page.keyboard.type(text) + + def wait(self, ms: int = 1000) -> None: + time.sleep(ms / 1000) + + def move(self, x: int, y: int) -> None: + 
self._page.mouse.move(x, y) + + def keypress(self, keys: List[str]) -> None: + mapped_keys = [CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in keys] + for key in mapped_keys: + self._page.keyboard.down(key) + for key in reversed(mapped_keys): + self._page.keyboard.up(key) + + def drag(self, path: List[Dict[str, int]]) -> None: + if not path: + return + self._page.mouse.move(path[0]["x"], path[0]["y"]) + self._page.mouse.down() + for point in path[1:]: + self._page.mouse.move(point["x"], point["y"]) + self._page.mouse.up() + + # --- Extra browser-oriented actions --- + def goto(self, url: str) -> None: + try: + return self._page.goto(url) + except Exception as e: + print(f"Error navigating to {url}: {e}") + + def back(self) -> None: + return self._page.go_back() + + def forward(self) -> None: + return self._page.go_forward() + + # --- Subclass hook --- + def _get_browser_and_page(self) -> tuple[Browser, Page]: + """Subclasses must implement, returning (Browser, Page).""" + raise NotImplementedError diff --git a/templates/python/cua/main.py b/templates/python/cua/main.py new file mode 100644 index 0000000..6ec301a --- /dev/null +++ b/templates/python/cua/main.py @@ -0,0 +1,94 @@ +import os +from typing import TypedDict +import kernel +from kernel import Kernel +from computers.default import KernelPlaywrightBrowser +from agent import Agent +import datetime +import asyncio + +""" +Example app that runs an agent using openai CUA +Args: + ctx: Kernel context containing invocation information + payload: An object with a `task` property +Returns: + An answer to the task, elapsed time and optionally the messages stack +Invoke this via CLI: + export KERNEL_API_KEY= + kernel deploy main.py -e OPENAI_API_KEY=XXXXX --force + kernel invoke python-cua cua-task -p '{"task":"go to https://news.ycombinator.com and list top 5 articles"}' + kernel logs python-cua -f # Open in separate tab +""" + +class CuaInput(TypedDict): + task: str + +class CuaOutput(TypedDict): + result: 
str + +api_key = os.getenv("OPENAI_API_KEY") +if not api_key: + raise ValueError("OPENAI_API_KEY is not set") + +client = Kernel() +app = kernel.App("python-cua") + +@app.action("cua-task") +async def cua_task( + ctx: kernel.KernelContext, + payload: CuaInput, +) -> CuaOutput: + # A function that processes a user task using the kernel browser and agent + + if not payload or not payload.get("task"): + raise ValueError("task is required") + + kernel_browser = await asyncio.to_thread( + client.browsers.create, invocation_id=ctx.invocation_id, stealth=True + ) + print("Kernel browser live view url: ", kernel_browser.browser_live_view_url) + cdp_ws_url = kernel_browser.cdp_ws_url + + def run_agent(): + with KernelPlaywrightBrowser({"cdp_ws_url": cdp_ws_url}) as computer: + + # messages to provide to the agent + items = [ + { + "role": "system", + "content": f"- Current date and time: {datetime.datetime.utcnow().isoformat()} ({datetime.datetime.utcnow().strftime('%A')})", + }, + { + "role": "user", + "content": payload["task"] + } + ] + + # setup the agent + agent = Agent( + computer=computer, + tools=[], # can provide additional tools to the agent + acknowledge_safety_check_callback=lambda message: (print(f"> agent : safety check message (skipping): {message}") or True) # safety check function , now defaults to true + ) + + # run the agent + response_items = agent.run_full_turn( + items, + debug=True, + show_images=False, + ) + + if not response_items or "content" not in response_items[-1]: + raise ValueError("No response from agent") + # The content may be a list of blocks, get the first text block + content = response_items[-1]["content"] + if isinstance(content, list) and content and isinstance(content[0], dict) and "text" in content[0]: + result = content[0]["text"] + elif isinstance(content, str): + result = content + else: + result = str(content) + return {"result": result} + + return await asyncio.to_thread(run_agent) diff --git 
a/templates/python/cua/pyproject.toml b/templates/python/cua/pyproject.toml new file mode 100644 index 0000000..7115077 --- /dev/null +++ b/templates/python/cua/pyproject.toml @@ -0,0 +1,29 @@ +[project] +name = "python-cua" +version = "0.1.0" +description = "Kernel sample app for CUA" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "annotated-types==0.7.0", + "anyio==4.8.0", + "certifi==2025.1.31", + "charset-normalizer==3.4.1", + "distro==1.9.0", + "greenlet==3.1.1", + "h11==0.14.0", + "httpcore==1.0.7", + "httpx==0.28.1", + "idna==3.10", + "jiter==0.8.2", + "pillow==11.1.0", + "playwright==1.50.0", + "pydantic==2.10.6", + "pydantic_core==2.27.2", + "pyee==12.1.1", + "python-dotenv==1.0.1", + "requests==2.32.3", + "sniffio==1.3.1", + "typing_extensions==4.12.2", + "urllib3==2.3.0", +] diff --git a/templates/python/cua/utils.py b/templates/python/cua/utils.py new file mode 100644 index 0000000..b17ee81 --- /dev/null +++ b/templates/python/cua/utils.py @@ -0,0 +1,76 @@ +import os +import requests +from dotenv import load_dotenv +import json +import base64 +from PIL import Image +from io import BytesIO +import io +from urllib.parse import urlparse + +load_dotenv(override=True) + +BLOCKED_DOMAINS = [ + "maliciousbook.com", + "evilvideos.com", + "darkwebforum.com", + "shadytok.com", + "suspiciouspins.com", + "ilanbigio.com", +] + + +def pp(obj): + print(json.dumps(obj, indent=4)) + + +def show_image(base_64_image): + image_data = base64.b64decode(base_64_image) + image = Image.open(BytesIO(image_data)) + image.show() + + +def calculate_image_dimensions(base_64_image): + image_data = base64.b64decode(base_64_image) + image = Image.open(io.BytesIO(image_data)) + return image.size + + +def sanitize_message(msg: dict) -> dict: + """Return a copy of the message with image_url omitted for computer_call_output messages.""" + if msg.get("type") == "computer_call_output": + output = msg.get("output", {}) + if isinstance(output, dict): + sanitized = 
msg.copy() + sanitized["output"] = {**output, "image_url": "[omitted]"} + return sanitized + return msg + + +def create_response(**kwargs): + url = "https://api.openai.com/v1/responses" + headers = { + "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}", + "Content-Type": "application/json" + } + + openai_org = os.getenv("OPENAI_ORG") + if openai_org: + headers["Openai-Organization"] = openai_org + + response = requests.post(url, headers=headers, json=kwargs) + + if response.status_code != 200: + print(f"Error: {response.status_code} {response.text}") + + return response.json() + + +def check_blocklisted_url(url: str) -> None: + """Raise ValueError if the given URL (including subdomains) is in the blocklist.""" + hostname = urlparse(url).hostname or "" + if any( + hostname == blocked or hostname.endswith(f".{blocked}") + for blocked in BLOCKED_DOMAINS + ): + raise ValueError(f"Blocked URL: {url}") From 3031c966d5592bbc64d9397e7639165f7b41f8d6 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Mon, 23 Jun 2025 22:07:08 +0100 Subject: [PATCH 08/13] ts-cua: lint & openai types update --- templates/typescript/cua/.env.example | 2 + templates/typescript/cua/.eslintrc.cjs | 10 + templates/typescript/cua/.prettierrc | 2 +- templates/typescript/cua/README.md | 2 +- templates/typescript/cua/index.ts | 168 ++++--- templates/typescript/cua/lib/agent.ts | 423 ++++++++--------- templates/typescript/cua/lib/computers.ts | 45 +- .../typescript/cua/lib/playwright/base.ts | 448 +++++++++--------- .../typescript/cua/lib/playwright/kernel.ts | 103 ++-- .../typescript/cua/lib/playwright/local.ts | 104 ++-- templates/typescript/cua/lib/toolset.ts | 72 +-- templates/typescript/cua/lib/utils.ts | 112 ++--- templates/typescript/cua/package.json | 35 +- templates/typescript/cua/test.local.ts | 41 ++ templates/typescript/cua/tsconfig.json | 42 +- 15 files changed, 779 insertions(+), 830 deletions(-) create mode 100644 templates/typescript/cua/.env.example create mode 100644 
templates/typescript/cua/.eslintrc.cjs create mode 100644 templates/typescript/cua/test.local.ts diff --git a/templates/typescript/cua/.env.example b/templates/typescript/cua/.env.example new file mode 100644 index 0000000..4270afa --- /dev/null +++ b/templates/typescript/cua/.env.example @@ -0,0 +1,2 @@ +OPENAI_API_KEY=YOUR_OPENAI_API_KEY +# KERNEL_API_KEY=YOUR_KERNEL_KEY \ No newline at end of file diff --git a/templates/typescript/cua/.eslintrc.cjs b/templates/typescript/cua/.eslintrc.cjs new file mode 100644 index 0000000..d9a7fb7 --- /dev/null +++ b/templates/typescript/cua/.eslintrc.cjs @@ -0,0 +1,10 @@ +module.exports = { + parser: '@typescript-eslint/parser', + parserOptions: { project: './tsconfig.json', sourceType: 'module' }, + plugins: ['@typescript-eslint'], + extends: ['eslint:recommended', 'plugin:@typescript-eslint/recommended', 'prettier'], + rules: { + '@typescript-eslint/no-explicit-any': 'error', + 'no-console': 'off', + }, +}; diff --git a/templates/typescript/cua/.prettierrc b/templates/typescript/cua/.prettierrc index 3ee282f..ca8527e 100644 --- a/templates/typescript/cua/.prettierrc +++ b/templates/typescript/cua/.prettierrc @@ -4,4 +4,4 @@ "singleQuote": true, "printWidth": 100, "tabWidth": 2 -} \ No newline at end of file +} diff --git a/templates/typescript/cua/README.md b/templates/typescript/cua/README.md index 0cb2dfe..c196bcd 100644 --- a/templates/typescript/cua/README.md +++ b/templates/typescript/cua/README.md @@ -5,4 +5,4 @@ This is a Kernel application that demonstrates using the Computer Using Agent (C It generally follows the [OpenAI CUA Sample App Reference](https://github.com/openai/openai-cua-sample-app) and uses Playwright via Kernel for browser automation. Also makes use of the latest OpenAI SDK format, and has local equivalent to Kernel methods for local testing before deploying on Kernel. -See the [docs](https://docs.onkernel.com/quickstart) for information. 
\ No newline at end of file +See the [docs](https://docs.onkernel.com/quickstart) for information. diff --git a/templates/typescript/cua/index.ts b/templates/typescript/cua/index.ts index e32a090..cbf9136 100644 --- a/templates/typescript/cua/index.ts +++ b/templates/typescript/cua/index.ts @@ -1,14 +1,24 @@ -import "dotenv/config"; -import { Kernel, type KernelContext } from "@onkernel/sdk"; -import { Agent } from "./lib/agent"; -import computers from "./lib/computers"; +import 'dotenv/config'; +import { Kernel, type KernelContext } from '@onkernel/sdk'; +import { Agent } from './lib/agent'; +import computers from './lib/computers'; +import type { ResponseOutputMessage, ResponseItem } from 'openai/resources/responses/responses'; + +interface CuaInput { + task: string; +} +interface CuaOutput { + elapsed: number; + answer: string | null; + logs?: ResponseItem[]; +} const kernel = new Kernel(); -const app = kernel.app("ts-cua"); +const app = kernel.app('ts-cua'); -// LLM API Keys are set in the environment during `kernel deploy -e ANTHROPIC_API_KEY=XXX` -// See https://docs.onkernel.com/launch/deploy#environment-variables -if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); +if (!process.env.OPENAI_API_KEY) { + throw new Error('OPENAI_API_KEY is not set'); +} /** * Example app that run an agent using openai CUA @@ -20,92 +30,78 @@ if (!process.env.OPENAI_API_KEY) throw new Error('OPENAI_API_KEY is not set'); * Invoke this via CLI: * export KERNEL_API_KEY= * kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force - * kernel invoke ts-cua cua-task -p "{\"task\":\"current market price range for a used dreamcast\"}" + * kernel invoke ts-cua cua-task -p "{\"task\":\"go to ebay.com and find the current market price range for a dreamcast\"}" * kernel logs ts-cua -f # Open in separate tab */ -interface CuaInput { - task: string; -} - -interface CuaOutput { - elapsed: number; - response?: Array; - answer: object; -} - app.action( - "cua-task", - 
async (ctx: KernelContext, payload?: CuaInput): Promise => { - const startTime = Date.now(); - const kernelBrowser = await kernel.browsers.create({ - invocation_id: ctx.invocation_id, - }); - console.log( - "> Kernel browser live view url: ", - kernelBrowser.browser_live_view_url, - ); - - if (!payload?.task){ - throw new Error('task is required'); - } - - try { + 'cua-task', + async (ctx: KernelContext, payload?: CuaInput): Promise => { + const start = Date.now(); + if (!payload?.task) throw new Error('task is required'); - // kernel browser - const { computer } = await computers.create({ - type: "kernel", // for local testing before deploying to Kernel, you can use type: "local" - cdp_ws_url: kernelBrowser.cdp_ws_url, - }); + try { + const kb = await kernel.browsers.create({ invocation_id: ctx.invocation_id }); + console.log('> Kernel browser live view url:', kb.browser_live_view_url); - // setup agent - const agent = new Agent({ - model: "computer-use-preview", - computer, - tools: [], // additional function_call tools to provide to the llm - acknowledge_safety_check_callback: (message: string) => { - console.log(`> safety check: ${message}`); - return true; // Auto-acknowledge all safety checks for testing - }, - }); + const { computer } = await computers.create({ type: 'kernel', cdp_ws_url: kb.cdp_ws_url }); + const agent = new Agent({ + model: 'computer-use-preview', + computer, + tools: [], + acknowledge_safety_check_callback: (m: string) => { + console.log(`> safety check: ${m}`); + return true; + }, + }); - // start agent run - const response = await agent.runFullTurn({ - messages: [ - { - role: "system", - content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString("en-US", { weekday: "long" })})`, - }, - { - type: "message", - role: "user", - content: [ - { - type: "input_text", - text: payload.task, - // text: "go to https://news.ycombinator.com , open top article , describe the target website design (in yaml format)" 
- }, - ], - }, - ], - print_steps: true, // log function_call and computer_call actions - debug: true, // show agent debug logs (llm messages and responses) - show_images: false, // if set to true, response messages stack will return base64 images (webp format) of screenshots, if false, replaced with "[omitted]"" - }); + // run agent and get response + const logs = await agent.runFullTurn({ + messages: [ + { + role: 'system', + content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( + 'en-US', + { weekday: 'long' }, + )})`, + }, + { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: payload.task }], + }, + ], + print_steps: true, + debug: true, + show_images: false, + }); - console.log("> agent run done"); + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); - const endTime = Date.now(); - const timeElapsed = (endTime - startTime) / 1000; // Convert to seconds + // filter only LLM messages + const messages = logs.filter( + (item): item is ResponseOutputMessage => + item.type === 'message' && + typeof (item as ResponseOutputMessage).role === 'string' && + Array.isArray((item as ResponseOutputMessage).content), + ); + const assistant = messages.find((m) => m.role === 'assistant'); + const lastContentIndex = assistant?.content?.length ? assistant.content.length - 1 : -1; + const lastContent = lastContentIndex >= 0 ? assistant?.content?.[lastContentIndex] : null; + const answer = lastContent && 'text' in lastContent ? lastContent.text : null; - return { - // response, // full messages stack trace - elapsed: parseFloat(timeElapsed.toFixed(2)), - answer: response?.slice(-1)?.[0]?.content?.[0]?.text ?? 
null, - }; - } finally { - // Note: KernelPlaywrightComputer handles browser cleanup internally - // No need to manually close browser here - } - }, + return { + // logs, // optionally, get the full agent run messages logs + elapsed, + answer, + }; + } catch (error) { + const elapsed = parseFloat(((Date.now() - start) / 1000).toFixed(2)); + console.error('Error in cua-task:', error); + return { + elapsed, + answer: null, + }; + } + }, ); diff --git a/templates/typescript/cua/lib/agent.ts b/templates/typescript/cua/lib/agent.ts index f28b298..02412a8 100644 --- a/templates/typescript/cua/lib/agent.ts +++ b/templates/typescript/cua/lib/agent.ts @@ -1,220 +1,209 @@ -import utils from "./utils"; -import computers from "./computers"; -import toolset from "./toolset"; +import { + type ResponseItem, + type ResponseInputItem, + type ResponseOutputMessage, + type ResponseFunctionToolCallItem, + type ResponseFunctionToolCallOutputItem, + type ResponseComputerToolCall, + type ResponseComputerToolCallOutputItem, + type ComputerTool, +} from 'openai/resources/responses/responses'; + +import * as utils from './utils'; +import toolset from './toolset'; +import type { BasePlaywrightComputer } from './playwright/base'; +import type { LocalPlaywrightComputer } from './playwright/local'; +import type { KernelPlaywrightComputer } from './playwright/kernel'; -import type { BasePlaywrightComputer } from "./playwright/base"; - - -interface Item { - [key: string]: any; -} - -interface Tool { - type: string; - display_width?: number; - display_height?: number; - environment?: string; - [key: string]: any; -} - -interface SafetyCheck { - message: string; - [key: string]: any; -} - -interface ComputerCallOutput { - type: string; - call_id: string; - acknowledged_safety_checks: SafetyCheck[]; - output: { - type: string; - image_url: string; - current_url?: string; - }; -} - -type AcknowledgeSafetyCheckCallback = (message: string) => boolean; - -/** - * A sample agent class that can be used to 
interact with a computer. - */ export class Agent { - private model: string; - private computer: BasePlaywrightComputer | null; - private tools: Tool[]; - private print_steps: boolean; - private debug: boolean; - private show_images: boolean; - private acknowledge_safety_check_callback: AcknowledgeSafetyCheckCallback; - - constructor({ - model = "computer-use-preview", - computer = null, - tools = [], - acknowledge_safety_check_callback = () => true, - }: { - model?: string; - computer?: BasePlaywrightComputer | null; - tools?: Tool[]; - acknowledge_safety_check_callback?: AcknowledgeSafetyCheckCallback; - }) { - this.model = model; - this.computer = computer; - this.tools = [...toolset.shared, ...tools]; - this.print_steps = true; - this.debug = false; - this.show_images = false; - this.acknowledge_safety_check_callback = acknowledge_safety_check_callback; - - if (computer) { - const dimensions = computer.getDimensions(); - this.tools.push({ - type: "computer-preview", - display_width: dimensions[0], - display_height: dimensions[1], - environment: computer.getEnvironment(), - }); - } - } - - private debugPrint(...args: any[]): void { - if (this.debug) { - console.warn("--- debug:agent:debugPrint"); - console.dir(...args, { depth: null }); - } - } - - private async handleItem(item: Item): Promise { - /**Handle each item; may cause a computer action + screenshot.*/ - if (item.type === "message") { - if (this.print_steps && item.content?.[0]?.text) { - console.log(item.content[0].text); - } - } - - if (item.type === "function_call") { - const name = item.name!; - const args = JSON.parse(item.arguments!); - if (this.print_steps) { - console.log(`${name}(${JSON.stringify(args)})`); - } - - if (this.computer && (this.computer as any)[name]) { - const method = (this.computer as any)[name]; - await method.call(this.computer, ...Object.values(args)); - } - return [ - { - type: "function_call_output", - call_id: item.call_id!, - output: "success", // hard-coded output for 
demo - }, - ]; - } - - if (item.type === "computer_call") { - const action = item.action!; - const action_type = action.type; - const action_args = Object.fromEntries( - Object.entries(action).filter(([k]) => k !== "type"), - ); - if (this.print_steps) { - console.log(`${action_type}(${JSON.stringify(action_args)})`); - } - - if (this.computer) { - const method = (this.computer as any)[action_type]; - await method.call(this.computer, ...Object.values(action_args)); - - const screenshot_base64 = await this.computer.screenshot(); - // console.dir({ debug: { screenshot_base64 }}) - - // if user doesn't ack all safety checks exit with error - const pending_checks = item.pending_safety_checks || []; - for (const check of pending_checks) { - const message = check.message; - if (!this.acknowledge_safety_check_callback(message)) { - throw new Error( - `Safety check failed: ${message}. Cannot continue with unacknowledged safety checks.`, - ); - } - } - - const call_output: ComputerCallOutput = { - type: "computer_call_output", - call_id: item.call_id!, - acknowledged_safety_checks: pending_checks, - output: { - type: "input_image", - image_url: `data:image/webp;base64,${screenshot_base64}`, - }, - }; - - // additional URL safety checks for browser environments - if (this.computer.getEnvironment() === "browser") { - const current_url = this.computer.getCurrentUrl(); - utils.checkBlocklistedUrl(current_url); - call_output.output.current_url = current_url; - } - - return [call_output]; - } - } - return []; - } - - async runFullTurn({ - messages, - print_steps = true, - debug = false, - show_images = false, - }: { - messages: Item[]; - print_steps?: boolean; - debug?: boolean; - show_images?: boolean; - }): Promise { - this.print_steps = print_steps; - this.debug = debug; - this.show_images = show_images; - const new_items: Item[] = []; - - // keep looping until we get a final response - while ( - new_items.length === 0 || - (new_items[new_items.length - 1]?.role !== 
"assistant") - ) { - this.debugPrint( - messages.concat(new_items).map((msg) => utils.sanitizeMessage(msg)), - ); - - const response = await utils.createResponse({ - model: this.model, - input: messages.concat(new_items), - tools: this.tools, - truncation: "auto", - }); - this.debugPrint(response); - - if (!response.output && this.debug) { - console.log(response); - throw new Error("No output from model"); - } else if (response.output) { - new_items.push(...response.output); - for (const item of response.output) { - const handled_items = await this.handleItem(item); - new_items.push(...handled_items); - } - } - } - - // Return sanitized messages if show_images is false - if (!show_images) { - return new_items.map((msg) => utils.sanitizeMessage(msg)); - } - - return new_items; - } + private model: string; + private computer: + | BasePlaywrightComputer + | LocalPlaywrightComputer + | KernelPlaywrightComputer + | undefined; + private tools: ComputerTool[]; + private print_steps = true; + private debug = false; + private show_images = false; + private ackCb: (msg: string) => boolean; + + constructor(opts: { + model?: string; + computer?: + | BasePlaywrightComputer + | LocalPlaywrightComputer + | KernelPlaywrightComputer + | undefined; + tools?: ComputerTool[]; + acknowledge_safety_check_callback?: (msg: string) => boolean; + }) { + this.model = opts.model ?? 'computer-use-preview'; + this.computer = opts.computer; + this.tools = [...toolset.shared, ...(opts.tools ?? [])] as ComputerTool[]; + this.ackCb = opts.acknowledge_safety_check_callback ?? 
(() => true); + + if (this.computer) { + const [w, h] = this.computer.getDimensions(); + this.tools.push({ + type: 'computer_use_preview', + display_width: w, + display_height: h, + environment: this.computer.getEnvironment(), + }); + } + } + + private debugPrint(...args: unknown[]): void { + if (this.debug) { + console.warn('--- debug:agent:debugPrint'); + try { + console.dir( + args.map((msg) => utils.sanitizeMessage(msg as ResponseItem)), + { depth: null }, + ); + } catch (e) { + console.dir(args, { depth: null }); + } + } + } + + private async handleItem(item: ResponseItem): Promise { + if (item.type === 'message' && this.print_steps) { + const msg = item as ResponseOutputMessage; + const c = msg.content; + if (Array.isArray(c) && c[0] && 'text' in c[0] && typeof c[0].text === 'string') + console.log(c[0].text); + } + + if (item.type === 'function_call') { + const fc = item as ResponseFunctionToolCallItem; + const argsObj = JSON.parse(fc.arguments) as Record; + if (this.print_steps) console.log(`${fc.name}(${JSON.stringify(argsObj)})`); + if (this.computer) { + const fn = (this.computer as unknown as Record)[fc.name]; + if (typeof fn === 'function') + await (fn as (...a: unknown[]) => unknown)(...Object.values(argsObj)); + } + return [ + { + type: 'function_call_output', + call_id: fc.call_id, + output: 'success', + } as unknown as ResponseFunctionToolCallOutputItem, + ]; + } + + if (item.type === 'computer_call') { + const cc = item as ResponseComputerToolCall; + const { type: actionType, ...actionArgs } = cc.action; + if (this.print_steps) console.log(`${actionType}(${JSON.stringify(actionArgs)})`); + if (this.computer) { + const fn = (this.computer as unknown as Record)[actionType as string]; + if (typeof fn === 'function') { + await (fn as (...a: unknown[]) => unknown)(...Object.values(actionArgs)); + const screenshot = await this.computer.screenshot(); + const pending = cc.pending_safety_checks ?? 
[]; + console.dir({ debug_agent_computer_call: cc }); + for (const { message } of pending) + if (!this.ackCb(message)) throw new Error(`Safety check failed: ${message}`); + const out: Omit = { + type: 'computer_call_output', + call_id: cc.call_id, + // id: "?", // <---- omitting to work - need to determine id source, != call_id + acknowledged_safety_checks: pending, + output: { + type: 'computer_screenshot', + image_url: `data:image/webp;base64,${screenshot}`, + }, + }; + if (this.computer.getEnvironment() === 'browser') + utils.checkBlocklistedUrl(this.computer.getCurrentUrl()); + return [out as ResponseItem]; + } + } + } + + return []; + } + + async runFullTurn(opts: { + messages: ResponseInputItem[]; + print_steps?: boolean; + debug?: boolean; + show_images?: boolean; + }): Promise { + this.print_steps = opts.print_steps ?? true; + this.debug = opts.debug ?? false; + this.show_images = opts.show_images ?? false; + const newItems: ResponseItem[] = []; + + while ( + newItems.length === 0 || + (newItems[newItems.length - 1] as ResponseItem & { role?: string }).role !== 'assistant' + ) { + // Add current URL to system message if in browser environment + const inputMessages = [...opts.messages]; + + if (this.computer?.getEnvironment() === 'browser') { + const current_url = this.computer.getCurrentUrl(); + // Find system message by checking if it has a role property with value 'system' + const sysIndex = inputMessages.findIndex((msg) => 'role' in msg && msg.role === 'system'); + + if (sysIndex >= 0) { + const msg = inputMessages[sysIndex]; + const urlInfo = `\n- Current URL: ${current_url}`; + + // Create a properly typed message based on the original + if (msg && 'content' in msg) { + if (typeof msg.content === 'string') { + // Create a new message with the updated content + const updatedMsg = { + ...msg, + content: msg.content + urlInfo, + }; + // Type assertion to ensure compatibility + inputMessages[sysIndex] = updatedMsg as typeof msg; + } else if 
(Array.isArray(msg.content) && msg.content.length > 0) { + // Handle array content case + const updatedContent = [...msg.content]; + + // Check if first item has text property + if (updatedContent[0] && 'text' in updatedContent[0]) { + updatedContent[0] = { + ...updatedContent[0], + text: updatedContent[0].text + urlInfo, + }; + } + + // Create updated message with new content + const updatedMsg = { + ...msg, + content: updatedContent, + }; + // Type assertion to ensure compatibility + inputMessages[sysIndex] = updatedMsg as typeof msg; + } + } + } + } + + this.debugPrint(...inputMessages, ...newItems); + const response = await utils.createResponse({ + model: this.model, + input: [...inputMessages, ...newItems], + tools: this.tools, + truncation: 'auto', + }); + if (!response.output) throw new Error('No output from model'); + for (const msg of response.output as ResponseItem[]) { + newItems.push(msg, ...(await this.handleItem(msg))); + } + } + + // Return sanitized messages if show_images is false + return !this.show_images + ? 
newItems.map((msg) => utils.sanitizeMessage(msg) as ResponseItem) + : newItems; + } } - -export default { Agent }; diff --git a/templates/typescript/cua/lib/computers.ts b/templates/typescript/cua/lib/computers.ts index 3c8aa47..5828fc8 100644 --- a/templates/typescript/cua/lib/computers.ts +++ b/templates/typescript/cua/lib/computers.ts @@ -1,25 +1,28 @@ -import { KernelPlaywrightComputer } from "./playwright/kernel.ts"; -import { LocalPlaywrightComputer } from "./playwright/local.ts"; +import { KernelPlaywrightComputer } from './playwright/kernel'; +import { LocalPlaywrightComputer } from './playwright/local'; -interface ComputerConfig { - type: "local" | "kernel"; - [key: string]: any; +interface KernelConfig { + type: 'kernel'; + cdp_ws_url: string; } +interface LocalConfig { + type: 'local'; + headless?: boolean; +} +type ComputerConfig = KernelConfig | LocalConfig; -const computers = { - async create({ type, ...args }: ComputerConfig) { - if (type === "kernel") { - const computer = new KernelPlaywrightComputer(args.cdp_ws_url); - await computer.enter(); - return { computer }; - } else if (type === "local") { - const computer = new LocalPlaywrightComputer(args.headless); - await computer.enter(); - return { computer }; - } else { - throw new Error(`Unknown computer type: ${type}`); - } - }, +export default { + async create( + cfg: ComputerConfig, + ): Promise<{ computer: KernelPlaywrightComputer | LocalPlaywrightComputer }> { + if (cfg.type === 'kernel') { + const computer = new KernelPlaywrightComputer(cfg.cdp_ws_url); + await computer.enter(); + return { computer }; + } else { + const computer = new LocalPlaywrightComputer(cfg.headless ?? 
false); + await computer.enter(); + return { computer }; + } + }, }; - -export default computers; diff --git a/templates/typescript/cua/lib/playwright/base.ts b/templates/typescript/cua/lib/playwright/base.ts index 5176869..7a06151 100644 --- a/templates/typescript/cua/lib/playwright/base.ts +++ b/templates/typescript/cua/lib/playwright/base.ts @@ -1,220 +1,242 @@ -import utils from "../utils.ts"; -import sharp from "sharp"; -import type { Browser, Page, Route, Request } from "playwright"; - -// Optional: key mapping if your model uses "CUA" style keys -const CUA_KEY_TO_PLAYWRIGHT_KEY: Record = { - "/": "/", - "\\": "\\", - alt: "Alt", - arrowdown: "ArrowDown", - arrowleft: "ArrowLeft", - arrowright: "ArrowRight", - arrowup: "ArrowUp", - backspace: "Backspace", - capslock: "CapsLock", - cmd: "Meta", - ctrl: "Control", - delete: "Delete", - end: "End", - enter: "Enter", - esc: "Escape", - home: "Home", - insert: "Insert", - option: "Alt", - pagedown: "PageDown", - pageup: "PageUp", - shift: "Shift", - space: " ", - super: "Meta", - tab: "Tab", - win: "Meta", +import utils from '../utils'; +import sharp from 'sharp'; +import type { Browser, Page, Route, Request, Response } from 'playwright'; + +// CUA key -> Playwright key mapping +const KEY_MAP: Record = { + '/': '/', + '\\': '\\', + alt: 'Alt', + arrowdown: 'ArrowDown', + arrowleft: 'ArrowLeft', + arrowright: 'ArrowRight', + arrowup: 'ArrowUp', + backspace: 'Backspace', + capslock: 'CapsLock', + cmd: 'Meta', + ctrl: 'Control', + delete: 'Delete', + end: 'End', + enter: 'Enter', + esc: 'Escape', + home: 'Home', + insert: 'Insert', + option: 'Alt', + pagedown: 'PageDown', + pageup: 'PageUp', + shift: 'Shift', + space: ' ', + super: 'Meta', + tab: 'Tab', + win: 'Meta', }; interface Point { - x: number; - y: number; + x: number; + y: number; } -/** - * Abstract base for Playwright-based computers: - * - * - Subclasses override `_getBrowserAndPage()` to do local or remote connection, - * returning [Browser, Page]. 
- * - This base class handles context creation (`enter()`/`exit()`), - * plus standard "Computer" actions like click, scroll, etc. - * - We also have extra browser actions: `goto(url)` and `back()`. - */ - export class BasePlaywrightComputer { - protected _browser: Browser | null = null; - protected _page: Page | null = null; - - constructor() { - this._browser = null; - this._page = null; - } - - /** - * Type guard to assert that this._page is present and is a Playwright Page. - * Throws an error if not present. - */ - protected _assertPage(): asserts this is { _page: Page } { - if (!this._page) { - throw new Error("Playwright Page is not initialized. Did you forget to call enter()?"); - } - } - - getEnvironment(): string { - return "browser"; - } - - getDimensions(): [number, number] { - return [1024, 768]; - } - - async enter(): Promise { - // Call the subclass hook for getting browser/page - [this._browser, this._page] = await this._getBrowserAndPage(); - - // Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS - const handleRoute = (route: Route, request: Request): void => { - const url = request.url(); - if (utils.checkBlocklistedUrl(url)) { - console.log(`Flagging blocked domain: ${url}`); - route.abort(); - } else { - route.continue(); - } - }; - - this._assertPage(); - await this._page.route("**/*", handleRoute); - return this; - } - - async exit(): Promise { - if (this._browser) { - await this._browser.close(); - } - } - - getCurrentUrl(): string { - this._assertPage(); - return this._page.url(); - } - - // Common "Computer" actions - async screenshot(): Promise { - this._assertPage(); - // Capture only the viewport (not full_page) - const screenshotBuffer = await this._page.screenshot({ fullPage: false }); - const webpBuffer = await sharp(screenshotBuffer).webp().toBuffer(); - return webpBuffer.toString("base64"); - } - - async click(button: string = "left", x: number, y: number): Promise { - this._assertPage(); - // 
console.dir({ debug:{base:{click:{x,y,button}}} },{depth:null}) - switch (button) { - case "back": - await this.back(); - break; - case "forward": - await this.forward(); - break; - case "wheel": - await this._page.mouse.wheel(x, y); - break; - default: - const buttonMapping: Record = { - left: "left", - right: "right", - }; - const buttonType = - buttonMapping[button as keyof typeof buttonMapping] || "left"; - await this._page.mouse.click(x, y, { button: buttonType }); - } - } - - async doubleClick(x: number, y: number): Promise { - this._assertPage(); - await this._page.mouse.dblclick(x, y); - } - - async scroll( - x: number, - y: number, - scrollX: number, - scrollY: number, - ): Promise { - this._assertPage(); - await this._page.mouse.move(x, y); - await this._page.evaluate(`window.scrollBy(${scrollX}, ${scrollY})`); - } - - async type(text: string): Promise { - this._assertPage(); - await this._page.keyboard.type(text); - } - - async keypress(keys: string[]): Promise { - this._assertPage(); - const mappedKeys = keys.map( - (key) => CUA_KEY_TO_PLAYWRIGHT_KEY[key.toLowerCase()] || key, - ); - for (const key of mappedKeys) { - await this._page.keyboard.down(key); - } - for (const key of mappedKeys.reverse()) { - await this._page.keyboard.up(key); - } - } - - async wait(ms: number = 1000): Promise { - await new Promise((resolve) => setTimeout(resolve, ms)); - } - - async move(x: number, y: number): Promise { - this._assertPage(); - await this._page.mouse.move(x, y); - } - - async drag(path: Point[]): Promise { - this._assertPage(); - const first = path[0]; - if (!first) return; - await this._page.mouse.move(first.x, first.y); - await this._page.mouse.down(); - for (const point of path.slice(1)) { - await this._page.mouse.move(point.x, point.y); - } - await this._page.mouse.up(); - } - - // Extra browser-oriented actions - async goto(url: string): Promise { - this._assertPage(); - try { - return await this._page.goto(url); - } catch (e) { - console.log(`Error 
navigating to ${url}: ${e}`); - } - } - - async back(): Promise { - this._assertPage(); - return await this._page.goBack(); - } - - async forward(): Promise { - this._assertPage(); - return await this._page.goForward(); - } - - // Subclass hook - async _getBrowserAndPage(): Promise<[Browser, Page]> { - // Subclasses must implement, returning [Browser, Page] - throw new Error("Subclasses must implement _getBrowserAndPage()"); - } + protected _browser: Browser | null = null; + protected _page: Page | null = null; + + constructor() { + this._browser = null; + this._page = null; + } + + /** + * Type guard to assert that this._page is present and is a Playwright Page. + * Throws an error if not present. + */ + protected _assertPage(): asserts this is { _page: Page } { + if (!this._page) { + throw new Error('Playwright Page is not initialized. Did you forget to call enter()?'); + } + } + + protected _handleNewPage = (page: Page): void => { + /** Handle the creation of a new page. */ + console.log('New page created'); + this._page = page; + page.on('close', this._handlePageClose.bind(this)); + }; + + protected _handlePageClose = (page: Page): void => { + /** Handle the closure of a page. 
*/ + console.log('Page closed'); + try { + this._assertPage(); + } catch { + return; + } + if (this._page !== page) return; + + const browser = this._browser; + if (!browser || typeof browser.contexts !== 'function') { + console.log('Warning: Browser or context not available.'); + this._page = undefined as unknown as Page; + return; + } + + const contexts = browser.contexts(); + if (!contexts.length) { + console.log('Warning: No browser contexts available.'); + this._page = undefined as unknown as Page; + return; + } + + const context = contexts[0]; + if (!context || typeof context.pages !== 'function') { + console.log('Warning: Context pages not available.'); + this._page = undefined as unknown as Page; + return; + } + + const pages = context.pages(); + if (pages.length) { + this._page = pages[pages.length - 1] as Page; + } else { + console.log('Warning: All pages have been closed.'); + this._page = undefined as unknown as Page; + } + }; + + // Subclass hook + protected _getBrowserAndPage = async (): Promise<[Browser, Page]> => { + // Subclasses must implement, returning [Browser, Page] + throw new Error('Subclasses must implement _getBrowserAndPage()'); + }; + + getEnvironment = (): 'windows' | 'mac' | 'linux' | 'ubuntu' | 'browser' => { + return 'browser'; + }; + + getDimensions = (): [number, number] => { + return [1024, 768]; + }; + + enter = async (): Promise => { + // Call the subclass hook for getting browser/page + [this._browser, this._page] = await this._getBrowserAndPage(); + + // Set up network interception to flag URLs matching domains in BLOCKED_DOMAINS + const handleRoute = (route: Route, request: Request): void => { + const url = request.url(); + if (utils.checkBlocklistedUrl(url)) { + console.log(`Flagging blocked domain: ${url}`); + route.abort(); + } else { + route.continue(); + } + }; + + this._assertPage(); + await this._page.route('**/*', handleRoute); + return this; + }; + + exit = async (): Promise => { + if (this._browser) await 
this._browser.close(); + }; + + getCurrentUrl = (): string => { + this._assertPage(); + return this._page.url(); + }; + + screenshot = async (): Promise => { + this._assertPage(); + const buf = await this._page.screenshot({ fullPage: false }); + const webp = await sharp(buf).webp().toBuffer(); + return webp.toString('base64'); + }; + + click = async ( + button: 'left' | 'right' | 'back' | 'forward' | 'wheel', + x: number, + y: number, + ): Promise => { + this._assertPage(); + switch (button) { + case 'back': + await this.back(); + return; + case 'forward': + await this.forward(); + return; + case 'wheel': + await this._page.mouse.wheel(x, y); + return; + default: { + const btn = button === 'right' ? 'right' : 'left'; + await this._page.mouse.click(x, y, { button: btn }); + return; + } + } + }; + + doubleClick = async (x: number, y: number): Promise => { + this._assertPage(); + await this._page.mouse.dblclick(x, y); + }; + + scroll = async (x: number, y: number, scrollX: number, scrollY: number): Promise => { + this._assertPage(); + await this._page.mouse.move(x, y); + await this._page.evaluate( + (params: { dx: number; dy: number }) => window.scrollBy(params.dx, params.dy), + { dx: scrollX, dy: scrollY }, + ); + }; + + type = async (text: string): Promise => { + this._assertPage(); + await this._page.keyboard.type(text); + }; + + keypress = async (keys: string[]): Promise => { + this._assertPage(); + const mapped = keys.map((k) => KEY_MAP[k.toLowerCase()] ?? 
k); + for (const k of mapped) await this._page.keyboard.down(k); + for (const k of [...mapped].reverse()) await this._page.keyboard.up(k); + }; + + wait = async (ms = 1000): Promise => { + await new Promise((resolve) => setTimeout(resolve, ms)); + }; + + move = async (x: number, y: number): Promise => { + this._assertPage(); + await this._page.mouse.move(x, y); + }; + + drag = async (path: Point[]): Promise => { + this._assertPage(); + const first = path[0]; + if (!first) return; + await this._page.mouse.move(first.x, first.y); + await this._page.mouse.down(); + for (const pt of path.slice(1)) await this._page.mouse.move(pt.x, pt.y); + await this._page.mouse.up(); + }; + + goto = async (url: string): Promise => { + this._assertPage(); + try { + return await this._page.goto(url); + } catch { + return null; + } + }; + + back = async (): Promise => { + this._assertPage(); + return (await this._page.goBack()) || null; + }; + + forward = async (): Promise => { + this._assertPage(); + return (await this._page.goForward()) || null; + }; } diff --git a/templates/typescript/cua/lib/playwright/kernel.ts b/templates/typescript/cua/lib/playwright/kernel.ts index 8f1cc6d..45ceb97 100644 --- a/templates/typescript/cua/lib/playwright/kernel.ts +++ b/templates/typescript/cua/lib/playwright/kernel.ts @@ -1,90 +1,43 @@ -import { chromium, type Browser, type Page } from "playwright"; -import { BasePlaywrightComputer } from "./base"; +import { chromium, type Browser, type Page } from 'playwright'; +import { BasePlaywrightComputer } from './base'; /** * KernelPlaywrightComputer connects to a remote browser instance via CDP WebSocket URL. * Similar to LocalPlaywrightComputer but uses an existing browser instance instead of launching one. 
*/ export class KernelPlaywrightComputer extends BasePlaywrightComputer { - private cdp_ws_url: string; + private cdp_ws_url: string; - constructor(cdp_ws_url: string) { - super(); - this.cdp_ws_url = cdp_ws_url; - } + constructor(cdp_ws_url: string) { + super(); + this.cdp_ws_url = cdp_ws_url; + } - async _getBrowserAndPage(): Promise<[Browser, Page]> { - const [width, height] = this.getDimensions(); + _getBrowserAndPage = async (): Promise<[Browser, Page]> => { + const [width, height] = this.getDimensions(); - // Connect to existing browser instance via CDP - const browser = await chromium.connectOverCDP(this.cdp_ws_url); + // Connect to existing browser instance via CDP + const browser = await chromium.connectOverCDP(this.cdp_ws_url); - // Get existing context or create new one - let context = browser.contexts()[0]; - if (!context) { - context = await browser.newContext(); - } + // Get existing context or create new one + let context = browser.contexts()[0]; + if (!context) { + context = await browser.newContext(); + } - // Add event listeners for page creation and closure - context.on("page", this._handleNewPage.bind(this)); + // Add event listeners for page creation and closure + context.on('page', this._handleNewPage.bind(this)); - // Get existing page or create new one - let page = context.pages()[0]; - if (!page) { - page = await context.newPage(); - } + // Get existing page or create new one + let page = context.pages()[0]; + if (!page) { + page = await context.newPage(); + } - // Set viewport size - await page.setViewportSize({ width, height }); - page.on("close", this._handlePageClose.bind(this)); + // Set viewport size + await page.setViewportSize({ width, height }); + page.on('close', this._handlePageClose.bind(this)); - return [browser, page]; - } - - private _handleNewPage(page: Page): void { - /** Handle the creation of a new page. 
*/ - console.log("New page created"); - this._page = page; - page.on("close", this._handlePageClose.bind(this)); - } - - private _handlePageClose(page: Page): void { - /** Handle the closure of a page. */ - console.log("Page closed"); - try { - this._assertPage(); - } catch { - return; - } - if (this._page !== page) return; - - const browser = this._browser; - if (!browser || typeof browser.contexts !== "function") { - console.log("Warning: Browser or context not available."); - this._page = undefined as any; - return; - } - - const contexts = browser.contexts(); - if (!contexts.length) { - console.log("Warning: No browser contexts available."); - this._page = undefined as any; - return; - } - - const context = contexts[0]; - if (!context || typeof context.pages !== "function") { - console.log("Warning: Context pages not available."); - this._page = undefined as any; - return; - } - - const pages = context.pages(); - if (pages.length) { - this._page = pages[pages.length - 1]!; - } else { - console.log("Warning: All pages have been closed."); - this._page = undefined as any; - } - } + return [browser, page]; + }; } diff --git a/templates/typescript/cua/lib/playwright/local.ts b/templates/typescript/cua/lib/playwright/local.ts index d5cb284..d63f123 100644 --- a/templates/typescript/cua/lib/playwright/local.ts +++ b/templates/typescript/cua/lib/playwright/local.ts @@ -1,89 +1,43 @@ -import { chromium, type Browser, type Page } from "playwright"; -import { BasePlaywrightComputer } from "./base"; +import { chromium, type Browser, type Page } from 'playwright'; +import { BasePlaywrightComputer } from './base'; /** * Launches a local Chromium instance using Playwright. 
*/ export class LocalPlaywrightComputer extends BasePlaywrightComputer { - private headless: boolean; + private headless: boolean; - constructor(headless: boolean = false) { - super(); - this.headless = headless; - } + constructor(headless = false) { + super(); + this.headless = headless; + } - async _getBrowserAndPage(): Promise<[Browser, Page]> { - const [width, height] = this.getDimensions(); - const launchArgs = [ - `--window-size=${width},${height}`, - "--disable-extensions", - "--disable-file-system", - ]; + _getBrowserAndPage = async (): Promise<[Browser, Page]> => { + const [width, height] = this.getDimensions(); + const launchArgs = [ + `--window-size=${width},${height}`, + '--disable-extensions', + '--disable-file-system', + ]; - const browser = await chromium.launch({ - headless: this.headless, - args: launchArgs, - env: { DISPLAY: ":0" }, - }); + const browser = await chromium.launch({ + headless: this.headless, + args: launchArgs, + env: { DISPLAY: ':0' }, + }); - const context = await browser.newContext(); + const context = await browser.newContext(); - // Add event listeners for page creation and closure - context.on("page", this._handleNewPage.bind(this)); + // Add event listeners for page creation and closure + context.on('page', this._handleNewPage.bind(this)); - const page = await context.newPage(); - await page.setViewportSize({ width, height }); - page.on("close", this._handlePageClose.bind(this)); + const page = await context.newPage(); + await page.setViewportSize({ width, height }); + page.on('close', this._handlePageClose.bind(this)); - await page.goto("https://bing.com"); + await page.goto('https://duckduckgo.com'); - return [browser, page]; - } - - private _handleNewPage(page: Page): void { - /** Handle the creation of a new page. */ - console.log("New page created"); - this._page = page; - page.on("close", this._handlePageClose.bind(this)); - } - - private _handlePageClose(page: Page): void { - /** Handle the closure of a page. 
*/ - console.log("Page closed"); - try { - this._assertPage(); - } catch { - return; - } - if (this._page !== page) return; - - const browser = this._browser; - if (!browser || typeof browser.contexts !== "function") { - console.log("Warning: Browser or context not available."); - this._page = undefined as any; - return; - } - - const contexts = browser.contexts(); - if (!contexts.length) { - console.log("Warning: No browser contexts available."); - this._page = undefined as any; - return; - } - - const context = contexts[0]; - if (!context || typeof context.pages !== "function") { - console.log("Warning: Context pages not available."); - this._page = undefined as any; - return; - } - - const pages = context.pages(); - if (pages.length) { - this._page = pages[pages.length - 1]!; - } else { - console.log("Warning: All pages have been closed."); - this._page = undefined as any; - } - } + // console.dir({debug_getBrowserAndPage: [browser, page]}); + return [browser, page]; + }; } diff --git a/templates/typescript/cua/lib/toolset.ts b/templates/typescript/cua/lib/toolset.ts index d15fdc0..2999d0b 100644 --- a/templates/typescript/cua/lib/toolset.ts +++ b/templates/typescript/cua/lib/toolset.ts @@ -1,40 +1,40 @@ const shared = [ - { - type: "function", - name: "goto", - description: "Go to a specific URL.", - parameters: { - type: "object", - properties: { - url: { - type: "string", - description: "Fully qualified URL to navigate to.", - }, - }, - additionalProperties: false, - required: ["url"], - }, - }, - { - type: "function", - name: "back", - description: "Navigate back in the browser history.", - parameters: { - type: "object", - properties: {}, - additionalProperties: false, - }, - }, - { - type: "function", - name: "forward", - description: "Navigate forward in the browser history.", - parameters: { - type: "object", - properties: {}, - additionalProperties: false, - }, - }, + { + type: 'function', + name: 'goto', + description: 'Go to a specific URL.', + 
parameters: { + type: 'object', + properties: { + url: { + type: 'string', + description: 'Fully qualified URL to navigate to.', + }, + }, + additionalProperties: false, + required: ['url'], + }, + }, + { + type: 'function', + name: 'back', + description: 'Navigate back in the browser history.', + parameters: { + type: 'object', + properties: {}, + additionalProperties: false, + }, + }, + { + type: 'function', + name: 'forward', + description: 'Navigate forward in the browser history.', + parameters: { + type: 'object', + properties: {}, + additionalProperties: false, + }, + }, ]; export default { shared }; diff --git a/templates/typescript/cua/lib/utils.ts b/templates/typescript/cua/lib/utils.ts index e45d733..f2dc0fd 100644 --- a/templates/typescript/cua/lib/utils.ts +++ b/templates/typescript/cua/lib/utils.ts @@ -1,77 +1,61 @@ -import "dotenv/config"; -import sharp from "sharp"; -import OpenAI from "openai"; +import 'dotenv/config'; +import sharp from 'sharp'; +import OpenAI from 'openai'; +import { type ResponseItem } from 'openai/resources/responses/responses'; +const openai = new OpenAI(); const BLOCKED_DOMAINS: readonly string[] = [ - "maliciousbook.com", - "evilvideos.com", - "darkwebforum.com", - "shadytok.com", - "suspiciouspins.com", - "ilanbigio.com", + 'maliciousbook.com', + 'evilvideos.com', + 'darkwebforum.com', + 'shadytok.com', + 'suspiciouspins.com', + 'ilanbigio.com', ] as const; -interface ImageDimensions { - width: number; - height: number; +export async function calculateImageDimensions( + base64Image: string, +): Promise<{ width: number; height: number }> { + const buf = Buffer.from(base64Image, 'base64'); + const meta = await sharp(buf).metadata(); + return { width: meta.width ?? 0, height: meta.height ?? 
0 }; } - -interface ComputerCallOutput { - type: "computer_call_output"; - output?: { - image_url?: string; - [key: string]: any; - }; - [key: string]: any; -} - -interface Message { - [key: string]: any; -} - -async function calculateImageDimensions( - base64Image: string, -): Promise { - const imageBuffer = Buffer.from(base64Image, "base64"); - const metadata = await sharp(imageBuffer).metadata(); - return { width: metadata.width!, height: metadata.height! }; -} - -function sanitizeMessage(msg: Message): Message { - /** Return a copy of the message with image_url omitted for computer_call_output messages. */ - if (msg.type === "computer_call_output") { - const output = msg.output || {}; - if (typeof output === "object") { - const sanitized = { ...msg }; - sanitized.output = { ...output, image_url: "[omitted]" }; - return sanitized; - } - } - return msg; +export function sanitizeMessage(msg: ResponseItem): ResponseItem { + const sanitizedMsg = { ...msg } as ResponseItem; + if ( + sanitizedMsg.type === 'computer_call_output' && + typeof sanitizedMsg.output === 'object' && + sanitizedMsg.output !== null + ) { + sanitizedMsg.output = { ...sanitizedMsg.output }; + const output = sanitizedMsg.output as { image_url?: string }; + if (output.image_url) { + output.image_url = '[omitted]'; + } + } + return sanitizedMsg; } -async function createResponse(kwargs: any): Promise { - const openai = new OpenAI(); - try { - const response = await openai.responses.create(kwargs); - return response; - } catch (error: any) { - console.error(`Error: ${error.status} ${error.message}`); - throw error; - } +export async function createResponse( + params: OpenAI.Responses.ResponseCreateParams, +): Promise<{ output?: OpenAI.Responses.ResponseOutputItem[] }> { + try { + const response = await openai.responses.create(params); + return 'output' in response ? 
response : { output: undefined }; + } catch (err: unknown) { + console.error((err as Error).message); + throw err; + } } -function checkBlocklistedUrl(url: string): boolean { - /** Return true if the given URL (including subdomains) is in the blocklist. */ - const hostname = new URL(url).hostname || ""; - return BLOCKED_DOMAINS.some( - (blocked) => hostname === blocked || hostname.endsWith(`.${blocked}`), - ); +export function checkBlocklistedUrl(url: string): boolean { + const host = new URL(url).hostname; + return BLOCKED_DOMAINS.some((d) => host === d || host.endsWith(`.${d}`)); } export default { - calculateImageDimensions, - sanitizeMessage, - createResponse, - checkBlocklistedUrl, + calculateImageDimensions, + sanitizeMessage, + createResponse, + checkBlocklistedUrl, }; diff --git a/templates/typescript/cua/package.json b/templates/typescript/cua/package.json index 70b296c..29a6b16 100644 --- a/templates/typescript/cua/package.json +++ b/templates/typescript/cua/package.json @@ -1,15 +1,24 @@ { - "type": "module", - "private": true, - "dependencies": { - "@onkernel/sdk": "^0.6.0", - "@types/node": "^24.0.3", - "dotenv": "^16.5.0", - "openai": "^5.5.1", - "playwright": "^1.53.0", - "sharp": "^0.34.2" - }, - "peerDependencies": { - "typescript": "^5.8.3" - } + "type": "module", + "private": true, + "scripts": { + "build": "tsc", + "lint": "eslint . --ext .ts", + "lint:fix": "eslint . 
--ext .ts --fix" + }, + "dependencies": { + "@onkernel/sdk": "^0.6.0", + "dotenv": "^16.5.0", + "openai": "^5.7.0", + "playwright": "^1.53.0", + "sharp": "^0.34.2" + }, + "devDependencies": { + "@types/node": "^24.0.3", + "@typescript-eslint/eslint-plugin": "^5.0.0", + "@typescript-eslint/parser": "^5.0.0", + "eslint": "^8.0.0", + "eslint-config-prettier": "^8.8.0", + "typescript": "^5.8.3" + } } diff --git a/templates/typescript/cua/test.local.ts b/templates/typescript/cua/test.local.ts new file mode 100644 index 0000000..5f0298a --- /dev/null +++ b/templates/typescript/cua/test.local.ts @@ -0,0 +1,41 @@ +import 'dotenv/config'; +import { Agent } from './lib/agent'; +import computers from './lib/computers'; + +/* + to run a local browser test before deploying to kernel +*/ + +async function test() { + const { computer } = await computers.create({ type: 'local' }); + const agent = new Agent({ + model: 'computer-use-preview', + computer, + tools: [], + acknowledge_safety_check_callback: (m: string) => { console.log(`> safety check: ${m}`); return true; }, + }); + + // run agent and get response + const logs = await agent.runFullTurn({ + messages: [ + { + role: 'system', + content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( + 'en-US', + { weekday: 'long' } + )})`, + }, + { + type: 'message', + role: 'user', + content: [{ type: 'input_text', text: "go to ebay.com and look up oberheim ob-x prices and give me a report" }], + }, + ], + print_steps: true, + debug: true, + show_images: false, + }); + console.dir(logs, { depth: null }); +} + +test(); \ No newline at end of file diff --git a/templates/typescript/cua/tsconfig.json b/templates/typescript/cua/tsconfig.json index f5c1fe2..05b1408 100644 --- a/templates/typescript/cua/tsconfig.json +++ b/templates/typescript/cua/tsconfig.json @@ -1,30 +1,16 @@ { - "compilerOptions": { - // Environment setup & latest features - "lib": ["ESNext", "DOM"], - "target": "ESNext", - "module": 
"ESNext", - "moduleDetection": "force", - "jsx": "react-jsx", - "allowJs": true, - - // Bundler mode - "moduleResolution": "bundler", - "allowImportingTsExtensions": true, - "verbatimModuleSyntax": true, - "noEmit": true, - - // Best practices - "strict": true, - "skipLibCheck": true, - "noFallthroughCasesInSwitch": true, - "noUncheckedIndexedAccess": true, - - // Some stricter flags (disabled by default) - "noUnusedLocals": false, - "noUnusedParameters": false, - "noPropertyAccessFromIndexSignature": false - }, - "include": ["./**/*.ts", "./**/*.tsx"], - "exclude": ["node_modules", "dist"] + "compilerOptions": { + "lib": ["ESNext", "DOM"], + "target": "ESNext", + "module": "ESNext", + "moduleResolution": "bundler", + "allowJs": false, + "strict": true, + "noEmit": true, + "skipLibCheck": true, + "noFallthroughCasesInSwitch": true, + "noUncheckedIndexedAccess": true + }, + "include": ["./**/*.ts"], + "exclude": ["node_modules", "dist"] } From 32e242e3b47444fa05223fe8fdf197c203e3ebc3 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Mon, 23 Jun 2025 22:22:36 +0100 Subject: [PATCH 09/13] ts-cua: lint & openai types update * --- templates/typescript/cua/.eslintrc.cjs | 10 ---------- templates/typescript/cua/index.ts | 6 +++--- templates/typescript/cua/lib/agent.ts | 4 ++-- templates/typescript/cua/package.json | 8 +------- templates/typescript/cua/test.local.ts | 18 +++++++++++++----- 5 files changed, 19 insertions(+), 27 deletions(-) delete mode 100644 templates/typescript/cua/.eslintrc.cjs diff --git a/templates/typescript/cua/.eslintrc.cjs b/templates/typescript/cua/.eslintrc.cjs deleted file mode 100644 index d9a7fb7..0000000 --- a/templates/typescript/cua/.eslintrc.cjs +++ /dev/null @@ -1,10 +0,0 @@ -module.exports = { - parser: '@typescript-eslint/parser', - parserOptions: { project: './tsconfig.json', sourceType: 'module' }, - plugins: ['@typescript-eslint'], - extends: ['eslint:recommended', 'plugin:@typescript-eslint/recommended', 'prettier'], - rules: { - 
'@typescript-eslint/no-explicit-any': 'error', - 'no-console': 'off', - }, -}; diff --git a/templates/typescript/cua/index.ts b/templates/typescript/cua/index.ts index cbf9136..aad2e93 100644 --- a/templates/typescript/cua/index.ts +++ b/templates/typescript/cua/index.ts @@ -14,7 +14,7 @@ interface CuaOutput { } const kernel = new Kernel(); -const app = kernel.app('ts-cua'); +const app = kernel.app('ts-cua-dev'); if (!process.env.OPENAI_API_KEY) { throw new Error('OPENAI_API_KEY is not set'); @@ -30,7 +30,7 @@ if (!process.env.OPENAI_API_KEY) { * Invoke this via CLI: * export KERNEL_API_KEY= * kernel deploy index.ts -e OPENAI_API_KEY=XXXXX --force - * kernel invoke ts-cua cua-task -p "{\"task\":\"go to ebay.com and find the current market price range for a dreamcast\"}" + * kernel invoke ts-cua cua-task -p "{\"task\":\"current market price range for a used dreamcast\"}" * kernel logs ts-cua -f # Open in separate tab */ @@ -49,7 +49,7 @@ app.action( model: 'computer-use-preview', computer, tools: [], - acknowledge_safety_check_callback: (m: string) => { + acknowledge_safety_check_callback: (m: string): boolean => { console.log(`> safety check: ${m}`); return true; }, diff --git a/templates/typescript/cua/lib/agent.ts b/templates/typescript/cua/lib/agent.ts index 02412a8..630f32e 100644 --- a/templates/typescript/cua/lib/agent.ts +++ b/templates/typescript/cua/lib/agent.ts @@ -41,7 +41,7 @@ export class Agent { this.model = opts.model ?? 'computer-use-preview'; this.computer = opts.computer; this.tools = [...toolset.shared, ...(opts.tools ?? [])] as ComputerTool[]; - this.ackCb = opts.acknowledge_safety_check_callback ?? (() => true); + this.ackCb = opts.acknowledge_safety_check_callback ?? 
((): boolean => true); if (this.computer) { const [w, h] = this.computer.getDimensions(); @@ -62,7 +62,7 @@ export class Agent { args.map((msg) => utils.sanitizeMessage(msg as ResponseItem)), { depth: null }, ); - } catch (e) { + } catch { console.dir(args, { depth: null }); } } diff --git a/templates/typescript/cua/package.json b/templates/typescript/cua/package.json index 29a6b16..dfbb715 100644 --- a/templates/typescript/cua/package.json +++ b/templates/typescript/cua/package.json @@ -2,9 +2,7 @@ "type": "module", "private": true, "scripts": { - "build": "tsc", - "lint": "eslint . --ext .ts", - "lint:fix": "eslint . --ext .ts --fix" + "build": "tsc" }, "dependencies": { "@onkernel/sdk": "^0.6.0", @@ -15,10 +13,6 @@ }, "devDependencies": { "@types/node": "^24.0.3", - "@typescript-eslint/eslint-plugin": "^5.0.0", - "@typescript-eslint/parser": "^5.0.0", - "eslint": "^8.0.0", - "eslint-config-prettier": "^8.8.0", "typescript": "^5.8.3" } } diff --git a/templates/typescript/cua/test.local.ts b/templates/typescript/cua/test.local.ts index 5f0298a..23f9a5c 100644 --- a/templates/typescript/cua/test.local.ts +++ b/templates/typescript/cua/test.local.ts @@ -6,13 +6,16 @@ import computers from './lib/computers'; to run a local browser test before deploying to kernel */ -async function test() { +async function test(): Promise { const { computer } = await computers.create({ type: 'local' }); const agent = new Agent({ model: 'computer-use-preview', computer, tools: [], - acknowledge_safety_check_callback: (m: string) => { console.log(`> safety check: ${m}`); return true; }, + acknowledge_safety_check_callback: (m: string): boolean => { + console.log(`> safety check: ${m}`); + return true; + }, }); // run agent and get response @@ -22,13 +25,18 @@ async function test() { role: 'system', content: `- Current date and time: ${new Date().toISOString()} (${new Date().toLocaleDateString( 'en-US', - { weekday: 'long' } + { weekday: 'long' }, )})`, }, { type: 'message', role: 
'user', - content: [{ type: 'input_text', text: "go to ebay.com and look up oberheim ob-x prices and give me a report" }], + content: [ + { + type: 'input_text', + text: 'go to ebay.com and look up oberheim ob-x prices and give me a report', + }, + ], }, ], print_steps: true, @@ -38,4 +46,4 @@ async function test() { console.dir(logs, { depth: null }); } -test(); \ No newline at end of file +test(); From 6010feaf972a345e3108579d7cc82a3e734833c3 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Mon, 23 Jun 2025 22:25:06 +0100 Subject: [PATCH 10/13] ts-cua: lint & openai types update ** --- templates/typescript/cua/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/typescript/cua/index.ts b/templates/typescript/cua/index.ts index aad2e93..4b01c69 100644 --- a/templates/typescript/cua/index.ts +++ b/templates/typescript/cua/index.ts @@ -14,7 +14,7 @@ interface CuaOutput { } const kernel = new Kernel(); -const app = kernel.app('ts-cua-dev'); +const app = kernel.app('ts-cua'); if (!process.env.OPENAI_API_KEY) { throw new Error('OPENAI_API_KEY is not set'); From ed20cd47d98de73b033ceaaca5ed099b7096a87f Mon Sep 17 00:00:00 2001 From: raidendotai Date: Mon, 23 Jun 2025 23:01:14 +0100 Subject: [PATCH 11/13] python-cua: requirements versions --- templates/python/cua/pyproject.toml | 32 ++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/templates/python/cua/pyproject.toml b/templates/python/cua/pyproject.toml index 7115077..8f675b8 100644 --- a/templates/python/cua/pyproject.toml +++ b/templates/python/cua/pyproject.toml @@ -6,24 +6,24 @@ readme = "README.md" requires-python = ">=3.11" dependencies = [ "annotated-types==0.7.0", - "anyio==4.8.0", - "certifi==2025.1.31", - "charset-normalizer==3.4.1", + "anyio==4.9.0", + "certifi==2025.6.15", + "charset-normalizer==3.4.2", "distro==1.9.0", - "greenlet==3.1.1", - "h11==0.14.0", - "httpcore==1.0.7", + "greenlet==3.2.3", + "h11==0.16.0", + 
"httpcore==1.0.9", "httpx==0.28.1", "idna==3.10", - "jiter==0.8.2", - "pillow==11.1.0", - "playwright==1.50.0", - "pydantic==2.10.6", - "pydantic_core==2.27.2", - "pyee==12.1.1", - "python-dotenv==1.0.1", - "requests==2.32.3", + "jiter==0.10.0", + "pillow==11.2.1", + "playwright==1.52.0", + "pydantic==2.11.7", + "pydantic_core==2.33.2", + "pyee==13.0.0", + "python-dotenv==1.1.0", + "requests==2.32.4", "sniffio==1.3.1", - "typing_extensions==4.12.2", - "urllib3==2.3.0", + "typing_extensions==4.14.0", + "urllib3==2.5.0", ] From 4142c50acd16dbbed26f3d297afc0838939f4839 Mon Sep 17 00:00:00 2001 From: raidendotai Date: Tue, 24 Jun 2025 03:29:39 +0100 Subject: [PATCH 12/13] python-cua: requirements versions * --- templates/python/cua/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/python/cua/pyproject.toml b/templates/python/cua/pyproject.toml index 8f675b8..3d558e8 100644 --- a/templates/python/cua/pyproject.toml +++ b/templates/python/cua/pyproject.toml @@ -17,6 +17,7 @@ dependencies = [ "idna==3.10", "jiter==0.10.0", "pillow==11.2.1", + "kernel>=0.6.0", "playwright==1.52.0", "pydantic==2.11.7", "pydantic_core==2.33.2", From b1aa294c390a7b71b2383ebae0fc43d37af0b44e Mon Sep 17 00:00:00 2001 From: raiden-staging Date: Thu, 3 Jul 2025 16:01:28 +0100 Subject: [PATCH 13/13] cli : payload examples fixes --- index.ts | 4 ++-- templates/typescript/cua/lib/agent.ts | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/index.ts b/index.ts index 1eaed85..5e131e4 100644 --- a/index.ts +++ b/index.ts @@ -96,7 +96,7 @@ const INVOKE_SAMPLES: Record< [TEMPLATE_COMPUTER_USE]: 'kernel invoke ts-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', [TEMPLATE_CUA]: - 'kernel invoke ts-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', + 'kernel invoke ts-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get 
the top 5 articles"}\'', }, [LANGUAGE_PYTHON]: { [TEMPLATE_SAMPLE_APP]: @@ -108,7 +108,7 @@ const INVOKE_SAMPLES: Record< [TEMPLATE_COMPUTER_USE]: 'kernel invoke python-cu cu-task --payload \'{"query": "Return the first url of a search result for NYC restaurant reviews Pete Wells"}\'', [TEMPLATE_CUA]: - 'kernel invoke python-cua cua-task --payload \'{"query": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', + 'kernel invoke python-cua cua-task --payload \'{"task": "Go to https://news.ycombinator.com and get the top 5 articles"}\'', }, }; diff --git a/templates/typescript/cua/lib/agent.ts b/templates/typescript/cua/lib/agent.ts index 630f32e..9744165 100644 --- a/templates/typescript/cua/lib/agent.ts +++ b/templates/typescript/cua/lib/agent.ts @@ -104,7 +104,6 @@ export class Agent { await (fn as (...a: unknown[]) => unknown)(...Object.values(actionArgs)); const screenshot = await this.computer.screenshot(); const pending = cc.pending_safety_checks ?? []; - console.dir({ debug_agent_computer_call: cc }); for (const { message } of pending) if (!this.ackCb(message)) throw new Error(`Safety check failed: ${message}`); const out: Omit = {