diff --git a/packages/core/src/browser/ariaBrowser.ts b/packages/core/src/browser/ariaBrowser.ts index ca1abacd..06fbf382 100644 --- a/packages/core/src/browser/ariaBrowser.ts +++ b/packages/core/src/browser/ariaBrowser.ts @@ -99,6 +99,14 @@ export interface AriaBrowser { */ performAction(ref: string, action: PageAction, value?: string): Promise; + /** + * Look up element identity (role + accessible name) for a ref from the + * most recent ariaTree snapshot. Returns null if the ref is unknown. + * Used by the repetition detector to distinguish logical targets when + * the ref string itself churns between snapshots. + */ + getRefIdentity(ref: string): Promise<{ role: string; name: string } | null>; + /** * Waits for a specific load state of the page * @param state The load state to wait for diff --git a/packages/core/src/browser/ariaTree/ariaSnapshot.ts b/packages/core/src/browser/ariaTree/ariaSnapshot.ts index cff71e8c..738fb2de 100644 --- a/packages/core/src/browser/ariaTree/ariaSnapshot.ts +++ b/packages/core/src/browser/ariaTree/ariaSnapshot.ts @@ -48,6 +48,11 @@ export function generateAndRenderAriaTree(root: Element, counter?: RefCounter): // that strip data-pilo-ref attributes, e.g. React reconciliation) (globalThis as any).__piloRefMap = new Map(); + // Initialize identity map for ref -> {role, name} lookup. Used by the + // repetition detector to distinguish logical click targets when refs + // themselves churn between snapshots. + (globalThis as any).__piloIdentityMap = new Map(); + const ariaTree = generateAriaTree(root); return renderAriaTree(ariaTree, refCounter); } @@ -493,6 +498,13 @@ function renderAriaTree(root: AriaNode, counter: RefCounter): string { if (ariaNode.element) { (globalThis as any).__piloRefMap?.set(ref, ariaNode.element); } + // Capture role + accessible name for the repetition detector. Stored + // separately from refMap so consumers that only need identity don't + // have to re-walk the DOM. + (globalThis as any).__piloIdentityMap?.set(ref, { + role: String(ariaNode.role), + name: ariaNode.name ?? "", + }); } const escapedKey = indent + "- " + yamlEscapeKeyIfNeeded(key); diff --git a/packages/core/src/browser/playwrightBrowser.ts b/packages/core/src/browser/playwrightBrowser.ts index 510f2a71..9faffcbf 100644 --- a/packages/core/src/browser/playwrightBrowser.ts +++ b/packages/core/src/browser/playwrightBrowser.ts @@ -788,6 +788,25 @@ export class PlaywrightBrowser implements AriaBrowser { return locator; } + async getRefIdentity(ref: string): Promise<{ role: string; name: string } | null> { + if (!this.page) return null; + try { + return await this.page.evaluate( + (refArg) => + ( + globalThis as unknown as { + __piloIdentityMap?: Map; + } + ).__piloIdentityMap?.get(refArg) ?? null, + ref, + ); + } catch { + // The page may have navigated or torn down the identity map. Identity + // is advisory for repetition detection — log nothing, just bail out. + return null; + } + } + async performAction(ref: string, action: PageAction, value?: string): Promise { if (!this.page) throw new Error("Browser not started"); return withSpan( diff --git a/packages/core/src/tools/webActionTools.ts b/packages/core/src/tools/webActionTools.ts index f1f415de..b6c8e04b 100644 --- a/packages/core/src/tools/webActionTools.ts +++ b/packages/core/src/tools/webActionTools.ts @@ -43,6 +43,11 @@ type ActionResult = { value?: string | number; error?: string; isRecoverable?: boolean; + // Snapshot-time role + accessible name for the targeted element. Optional + // (only populated when a ref is provided and the browser can resolve it). + // Used by the repetition detector to distinguish logical targets even + // when ref strings churn between snapshots. + targetIdentity?: { role: string; name: string }; }; /** @@ -82,6 +87,17 @@ async function performActionWithValidation( value, }); + // Capture target identity BEFORE the action runs — the snapshot's + // identity map is the basis for the ref the LLM emitted, and the + // action itself may invalidate the ref by causing navigation or DOM + // teardown. Identity is advisory; null means we just won't fold it + // into the repetition signature for this turn. + let targetIdentity: { role: string; name: string } | undefined; + if (ref) { + const id = await context.browser.getRefIdentity(ref); + if (id) targetIdentity = id; + } + // Perform the action await context.browser.performAction(ref || "", action, value); @@ -97,6 +113,7 @@ async function performActionWithValidation( action, ...(ref && { ref }), ...(value !== undefined && { value }), + ...(targetIdentity && { targetIdentity }), }; } catch (error) { // For browser exceptions, emit failure with error details and return error info diff --git a/packages/core/src/webAgent.ts b/packages/core/src/webAgent.ts index 05d68a56..187ff38c 100644 --- a/packages/core/src/webAgent.ts +++ b/packages/core/src/webAgent.ts @@ -9,7 +9,7 @@ import { streamText, ModelMessage, StreamTextResult } from "ai"; import type { ProviderConfig } from "./provider.js"; -import { AriaBrowser } from "./browser/ariaBrowser.js"; +import { AriaBrowser, PageAction } from "./browser/ariaBrowser.js"; import { BrowserReconnectedEventData, CdpEndpointConnectedEventData, @@ -230,6 +230,15 @@ export class WebAgent { private readonly onUserDataRequired: UserDataCallback | undefined; private readonly taskId: string | undefined; + // Actions where same-action-same-value repetition is legitimate workflow + // (e.g. scrolling an infinite feed, waiting for a slow page) rather than a + // stuck-loop signal. The detector skips these entirely and resets state so + // a real loop interrupted by a scroll doesn't compound across the gap. + private static readonly REPETITION_EXEMPT_ACTIONS: ReadonlySet = new Set([ + PageAction.Scroll, + PageAction.Wait, + ]); + constructor( private browser: AriaBrowser, options: WebAgentOptions, @@ -1154,6 +1163,15 @@ export class WebAgent { actionExecuted: boolean; error?: TaskError; } | null { + // Skip exempt actions entirely, and reset state so an in-progress repeat + // count doesn't carry across them (an exempt action between two clicks + // is treated as progress, not as a transparent gap). + if (WebAgent.REPETITION_EXEMPT_ACTIONS.has(actionOutput.action)) { + executionState.actionRepeatCount = 0; + executionState.lastAction = undefined; + return null; + } + // Define explicit thresholds for warning and abort const REPETITION_WARNING_THRESHOLD = this.maxRepeatedActions + 1; const REPETITION_ABORT_THRESHOLD = this.maxRepeatedActions + 2; @@ -1161,8 +1179,8 @@ export class WebAgent { // Create signature for current action const currentActionSignature = this.createActionSignature( actionOutput.action, - actionOutput.ref, actionOutput.value, + actionOutput.targetIdentity, ); // Check if this is the same action as the last one @@ -1505,10 +1523,29 @@ export class WebAgent { // === Helper Methods === /** - * Create a signature for an action to track repetitions + * Build a repetition-signature for an action. The element ref is + * deliberately excluded because refs are regenerated every snapshot — a + * logical "click Submit" gets a new ref each turn, so including it would + * make near-duplicate actions hash differently and bypass the detector. + * Element identity (role + accessible name) is folded in when present so + * ref-only actions like `click` don't collapse different logical targets + * (e.g. "click Next" vs "click Submit") onto the same signature. + * Value is normalized (lowercase + trim) so cosmetic input variation + * doesn't reset the counter on the same logical action. */ - private createActionSignature(action: string, ref?: string, value?: string | number): string { - return `${action}:${ref || ""}:${value || ""}`; + private createActionSignature( + action: string, + value?: string | number, + identity?: { role: string; name: string }, + ): string { + const normalizedValue = String(value ?? "") + .toLowerCase() + .trim(); + if (identity) { + const normalizedName = identity.name.toLowerCase().trim(); + return `${action}:${identity.role}:${normalizedName}:${normalizedValue}`; + } + return `${action}:${normalizedValue}`; } /** diff --git a/packages/core/test/tools/webActionTools.test.ts b/packages/core/test/tools/webActionTools.test.ts index 34374e74..2254bd87 100644 --- a/packages/core/test/tools/webActionTools.test.ts +++ b/packages/core/test/tools/webActionTools.test.ts @@ -73,6 +73,10 @@ class MockBrowser implements AriaBrowser { // Mock implementation - can be configured to throw errors for testing } + async getRefIdentity(_ref: string): Promise<{ role: string; name: string } | null> { + return null; + } + async waitForLoadState(): Promise {} async runInTemporaryTab(fn: (tab: any) => Promise): Promise { @@ -256,6 +260,34 @@ describe("Web Action Tools", () => { await expect(tools.click.execute({ ref: "btn1" })).rejects.toThrow("Network error"); }); + + it("should capture target identity from the browser and include it in the result", async () => { + vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce({ + role: "button", + name: "Submit", + }); + + const result = await tools.click.execute({ ref: "btn1" }); + + expect(result).toEqual({ + success: true, + action: "click", + ref: "btn1", + targetIdentity: { role: "button", name: "Submit" }, + }); + }); + + it("should omit targetIdentity when the browser returns null", async () => { + vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce(null); + + const result = await tools.click.execute({ ref: "btn1" }); + + expect(result).toEqual({ + success: true, + action: "click", + ref: "btn1", + }); + }); }); describe("Fill Action", () => { diff --git a/packages/core/test/webAgent.test.ts b/packages/core/test/webAgent.test.ts index 421456b2..a3237dcf 100644 --- a/packages/core/test/webAgent.test.ts +++ b/packages/core/test/webAgent.test.ts @@ -191,6 +191,10 @@ class MockBrowser implements AriaBrowser { async performAction(_ref: string, _action: PageAction, _value?: string): Promise {} + async getRefIdentity(_ref: string): Promise<{ role: string; name: string } | null> { + return null; + } + async waitForLoadState(): Promise {} async runInTemporaryTab(fn: (tab: any) => Promise): Promise { @@ -3867,5 +3871,425 @@ describe("WebAgent", () => { expect(result.finalAnswer).toBe("Completed"); expect(result.stats.actions).toBe(4); // 2 clicks + 1 fill + 1 done }); + + it("should detect repeated clicks across changing element refs", async () => { + // Regression test for #430: refs change every snapshot, so the agent + // calling click on the "same" logical button gets a different ref each + // turn. Pre-fix, this slipped past the detector entirely. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Click submit", + }, + }, + ], + } as any); + + // Five clicks on the "same" logical button, but ref churns each turn. + // Defaults: warning at count 3 (maxRepeatedActions+1), abort at count 4. + const churningRefs = ["E1", "E5", "E12", "E3", "E8"]; + for (const ref of churningRefs) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("click"); + }); + + it("should not flag legitimate repeated scroll actions", async () => { + // Scroll repeats with the same direction are legitimate workflow + // (traversing an infinite-scroll feed). The detector must exempt them. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Reach the bottom of the feed", + plan: "1. Scroll down repeatedly", + }, + }, + ], + } as any); + + // Five scrolls in a row would trip the detector under the old logic + // (same action, same value, same ref). They must not now. + for (let i = 0; i < 5; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Scroll", + toolResults: [ + { + type: "tool-result", + toolCallId: `scroll_${i}`, + toolName: "scroll", + input: { direction: "down" }, + output: { + success: true, + action: "scroll", + value: "down", + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Scroll" }], + }, + }) as any, + ); + } + + // Then finish cleanly. + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Reached bottom" }, + output: { + action: "done", + result: "Reached bottom", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Reached bottom"); + + // No "repeated the same action" warning should have been injected. + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should not flag legitimate repeated wait actions", async () => { + // Same shape as the scroll exemption test: identical wait calls must + // not be flagged as repetition (waiting for a slow page legitimately + // repeats with the same duration). + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Wait for page to load", + plan: "1. Wait for content", + }, + }, + ], + } as any); + + for (let i = 0; i < 5; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Wait", + toolResults: [ + { + type: "tool-result", + toolCallId: `wait_${i}`, + toolName: "wait", + input: { seconds: 2 }, + output: { + success: true, + action: "wait", + value: "2", + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Wait" }], + }, + }) as any, + ); + } + + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Page loaded" }, + output: { + action: "done", + result: "Page loaded", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Page loaded"); + + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should treat cosmetically different fill values as repetition", async () => { + // Value normalization (trim + lowercase) folds near-duplicates. An + // agent that fills "hello", then "HELLO ", then "hello" again is in a + // loop even though the strings aren't byte-identical. + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Fill input", + }, + }, + ], + } as any); + + const variants = ["hello", "HELLO ", " Hello", "hello", "HELLO"]; + for (let i = 0; i < variants.length; i++) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Fill", + toolResults: [ + { + type: "tool-result", + toolCallId: `fill_${i}`, + toolName: "fill", + input: { ref: `input_${i}`, value: variants[i] }, + output: { + success: true, + action: "fill", + ref: `input_${i}`, + value: variants[i], + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Fill" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("fill"); + }); + + it("should not flag clicks on different logical targets", async () => { + // Pass 2: ref-only actions (click, hover, check, etc.) all carry no + // `value`, so without element identity the bare `click:` signature + // collapses every click together — a multi-step wizard or pagination + // run trips the detector falsely. The targetIdentity field on the + // tool output distinguishes "click button:Next" from "click button:Submit". + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Walk through a wizard", + plan: "1. Click through wizard steps", + }, + }, + ], + } as any); + + const targets = [ + { ref: "E1", role: "button", name: "Next" }, + { ref: "E5", role: "button", name: "Continue" }, + { ref: "E12", role: "link", name: "Skip" }, + { ref: "E3", role: "button", name: "Confirm" }, + { ref: "E8", role: "button", name: "Finish" }, + ]; + for (const { ref, role, name } of targets) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + targetIdentity: { role, name }, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Done", + toolResults: [ + { + type: "tool-result", + toolCallId: "done_1", + toolName: "done", + input: { result: "Wizard completed" }, + output: { + action: "done", + result: "Wizard completed", + isTerminal: true, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Done" }], + }, + }) as any, + ); + + mockGenerateTextWithRetry.mockResolvedValueOnce(mockValidationResponse("complete")); + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(true); + expect(result.finalAnswer).toBe("Wizard completed"); + + const messages = mockStreamText.mock.calls.flatMap((call) => call[0]?.messages || []); + const warningMessage = messages.find( + (msg: any) => msg.role === "user" && msg.content?.includes("repeated the same action"), + ); + expect(warningMessage).toBeUndefined(); + }); + + it("should detect repeated clicks on the same logical target across ref churn", async () => { + // Hardened version of the Pass 1 regression test: same logical button + // (role + name) across changing refs still trips the detector once + // identity is in the signature. Distinguishes the bug case (one button + // repeatedly clicked) from the wizard case (different buttons clicked). + mockGenerateTextWithRetry.mockResolvedValueOnce({ + text: "Plan", + toolResults: [ + { + type: "tool-result", + toolCallId: "plan_1", + toolName: "create_plan", + output: { + successCriteria: "Test task", + plan: "1. Click submit", + }, + }, + ], + } as any); + + const churningRefs = ["E1", "E5", "E12", "E3", "E8"]; + for (const ref of churningRefs) { + mockStreamText.mockReturnValueOnce( + createMockStreamResponse({ + text: "Click", + toolResults: [ + { + type: "tool-result", + toolCallId: `click_${ref}`, + toolName: "click", + input: { ref }, + output: { + success: true, + action: "click", + ref, + targetIdentity: { role: "button", name: "Submit" }, + }, + }, + ], + response: { + messages: [{ role: "assistant", content: "Click" }], + }, + }) as any, + ); + } + + const result = await webAgent.execute("Test task", { + startingUrl: "https://example.com", + }); + + expect(result.success).toBe(false); + expect(result.finalAnswer).toContain("Aborted: Excessive repetition"); + expect(result.finalAnswer).toContain("click"); + }); }); }); diff --git a/packages/extension/src/background/ExtensionBrowser.ts b/packages/extension/src/background/ExtensionBrowser.ts index a3ad81c5..eff4d846 100644 --- a/packages/extension/src/background/ExtensionBrowser.ts +++ b/packages/extension/src/background/ExtensionBrowser.ts @@ -302,6 +302,28 @@ export class ExtensionBrowser implements AriaBrowser { } } + async getRefIdentity(ref: string): Promise<{ role: string; name: string } | null> { + try { + const tab = await this.getActiveTab(); + const [{ result }] = await browser.scripting.executeScript({ + target: { tabId: tab.id! }, + func: (refArg: string) => + ( + globalThis as unknown as { + __piloIdentityMap?: Map; + } + ).__piloIdentityMap?.get(refArg) ?? null, + args: [ref], + }); + return (result as { role: string; name: string } | null) ?? null; + } catch { + // Identity is advisory for repetition detection — if the page or + // identity map isn't available, bail out rather than failing the + // surrounding action. + return null; + } + } + async performAction(ref: string, action: PageAction, value?: string): Promise { console.log( `ExtensionBrowser: performAction() called with ref: ${ref}, action: ${action}, value: ${value}`,