Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions packages/core/src/browser/ariaBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,14 @@ export interface AriaBrowser {
*/
performAction(ref: string, action: PageAction, value?: string): Promise<void>;

/**
* Look up element identity (role + accessible name) for a ref from the
* most recent ariaTree snapshot. Returns null if the ref is unknown.
* Used by the repetition detector to distinguish logical targets when
* the ref string itself churns between snapshots.
*/
getRefIdentity(ref: string): Promise<{ role: string; name: string } | null>;

/**
* Waits for a specific load state of the page
* @param state The load state to wait for
Expand Down
12 changes: 12 additions & 0 deletions packages/core/src/browser/ariaTree/ariaSnapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ export function generateAndRenderAriaTree(root: Element, counter?: RefCounter):
// that strip data-pilo-ref attributes, e.g. React reconciliation)
(globalThis as any).__piloRefMap = new Map<string, Element>();

// Initialize identity map for ref -> {role, name} lookup. Used by the
// repetition detector to distinguish logical click targets when refs
// themselves churn between snapshots.
(globalThis as any).__piloIdentityMap = new Map<string, { role: string; name: string }>();

const ariaTree = generateAriaTree(root);
return renderAriaTree(ariaTree, refCounter);
}
Expand Down Expand Up @@ -493,6 +498,13 @@ function renderAriaTree(root: AriaNode, counter: RefCounter): string {
if (ariaNode.element) {
(globalThis as any).__piloRefMap?.set(ref, ariaNode.element);
}
// Capture role + accessible name for the repetition detector. Stored
// separately from refMap so consumers that only need identity don't
// have to re-walk the DOM.
(globalThis as any).__piloIdentityMap?.set(ref, {
role: String(ariaNode.role),
name: ariaNode.name ?? "",
});
}

const escapedKey = indent + "- " + yamlEscapeKeyIfNeeded(key);
Expand Down
19 changes: 19 additions & 0 deletions packages/core/src/browser/playwrightBrowser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,25 @@ export class PlaywrightBrowser implements AriaBrowser {
return locator;
}

async getRefIdentity(ref: string): Promise<{ role: string; name: string } | null> {
if (!this.page) return null;
try {
return await this.page.evaluate(
(refArg) =>
(
globalThis as unknown as {
__piloIdentityMap?: Map<string, { role: string; name: string }>;
}
).__piloIdentityMap?.get(refArg) ?? null,
ref,
);
} catch {
// The page may have navigated or torn down the identity map. Identity
// is advisory for repetition detection — log nothing, just bail out.
return null;
}
}

async performAction(ref: string, action: PageAction, value?: string): Promise<void> {
if (!this.page) throw new Error("Browser not started");
return withSpan(
Expand Down
17 changes: 17 additions & 0 deletions packages/core/src/tools/webActionTools.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,11 @@ type ActionResult = {
value?: string | number;
error?: string;
isRecoverable?: boolean;
// Snapshot-time role + accessible name for the targeted element. Optional
// (only populated when a ref is provided and the browser can resolve it).
// Used by the repetition detector to distinguish logical targets even
// when ref strings churn between snapshots.
targetIdentity?: { role: string; name: string };
};

/**
Expand Down Expand Up @@ -82,6 +87,17 @@ async function performActionWithValidation(
value,
});

// Capture target identity BEFORE the action runs — the snapshot's
// identity map is the basis for the ref the LLM emitted, and the
// action itself may invalidate the ref by causing navigation or DOM
// teardown. Identity is advisory; null means we just won't fold it
// into the repetition signature for this turn.
let targetIdentity: { role: string; name: string } | undefined;
if (ref) {
const id = await context.browser.getRefIdentity(ref);
if (id) targetIdentity = id;
}

// Perform the action
await context.browser.performAction(ref || "", action, value);

Expand All @@ -97,6 +113,7 @@ async function performActionWithValidation(
action,
...(ref && { ref }),
...(value !== undefined && { value }),
...(targetIdentity && { targetIdentity }),
};
} catch (error) {
// For browser exceptions, emit failure with error details and return error info
Expand Down
47 changes: 42 additions & 5 deletions packages/core/src/webAgent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import { streamText, ModelMessage, StreamTextResult } from "ai";
import type { ProviderConfig } from "./provider.js";
import { AriaBrowser } from "./browser/ariaBrowser.js";
import { AriaBrowser, PageAction } from "./browser/ariaBrowser.js";
import {
BrowserReconnectedEventData,
CdpEndpointConnectedEventData,
Expand Down Expand Up @@ -230,6 +230,15 @@ export class WebAgent {
private readonly onUserDataRequired: UserDataCallback | undefined;
private readonly taskId: string | undefined;

// Actions where same-action-same-value repetition is legitimate workflow
// (e.g. scrolling an infinite feed, waiting for a slow page) rather than a
// stuck-loop signal. The detector skips these entirely and resets state so
// a real loop interrupted by a scroll doesn't compound across the gap.
private static readonly REPETITION_EXEMPT_ACTIONS: ReadonlySet<string> = new Set<string>([
PageAction.Scroll,
PageAction.Wait,
]);

constructor(
private browser: AriaBrowser,
options: WebAgentOptions,
Expand Down Expand Up @@ -1154,15 +1163,24 @@ export class WebAgent {
actionExecuted: boolean;
error?: TaskError;
} | null {
// Skip exempt actions entirely, and reset state so an in-progress repeat
// count doesn't carry across them (an exempt action between two clicks
// is treated as progress, not as a transparent gap).
if (WebAgent.REPETITION_EXEMPT_ACTIONS.has(actionOutput.action)) {
executionState.actionRepeatCount = 0;
executionState.lastAction = undefined;
return null;
}

// Define explicit thresholds for warning and abort
const REPETITION_WARNING_THRESHOLD = this.maxRepeatedActions + 1;
const REPETITION_ABORT_THRESHOLD = this.maxRepeatedActions + 2;

// Create signature for current action
const currentActionSignature = this.createActionSignature(
actionOutput.action,
actionOutput.ref,
actionOutput.value,
actionOutput.targetIdentity,
);

// Check if this is the same action as the last one
Expand Down Expand Up @@ -1505,10 +1523,29 @@ export class WebAgent {
// === Helper Methods ===

/**
* Create a signature for an action to track repetitions
* Build a repetition-signature for an action. The element ref is
* deliberately excluded because refs are regenerated every snapshot — a
* logical "click Submit" gets a new ref each turn, so including it would
* make near-duplicate actions hash differently and bypass the detector.
* Element identity (role + accessible name) is folded in when present so
* ref-only actions like `click` don't collapse different logical targets
* (e.g. "click Next" vs "click Submit") onto the same signature.
* Value is normalized (lowercase + trim) so cosmetic input variation
* doesn't reset the counter on the same logical action.
*/
private createActionSignature(action: string, ref?: string, value?: string | number): string {
return `${action}:${ref || ""}:${value || ""}`;
private createActionSignature(
action: string,
value?: string | number,
identity?: { role: string; name: string },
): string {
const normalizedValue = String(value ?? "")
.toLowerCase()
.trim();
if (identity) {
const normalizedName = identity.name.toLowerCase().trim();
return `${action}:${identity.role}:${normalizedName}:${normalizedValue}`;
}
return `${action}:${normalizedValue}`;
Comment on lines 1525 to +1548
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch — fixed in 40db079.

The collapse problem affected 8 ref-only actions (click, hover, check, uncheck, focus, enter, back, forward), so plausible real-world false positives like a multi-step wizard's three "Next" clicks or a multi-checkbox form would trip the detector.

Implemented the second option you suggested (targetIdentity field on ActionResult). The path:

  1. ariaSnapshot.ts populates __piloIdentityMap: Map<ref, {role, name}> alongside the existing __piloRefMap, in the same conditional that stamps data-pilo-ref. Both browsers run the same bundled snapshot code, so the map exists in both.
  2. New AriaBrowser.getRefIdentity(ref) method — PlaywrightBrowser queries via page.evaluate, ExtensionBrowser via browser.scripting.executeScript.
  3. performActionWithValidation calls getRefIdentity(ref) before the action runs (so we capture identity while the ref is still valid — the action may navigate away or tear down the DOM) and adds the result to ActionResult as optional targetIdentity.
  4. createActionSignature now folds identity into the signature when present: click:button:Next: and click:button:Submit: hash distinctly, while click:button:Submit: remains constant across ref churn.

New test cases cover both directions:

  • should not flag clicks on different logical targets — 5 clicks with distinct targetIdentity complete without warning.
  • should detect repeated clicks on the same logical target across ref churn — 5 clicks with the same targetIdentity but churning refs still trip the abort.

Identity is advisory (missing identity → falls back to the Pass 1 signature shape), so callers that don't have ref-resolvable identity still work.

}

/**
Expand Down
32 changes: 32 additions & 0 deletions packages/core/test/tools/webActionTools.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ class MockBrowser implements AriaBrowser {
// Mock implementation - can be configured to throw errors for testing
}

async getRefIdentity(_ref: string): Promise<{ role: string; name: string } | null> {
return null;
}

async waitForLoadState(): Promise<void> {}

async runInTemporaryTab<T>(fn: (tab: any) => Promise<T>): Promise<T> {
Expand Down Expand Up @@ -256,6 +260,34 @@ describe("Web Action Tools", () => {

await expect(tools.click.execute({ ref: "btn1" })).rejects.toThrow("Network error");
});

it("should capture target identity from the browser and include it in the result", async () => {
vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce({
role: "button",
name: "Submit",
});

const result = await tools.click.execute({ ref: "btn1" });

expect(result).toEqual({
success: true,
action: "click",
ref: "btn1",
targetIdentity: { role: "button", name: "Submit" },
});
});

it("should omit targetIdentity when the browser returns null", async () => {
vi.spyOn(mockBrowser, "getRefIdentity").mockResolvedValueOnce(null);

const result = await tools.click.execute({ ref: "btn1" });

expect(result).toEqual({
success: true,
action: "click",
ref: "btn1",
});
});
});

describe("Fill Action", () => {
Expand Down
Loading
Loading