diff --git a/dotnet/uiAutomationHelper/.gitignore b/dotnet/uiAutomationHelper/.gitignore new file mode 100644 index 0000000000..2789d7166d --- /dev/null +++ b/dotnet/uiAutomationHelper/.gitignore @@ -0,0 +1,5 @@ +bin/ +obj/ +*.user +*.suo +.vs/ diff --git a/dotnet/uiAutomationHelper/UiAutomationHelper.csproj b/dotnet/uiAutomationHelper/UiAutomationHelper.csproj new file mode 100644 index 0000000000..f269663667 --- /dev/null +++ b/dotnet/uiAutomationHelper/UiAutomationHelper.csproj @@ -0,0 +1,27 @@ + + + net8.0-windows + Exe + enable + enable + latest + UiAutomationHelper + UiAutomationHelper + 0.1.0 + bin\$(Configuration) + false + $(NoWarn);SYSLIB1054;CA1508;CA1859 + + + + + + + + + + + + + + diff --git a/dotnet/uiAutomationHelper/UiAutomationHelper.sln b/dotnet/uiAutomationHelper/UiAutomationHelper.sln new file mode 100644 index 0000000000..7a0665e979 --- /dev/null +++ b/dotnet/uiAutomationHelper/UiAutomationHelper.sln @@ -0,0 +1,56 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.8.34408.163 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UiAutomationHelper", "UiAutomationHelper.csproj", "{A1B2C3D4-E5F6-7890-ABCD-1234567890AB}" +EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "test", "test", "{0C88DD14-F956-CE84-757C-A364CCF449FC}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "UiAutomationHelper.Tests", "test\UiAutomationHelper.Tests.csproj", "{04DFE25D-177A-4CD2-99A8-39B68BB1674E}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Debug|x64 = Debug|x64 + Debug|x86 = Debug|x86 + Release|Any CPU = Release|Any CPU + Release|x64 = Release|x64 + Release|x86 = Release|x86 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + 
{A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|x64.ActiveCfg = Debug|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|x64.Build.0 = Debug|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|x86.ActiveCfg = Debug|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Debug|x86.Build.0 = Debug|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|Any CPU.Build.0 = Release|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|x64.ActiveCfg = Release|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|x64.Build.0 = Release|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|x86.ActiveCfg = Release|Any CPU + {A1B2C3D4-E5F6-7890-ABCD-1234567890AB}.Release|x86.Build.0 = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|Any CPU.Build.0 = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|x64.ActiveCfg = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|x64.Build.0 = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|x86.ActiveCfg = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Debug|x86.Build.0 = Debug|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|Any CPU.ActiveCfg = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|Any CPU.Build.0 = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|x64.ActiveCfg = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|x64.Build.0 = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|x86.ActiveCfg = Release|Any CPU + {04DFE25D-177A-4CD2-99A8-39B68BB1674E}.Release|x86.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(NestedProjects) = preSolution + 
/// <summary>
/// JSON-RPC handlers for the <c>do.*</c> methods. Each handler resolves a UI
/// Automation element from a selector string and performs one action on it.
/// </summary>
internal static class ActionMethods
{
    /// <summary>Registers every <c>do.*</c> handler on <paramref name="dispatch"/>.</summary>
    public static void Register(Dispatch dispatch)
    {
        dispatch.Register("do.invoke", (p, ct) => Task.FromResult(Invoke(p)));
        dispatch.Register("do.toggle", (p, ct) => Task.FromResult(Toggle(p)));
        dispatch.Register("do.setValue", (p, ct) => Task.FromResult(SetValue(p)));
        dispatch.Register("do.select", (p, ct) => Task.FromResult(Select(p)));
        dispatch.Register("do.expand", (p, ct) => Task.FromResult(Expand(p)));
        dispatch.Register("do.scroll", (p, ct) => Task.FromResult(Scroll(p)));
        dispatch.Register("do.focus", (p, ct) => Task.FromResult(Focus(p)));
        dispatch.Register("do.click", (p, ct) => Task.FromResult(Click(p)));
        dispatch.Register("do.sendKeys", (p, ct) => Task.FromResult(SendKeys(p)));
    }

    /// <summary>Handles <c>do.invoke</c>: calls the Invoke pattern on the selected, enabled element.</summary>
    /// <exception cref="RpcException">PatternNotSupported when the element has no Invoke pattern.</exception>
    private static object? Invoke(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoInvokeParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);
        if (!el.Patterns.Invoke.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                $"Element does not support Invoke: {p.Selector}");
        }
        el.Patterns.Invoke.Pattern.Invoke();
        return new { ok = true };
    }

    /// <summary>
    /// Handles <c>do.toggle</c>. With a <c>value</c>, toggles until the control reports the
    /// requested On/Off state (at most three toggles); without one, toggles exactly once.
    /// </summary>
    /// <returns><c>{ ok, toggleState }</c> where <c>toggleState</c> is the state read back after toggling.</returns>
    private static object? Toggle(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoToggleParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);
        if (!el.Patterns.Toggle.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                $"Element does not support Toggle: {p.Selector}");
        }
        var pattern = el.Patterns.Toggle.Pattern;
        if (p.Value.HasValue)
        {
            var desired = p.Value.Value ? ToggleState.On : ToggleState.Off;
            // Tri-state controls take up to 3 toggles to reach a chosen On/Off state.
            for (int i = 0; i < 3; i++)
            {
                if (pattern.ToggleState.Value == desired)
                {
                    break;
                }
                pattern.Toggle();
            }
        }
        else
        {
            pattern.Toggle();
        }
        // The state actually reached is reported back, so a caller can detect a
        // control that never arrived at the requested state.
        return new { ok = true, toggleState = ToggleStateString(pattern.ToggleState.Value) };
    }

    /// <summary>
    /// Handles <c>do.setValue</c>. Prefers a writable Value pattern; otherwise falls back to
    /// RangeValue, parsing the value as an invariant-culture number.
    /// </summary>
    /// <exception cref="RpcException">
    /// InvalidParams when a RangeValue control receives a non-numeric value;
    /// PatternNotSupported when neither pattern is usable.
    /// </exception>
    private static object? SetValue(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoSetValueParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);
        // A missing value is treated as the empty string.
        var raw = p.Value?.ToString() ?? "";

        if (el.Patterns.Value.IsSupported &&
            el.Patterns.Value.Pattern.IsReadOnly.ValueOrDefault == false)
        {
            el.Patterns.Value.Pattern.SetValue(raw);
            return new { ok = true };
        }
        if (el.Patterns.RangeValue.IsSupported)
        {
            if (!double.TryParse(raw, System.Globalization.NumberStyles.Any,
                    System.Globalization.CultureInfo.InvariantCulture, out var num))
            {
                throw new RpcException(RpcErrorCode.InvalidParams,
                    $"Value '{raw}' is not numeric for RangeValue control");
            }
            el.Patterns.RangeValue.Pattern.SetValue(num);
            return new { ok = true };
        }
        throw new RpcException(RpcErrorCode.PatternNotSupported,
            $"Element supports neither writable Value nor RangeValue: {p.Selector}");
    }

    /// <summary>
    /// Handles <c>do.select</c>. If the element itself is a SelectionItem it is selected
    /// directly. If it is a Selection container, <c>item</c> picks a direct child either by
    /// zero-based index (JSON number) or by exact, case-sensitive name (JSON string).
    /// </summary>
    /// <exception cref="RpcException">
    /// InvalidParams when <c>item</c> is missing or is a number that is not a 32-bit integer;
    /// ElementNotFound when no child matches; PatternNotSupported when the element or the
    /// matched child lacks the required pattern.
    /// </exception>
    private static object? Select(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoSelectParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);

        if (el.Patterns.SelectionItem.IsSupported)
        {
            el.Patterns.SelectionItem.Pattern.Select();
            return new { ok = true };
        }
        if (!el.Patterns.Selection.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                $"Element supports neither SelectionItem nor Selection: {p.Selector}");
        }
        if (!p.Item.HasValue)
        {
            throw new RpcException(RpcErrorCode.InvalidParams,
                "'item' is required when selecting from a Selection container");
        }

        var item = p.Item.Value;
        AutomationElement? target = null;
        var children = el.FindAllChildren();
        if (item.ValueKind == JsonValueKind.Number)
        {
            // GetInt32 throws FormatException for values such as 1.5 or 1e20; reject
            // those as a parameter error instead of letting the exception escape.
            if (!item.TryGetInt32(out int idx))
            {
                throw new RpcException(RpcErrorCode.InvalidParams,
                    $"'item' index must be a 32-bit integer, got {item.GetRawText()}");
            }
            if (idx >= 0 && idx < children.Length)
            {
                target = children[idx];
            }
        }
        else if (item.ValueKind == JsonValueKind.String)
        {
            string? name = item.GetString();
            target = Array.Find(children,
                c => string.Equals(c.Properties.Name.ValueOrDefault, name, StringComparison.Ordinal));
        }
        // Any other JSON kind leaves target null and is reported as "no matching item".
        if (target == null)
        {
            throw new RpcException(RpcErrorCode.ElementNotFound,
                "Selection container has no matching item");
        }
        if (!target.Patterns.SelectionItem.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                "Matched child does not support SelectionItem");
        }
        target.Patterns.SelectionItem.Pattern.Select();
        return new { ok = true };
    }

    /// <summary>
    /// Handles <c>do.expand</c>: expands the element when <c>expand</c> is true (the default),
    /// collapses it otherwise.
    /// </summary>
    private static object? Expand(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoExpandParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);
        if (!el.Patterns.ExpandCollapse.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                $"Element does not support ExpandCollapse: {p.Selector}");
        }
        var pattern = el.Patterns.ExpandCollapse.Pattern;
        if (p.ExpandValue)
        {
            pattern.Expand();
        }
        else
        {
            pattern.Collapse();
        }
        return new { ok = true };
    }

    /// <summary>
    /// Handles <c>do.scroll</c>. <c>direction</c> is up/down/left/right (case-insensitive,
    /// default "down"); <c>amount</c> "large" selects a large step, anything else a small one.
    /// </summary>
    /// <exception cref="RpcException">InvalidParams for an unrecognised direction.</exception>
    private static object? Scroll(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoScrollParams>(@params);
        var el = ResolveAndCheckEnabled(p.Selector);
        if (!el.Patterns.Scroll.IsSupported)
        {
            throw new RpcException(RpcErrorCode.PatternNotSupported,
                $"Element does not support Scroll: {p.Selector}");
        }
        bool large = string.Equals(p.Amount, "large", StringComparison.OrdinalIgnoreCase);
        // Tuple is (horizontal, vertical); the axis not being scrolled gets NoAmount.
        var (h, v) = (p.Direction ?? "down").ToLowerInvariant() switch
        {
            "up" => (ScrollAmount.NoAmount, large ? ScrollAmount.LargeDecrement : ScrollAmount.SmallDecrement),
            "down" => (ScrollAmount.NoAmount, large ? ScrollAmount.LargeIncrement : ScrollAmount.SmallIncrement),
            "left" => (large ? ScrollAmount.LargeDecrement : ScrollAmount.SmallDecrement, ScrollAmount.NoAmount),
            "right" => (large ? ScrollAmount.LargeIncrement : ScrollAmount.SmallIncrement, ScrollAmount.NoAmount),
            _ => throw new RpcException(RpcErrorCode.InvalidParams,
                $"Unknown direction: {p.Direction}"),
        };
        el.Patterns.Scroll.Pattern.Scroll(h, v);
        return new { ok = true };
    }

    /// <summary>Handles <c>do.focus</c>: focuses the selected element (no enabled check).</summary>
    private static object? Focus(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoFocusParams>(@params);
        var el = ResolveRequired(p.Selector);
        el.Focus();
        return new { ok = true };
    }

    /// <summary>
    /// Handles <c>do.click</c>: sends a mouse click (right when <c>button</c> is "right",
    /// left otherwise). The click point defaults to the centre of the element's bounding
    /// rectangle; <c>position.x</c> / <c>position.y</c>, when given, replace the respective
    /// coordinate as-is.
    /// </summary>
    /// <exception cref="RpcException">
    /// InternalError when a coordinate must come from the bounding rectangle but the element
    /// reports an empty one.
    /// </exception>
    private static object? Click(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoClickParams>(@params);
        var el = ResolveRequired(p.Selector);

        double? explicitX = p.Position?.X;
        double? explicitY = p.Position?.Y;
        var rect = el.Properties.BoundingRectangle.ValueOrDefault;
        // An empty rectangle would place the centre at (0,0) and click the screen
        // origin instead of the element, so fail loudly unless the caller supplied
        // both coordinates.
        if (rect.IsEmpty && !(explicitX.HasValue && explicitY.HasValue))
        {
            throw new RpcException(RpcErrorCode.InternalError,
                $"Element has no bounding rectangle to click: {p.Selector}");
        }

        int x = explicitX.HasValue ? (int)explicitX.Value : rect.X + rect.Width / 2;
        int y = explicitY.HasValue ? (int)explicitY.Value : rect.Y + rect.Height / 2;
        var pt = new System.Drawing.Point(x, y);
        if (string.Equals(p.Button, "right", StringComparison.OrdinalIgnoreCase))
        {
            Mouse.RightClick(pt);
        }
        else
        {
            Mouse.LeftClick(pt);
        }
        return new { ok = true };
    }

    /// <summary>
    /// Handles <c>do.sendKeys</c>: types <c>keys</c> with the keyboard. When a selector is
    /// given the element is focused first; otherwise the keys go to whatever has focus.
    /// </summary>
    private static object? SendKeys(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<DoSendKeysParams>(@params);
        if (string.IsNullOrEmpty(p.Keys))
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'keys' is required");
        }
        if (!string.IsNullOrEmpty(p.Selector))
        {
            var el = SelectorResolver.ResolveOrThrow(p.Selector);
            el.Focus();
        }
        Keyboard.Type(p.Keys);
        return new { ok = true };
    }

    /// <summary>Resolves a mandatory selector, rejecting a null or empty one as InvalidParams.</summary>
    private static AutomationElement ResolveRequired(string? selector)
    {
        if (string.IsNullOrEmpty(selector))
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'selector' is required");
        }
        return SelectorResolver.ResolveOrThrow(selector);
    }

    /// <summary>
    /// Resolves a mandatory selector and additionally requires the element to be enabled.
    /// </summary>
    /// <exception cref="RpcException">ElementNotEnabled when the element reports IsEnabled = false.</exception>
    private static AutomationElement ResolveAndCheckEnabled(string? selector)
    {
        var el = ResolveRequired(selector);
        if (!el.Properties.IsEnabled.ValueOrDefault)
        {
            throw new RpcException(RpcErrorCode.ElementNotEnabled, $"Element is not enabled: {selector}");
        }
        return el;
    }

    /// <summary>Maps a <see cref="ToggleState"/> to the lowercase string used on the wire.</summary>
    private static string ToggleStateString(ToggleState s) => s switch
    {
        ToggleState.On => "on",
        ToggleState.Off => "off",
        ToggleState.Indeterminate => "indeterminate",
        _ => "unknown",
    };
}
/// <summary>
/// JSON-RPC handlers for the <c>app.*</c> methods: launching, attaching to, listing and
/// killing desktop applications.
/// </summary>
internal static class AppMethods
{
    /// <summary>Registers every <c>app.*</c> handler on <paramref name="dispatch"/>.</summary>
    public static void Register(Dispatch dispatch)
    {
        dispatch.Register("app.launch", (p, ct) => Task.FromResult(Launch(p)));
        dispatch.Register("app.attach", (p, ct) => Task.FromResult(Attach(p)));
        dispatch.Register("app.list", (p, ct) => Task.FromResult(List()));
        dispatch.Register("app.kill", (p, ct) => Task.FromResult(Kill(p)));
    }

    /// <summary>
    /// Handles <c>app.launch</c>. Launches a store app when <c>aumid</c> is given (and records
    /// the pid-to-AUMID mapping), otherwise starts <c>exePath</c> with <c>args</c>, then waits up
    /// to 15 seconds for a main window.
    /// </summary>
    /// <returns><c>{ pid, mainWindow }</c> where <c>mainWindow</c> is the window's selector.</returns>
    /// <exception cref="RpcException">
    /// InvalidParams when neither <c>aumid</c> nor <c>exePath</c> is given; Timeout when no main
    /// window is returned.
    /// </exception>
    private static object? Launch(System.Text.Json.JsonElement? @params)
    {
        var p = RpcParams.Parse<AppLaunchParams>(@params);
        Application app;
        if (!string.IsNullOrEmpty(p.Aumid))
        {
            app = Application.LaunchStoreApp(p.Aumid);
            AppRegistry.Register(app.ProcessId, p.Aumid);
        }
        else if (!string.IsNullOrEmpty(p.ExePath))
        {
            // Each array entry is one discrete argument. ArgumentList applies the
            // platform quoting rules, so entries containing spaces or quotes are not
            // split apart the way a space-joined command line would be.
            var startInfo = new ProcessStartInfo(p.ExePath);
            if (p.Args != null)
            {
                foreach (var arg in p.Args)
                {
                    if (arg != null)
                    {
                        startInfo.ArgumentList.Add(arg);
                    }
                }
            }
            app = Application.Launch(startInfo);
        }
        else
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "Either 'aumid' or 'exePath' is required");
        }

        var window = app.GetMainWindow(AutomationHost.Automation, TimeSpan.FromSeconds(15));
        if (window == null)
        {
            throw new RpcException(RpcErrorCode.Timeout, "Launched app has no main window");
        }
        return new { pid = app.ProcessId, mainWindow = BuildWindowSelector(window) };
    }

    /// <summary>
    /// Handles <c>app.attach</c>. Attaches by <c>pid</c>, or by the first top-level window whose
    /// title contains <c>windowTitle</c>, then waits up to 5 seconds for a main window.
    /// </summary>
    /// <exception cref="RpcException">
    /// InvalidParams when neither parameter is given; ElementNotFound when no window title
    /// matches; Timeout when no main window is returned.
    /// </exception>
    private static object? Attach(System.Text.Json.JsonElement? @params)
    {
        var p = RpcParams.Parse<AppAttachParams>(@params);
        int pid;
        if (p.Pid.HasValue)
        {
            pid = p.Pid.Value;
        }
        else if (!string.IsNullOrEmpty(p.WindowTitle))
        {
            var found = FindWindowPidByTitle(p.WindowTitle);
            if (found == null)
            {
                throw new RpcException(RpcErrorCode.ElementNotFound, $"No window matching '{p.WindowTitle}'");
            }
            pid = found.Value;
        }
        else
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "Either 'pid' or 'windowTitle' is required");
        }

        var app = Application.Attach(pid);
        var window = app.GetMainWindow(AutomationHost.Automation, TimeSpan.FromSeconds(5));
        if (window == null)
        {
            throw new RpcException(RpcErrorCode.Timeout, "Attached app has no main window");
        }
        return new { pid, mainWindow = BuildWindowSelector(window) };
    }

    /// <summary>
    /// Handles <c>app.list</c>: enumerates the desktop's direct Window children and returns
    /// one entry per window that has a positive process id and a non-blank title.
    /// </summary>
    private static object? List() => ComRetry.Run(() =>
    {
        var desktop = AutomationHost.Automation.GetDesktop();
        var cf = AutomationHost.Automation.ConditionFactory;
        var windows = desktop.FindAllChildren(cf.ByControlType(ControlType.Window));
        var results = new List<object>();
        foreach (var w in windows)
        {
            var pid = w.Properties.ProcessId.ValueOrDefault;
            var title = w.Properties.Name.ValueOrDefault ?? "";
            if (pid <= 0 || string.IsNullOrWhiteSpace(title))
            {
                continue;
            }
            results.Add(new
            {
                pid,
                title,
                aumid = AppRegistry.GetAumid(pid),
                mainWindow = BuildWindowSelector(w),
            });
        }
        return (object?)results;
    });

    /// <summary>
    /// Handles <c>app.kill</c>. Asks the process to close its main window, waits 2 seconds,
    /// then kills the whole process tree if it is still running. A pid that is not running is
    /// treated as already killed. The pid is always removed from <c>AppRegistry</c>.
    /// </summary>
    /// <exception cref="RpcException">InvalidParams when <c>pid</c> is missing.</exception>
    private static object? Kill(System.Text.Json.JsonElement? @params)
    {
        var p = RpcParams.Parse<AppKillParams>(@params);
        if (!p.Pid.HasValue)
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'pid' is required");
        }
        try
        {
            using var proc = Process.GetProcessById(p.Pid.Value);
            try { proc.CloseMainWindow(); } catch { /* CloseMainWindow can throw on UWP */ }
            if (!proc.WaitForExit(2000))
            {
                proc.Kill(entireProcessTree: true);
                proc.WaitForExit(2000);
            }
        }
        catch (ArgumentException)
        {
            // Process not running — treat as already-killed
        }
        AppRegistry.Forget(p.Pid.Value);
        return new { ok = true };
    }

    /// <summary>
    /// Returns the process id of the first top-level window whose name contains
    /// <paramref name="substringMatch"/> (case-insensitive) and whose pid is positive,
    /// or <c>null</c> when none matches.
    /// </summary>
    private static int? FindWindowPidByTitle(string substringMatch)
    {
        var desktop = AutomationHost.Automation.GetDesktop();
        var cf = AutomationHost.Automation.ConditionFactory;
        var windows = desktop.FindAllChildren(cf.ByControlType(ControlType.Window));
        foreach (var w in windows)
        {
            var name = w.Properties.Name.ValueOrDefault ?? "";
            if (name.Contains(substringMatch, StringComparison.OrdinalIgnoreCase))
            {
                var pid = w.Properties.ProcessId.ValueOrDefault;
                if (pid > 0)
                {
                    return pid;
                }
            }
        }
        return null;
    }

    /// <summary>Builds the absolute-path selector string for <paramref name="window"/>.</summary>
    internal static string BuildWindowSelector(AutomationElement window) =>
        Selectors.BuildAbsolutePath(window);
}
/// <summary>
/// JSON-RPC handlers for the <c>events.*</c> methods: UI Automation event subscriptions
/// (delivered as <c>event.fired</c> notifications) and an idle-wait helper.
/// </summary>
internal static class EventMethods
{
    /// <summary>Registers every <c>events.*</c> handler on <paramref name="dispatch"/>.</summary>
    public static void Register(Dispatch dispatch)
    {
        dispatch.Register("events.idle", IdleAsync);
        dispatch.Register("events.subscribe", (p, ct) => Task.FromResult(Subscribe(p)));
        dispatch.Register("events.unsubscribe", (p, ct) => Task.FromResult(Unsubscribe(p)));
    }

    /// <summary>
    /// Handles <c>events.subscribe</c>: attaches one subtree-scoped handler per requested
    /// event type under the <c>root</c> element and stores the subscription in
    /// <c>SubscriptionRegistry</c>.
    /// </summary>
    /// <returns><c>{ subscriptionId }</c>.</returns>
    /// <exception cref="RpcException">
    /// InvalidParams when <c>root</c> or <c>eventTypes</c> is missing/empty or an event type is
    /// not one of Invoked, ValueChanged, ToggleStateChanged, StructureChanged.
    /// </exception>
    private static object? Subscribe(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<EventsSubscribeParams>(@params);
        if (string.IsNullOrEmpty(p.Root))
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'root' is required");
        }
        if (p.EventTypes == null || p.EventTypes.Length == 0)
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'eventTypes' is required");
        }
        var root = SelectorResolver.ResolveOrThrow(p.Root);
        var sub = new Subscription { RootSelector = p.Root, Root = root };
        var automation = AutomationHost.Automation;
        // Captured by the callbacks so they do not hold on to the Subscription itself.
        var subId = sub.Id;

        try
        {
            foreach (var eventType in p.EventTypes)
            {
                switch (eventType)
                {
                    case "Invoked":
                    {
                        var ev = automation.EventLibrary.Invoke.InvokedEvent;
                        var h = root.RegisterAutomationEvent(
                            ev,
                            TreeScope.Subtree,
                            (el, eid) => OnAutomationEvent(subId, "Invoked", el));
                        sub.AutomationHandlers.Add((ev, h));
                        break;
                    }
                    case "ValueChanged":
                    {
                        var propertyId = automation.PropertyLibrary.Value.Value;
                        var h = root.RegisterPropertyChangedEvent(
                            TreeScope.Subtree,
                            (el, prop, val) => OnPropertyChangedEvent(subId, "ValueChanged", el, val),
                            propertyId);
                        sub.PropertyChangedHandlers.Add(h);
                        break;
                    }
                    case "ToggleStateChanged":
                    {
                        var propertyId = automation.PropertyLibrary.Toggle.ToggleState;
                        var h = root.RegisterPropertyChangedEvent(
                            TreeScope.Subtree,
                            (el, prop, val) => OnPropertyChangedEvent(subId, "ToggleStateChanged", el, val),
                            propertyId);
                        sub.PropertyChangedHandlers.Add(h);
                        break;
                    }
                    case "StructureChanged":
                    {
                        var h = root.RegisterStructureChangedEvent(
                            TreeScope.Subtree,
                            (el, type, ridArr) => OnStructureChangedEvent(subId, el, type));
                        sub.StructureChangedHandlers.Add(h);
                        break;
                    }
                    default:
                        throw new RpcException(RpcErrorCode.InvalidParams,
                            $"Unknown eventType: {eventType}. Supported: Invoked, ValueChanged, ToggleStateChanged, StructureChanged.");
                }
            }
        }
        catch
        {
            // Any failure part-way through (unknown type, or a Register* call that
            // throws) must not leave earlier handlers attached with no subscription
            // id to remove them by. Dispose is presumably what detaches the handlers
            // collected so far — confirm against Subscription.
            sub.Dispose();
            throw;
        }
        SubscriptionRegistry.Add(sub);
        return new { subscriptionId = sub.Id };
    }

    /// <summary>
    /// Handles <c>events.unsubscribe</c>: removes the subscription from the registry.
    /// </summary>
    /// <returns><c>{ ok }</c>, carrying the result of <c>SubscriptionRegistry.Remove</c>.</returns>
    private static object? Unsubscribe(JsonElement? @params)
    {
        var p = RpcParams.ParseRequired<EventsUnsubscribeParams>(@params);
        if (string.IsNullOrEmpty(p.SubscriptionId))
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'subscriptionId' is required");
        }
        var ok = SubscriptionRegistry.Remove(p.SubscriptionId);
        return new { ok };
    }

    /// <summary>Sends an <c>event.fired</c> notification for a plain automation event.</summary>
    private static void OnAutomationEvent(string subId, string type, AutomationElement el)
    {
        try
        {
            var selector = Selectors.BuildAbsolutePath(el);
            Notifier.Send("event.fired", new
            {
                subscriptionId = subId,
                eventType = type,
                selector,
                controlSnapshot = TakeSnapshot(el),
                timestamp = DateTime.UtcNow.ToString("o"),
            });
        }
        catch
        {
            // Element may have been torn down between event firing and our
            // attempt to read it. Drop silently.
        }
    }

    /// <summary>
    /// Sends an <c>event.fired</c> notification for a property change, including the new
    /// value rendered with <c>ToString()</c>.
    /// </summary>
    private static void OnPropertyChangedEvent(string subId, string type, AutomationElement el, object? newValue)
    {
        try
        {
            var selector = Selectors.BuildAbsolutePath(el);
            Notifier.Send("event.fired", new
            {
                subscriptionId = subId,
                eventType = type,
                selector,
                controlSnapshot = TakeSnapshot(el),
                newValue = newValue?.ToString(),
                timestamp = DateTime.UtcNow.ToString("o"),
            });
        }
        catch
        {
            // Best effort, same as OnAutomationEvent: a failure while reading the
            // element or sending drops this one notification.
        }
    }

    /// <summary>
    /// Sends an <c>event.fired</c> notification for a structure change, including the
    /// change type name.
    /// </summary>
    private static void OnStructureChangedEvent(string subId, AutomationElement el, StructureChangeType type)
    {
        try
        {
            var selector = Selectors.BuildAbsolutePath(el);
            Notifier.Send("event.fired", new
            {
                subscriptionId = subId,
                eventType = "StructureChanged",
                selector,
                changeType = type.ToString(),
                controlSnapshot = TakeSnapshot(el),
                timestamp = DateTime.UtcNow.ToString("o"),
            });
        }
        catch
        {
            // Best effort, same as OnAutomationEvent: a failure while reading the
            // element or sending drops this one notification.
        }
    }

    /// <summary>Captures the identifying properties and current value/toggle state of an element.</summary>
    private static object TakeSnapshot(AutomationElement el) => new
    {
        controlType = el.ControlType.ToString(),
        name = el.Properties.Name.ValueOrDefault,
        automationId = el.Properties.AutomationId.ValueOrDefault,
        className = el.Properties.ClassName.ValueOrDefault,
        value = TryGetValue(el),
        toggleState = TryGetToggleState(el),
    };

    /// <summary>Returns the Value-pattern value, or <c>null</c> when unsupported or unreadable.</summary>
    private static string? TryGetValue(AutomationElement el)
    {
        try
        {
            return el.Patterns.Value.IsSupported
                ? el.Patterns.Value.Pattern.Value.ValueOrDefault
                : null;
        }
        catch { return null; }
    }

    /// <summary>Returns the Toggle-pattern state name, or <c>null</c> when unsupported or unreadable.</summary>
    private static string? TryGetToggleState(AutomationElement el)
    {
        try
        {
            return el.Patterns.Toggle.IsSupported
                ? el.Patterns.Toggle.Pattern.ToggleState.ValueOrDefault.ToString()
                : null;
        }
        catch { return null; }
    }

    /// <summary>
    /// Handles <c>events.idle</c>: polls every 50 ms until <c>EventBridge</c> reports at least
    /// <c>debounceMs</c> (default 500) of quiet, or until <c>maxWaitMs</c> (default 10000) has
    /// elapsed.
    /// </summary>
    /// <returns><c>{ ok, idle, waitedMs }</c>; <c>idle</c> is false when the wait timed out.</returns>
    private static async Task<object?> IdleAsync(System.Text.Json.JsonElement? @params, CancellationToken ct)
    {
        var p = RpcParams.Parse<EventsIdleParams>(@params);
        int debounceMs = p.DebounceMs ?? 500;
        int maxWaitMs = p.MaxWaitMs ?? 10000;

        EventBridge.EnsureSubscribed();
        // Restart the quiet-time measurement so activity before this call is not counted.
        EventBridge.ResetActivityClock();

        var sw = Stopwatch.StartNew();
        while (sw.ElapsedMilliseconds < maxWaitMs)
        {
            if (EventBridge.QuietMs() >= debounceMs)
            {
                return new { ok = true, idle = true, waitedMs = sw.ElapsedMilliseconds };
            }
            await Task.Delay(50, ct).ConfigureAwait(false);
        }
        return new { ok = true, idle = false, waitedMs = sw.ElapsedMilliseconds };
    }
}
/// <summary>
/// JSON-RPC handler for <c>find</c>: reports whether a selector currently resolves,
/// optionally polling until it does or a timeout elapses.
/// </summary>
internal static class FindMethods
{
    /// <summary>Registers the <c>find</c> handler on <paramref name="dispatch"/>.</summary>
    public static void Register(Dispatch dispatch) => dispatch.Register("find", FindAsync);

    /// <summary>
    /// Resolves <c>selector</c> repeatedly, pausing 100 ms between attempts, until it
    /// resolves or <c>timeoutMs</c> (default 0, i.e. a single attempt) has elapsed.
    /// </summary>
    /// <returns>
    /// <c>{ found = true, resolved }</c> on success, <c>{ found = false }</c> on timeout.
    /// </returns>
    /// <exception cref="RpcException">InvalidParams when <c>selector</c> is missing or empty.</exception>
    private static async Task<object?> FindAsync(System.Text.Json.JsonElement? @params, CancellationToken ct)
    {
        var request = RpcParams.ParseRequired<FindParams>(@params);
        if (string.IsNullOrEmpty(request.Selector))
        {
            throw new RpcException(RpcErrorCode.InvalidParams, "'selector' is required");
        }

        // Parse once; only resolution is retried.
        var parsedPath = SelectorParser.Parse(request.Selector);
        int budgetMs = request.TimeoutMs.GetValueOrDefault();
        var clock = Stopwatch.StartNew();

        for (;;)
        {
            bool resolved = SelectorResolver.Resolve(parsedPath) != null;
            if (resolved)
            {
                return new { found = true, resolved = request.Selector };
            }

            bool outOfTime = clock.ElapsedMilliseconds >= budgetMs;
            if (outOfTime)
            {
                return new { found = false };
            }

            await Task.Delay(100, ct).ConfigureAwait(false);
        }
    }
}
"0.0.0.0"; + return Task.FromResult(new { ok = true, version }); + }); + } +} diff --git a/dotnet/uiAutomationHelper/src/Methods/Register.cs b/dotnet/uiAutomationHelper/src/Methods/Register.cs new file mode 100644 index 0000000000..33ee300e9a --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Methods/Register.cs @@ -0,0 +1,21 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using UiAutomationHelper.Rpc; + +namespace UiAutomationHelper.Methods; + +internal static class Register +{ + public static void All(Dispatch dispatch) + { + HealthMethods.Register(dispatch); + AppMethods.Register(dispatch); + TreeMethods.Register(dispatch); + ScreenshotMethods.Register(dispatch); + ActionMethods.Register(dispatch); + FindMethods.Register(dispatch); + EventMethods.Register(dispatch); + SnapshotMethods.Register(dispatch); + } +} diff --git a/dotnet/uiAutomationHelper/src/Methods/ScreenshotMethods.cs b/dotnet/uiAutomationHelper/src/Methods/ScreenshotMethods.cs new file mode 100644 index 0000000000..6fded2731d --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Methods/ScreenshotMethods.cs @@ -0,0 +1,46 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text.Json.Serialization; +using UiAutomationHelper.Models; +using UiAutomationHelper.Rpc; +using UiAutomationHelper.Uia; + +namespace UiAutomationHelper.Methods; + +internal static class ScreenshotMethods +{ + public static void Register(Dispatch dispatch) + { + dispatch.Register("screenshot", (p, ct) => Task.FromResult(Capture(p))); + } + + private static object? Capture(System.Text.Json.JsonElement? 
/// <summary>
/// Registers the three <c>snapshot.*</c> handlers on <paramref name="dispatch"/>. Each
/// handler runs synchronously and is wrapped in an already-completed task.
/// </summary>
public static void Register(Dispatch dispatch)
{
    dispatch.Register("snapshot.capture", (args, _) => Task.FromResult(Capture(args)));
    dispatch.Register("snapshot.restore", (args, _) => Task.FromResult(Restore(args)));
    dispatch.Register("snapshot.delete", (args, _) => Task.FromResult(Delete(args)));
}
@params) + { + var p = RpcParams.ParseRequired(@params); + if (string.IsNullOrEmpty(p.SnapshotDir) || p.Policy == null) + { + throw new RpcException(RpcErrorCode.InvalidParams, "'snapshotDir' and 'policy' are required"); + } + var policy = p.Policy; + var snapshotDir = Path.GetFullPath(p.SnapshotDir); + Directory.CreateDirectory(snapshotDir); + + if (NeedsKill(policy)) + { + ProcessKiller.KillByIdentity( + policy.ProcessIdentity?.Aumid, + policy.ProcessIdentity?.ProcessName); + } + + var manifest = new SnapshotManifest + { + CapturedAt = DateTime.UtcNow.ToString("o"), + IntegrationName = policy.IntegrationName, + }; + long total = 0; + var sourcesDir = Path.Combine(snapshotDir, "sources"); + Directory.CreateDirectory(sourcesDir); + + for (int i = 0; i < policy.State.Count; i++) + { + var src = policy.State[i]; + var record = new SnapshotSourceRecord { Index = i, Kind = src.Kind }; + switch (src.Kind) + { + case "folder": + var path = ExpandPath(src.Path ?? ""); + record.Source = path; + var slot = Path.Combine(sourcesDir, i.ToString()); + record.StoredAt = Path.GetRelativePath(snapshotDir, slot).Replace('\\', '/'); + record.Bytes = FolderSnapshotter.Capture(path, slot, src.Exclude); + break; + case "registry": + case "appCommand": + throw new RpcException(RpcErrorCode.InvalidParams, + $"Source kind '{src.Kind}' not yet supported (slice 3a is folder-only)"); + default: + throw new RpcException(RpcErrorCode.InvalidParams, + $"Unknown source kind: {src.Kind}"); + } + manifest.Sources.Add(record); + total += record.Bytes; + } + manifest.TotalBytes = total; + File.WriteAllText( + Path.Combine(snapshotDir, "manifest.json"), + JsonSerializer.Serialize(manifest, ManifestJsonOpts)); + + return new + { + snapshotId = Path.GetFileName(snapshotDir), + bytes = total, + sourceCount = manifest.Sources.Count, + }; + } + + private static object? Restore(JsonElement? 
@params) + { + var p = RpcParams.ParseRequired(@params); + if (string.IsNullOrEmpty(p.SnapshotDir) || p.Policy == null) + { + throw new RpcException(RpcErrorCode.InvalidParams, "'snapshotDir' and 'policy' are required"); + } + var snapshotDir = Path.GetFullPath(p.SnapshotDir); + if (!Directory.Exists(snapshotDir)) + { + throw new RpcException(RpcErrorCode.SnapshotMissing, $"Snapshot not found: {snapshotDir}"); + } + var policy = p.Policy; + + if (NeedsKill(policy)) + { + ProcessKiller.KillByIdentity( + policy.ProcessIdentity?.Aumid, + policy.ProcessIdentity?.ProcessName); + } + + var sourcesDir = Path.Combine(snapshotDir, "sources"); + long total = 0; + for (int i = 0; i < policy.State.Count; i++) + { + var src = policy.State[i]; + switch (src.Kind) + { + case "folder": + var target = ExpandPath(src.Path ?? ""); + var slot = Path.Combine(sourcesDir, i.ToString()); + total += FolderSnapshotter.Restore(slot, target); + break; + case "registry": + case "appCommand": + throw new RpcException(RpcErrorCode.InvalidParams, + $"Source kind '{src.Kind}' not yet supported (slice 3a is folder-only)"); + } + } + return new { ok = true, bytes = total }; + } + + private static object? Delete(JsonElement? @params) + { + var p = RpcParams.ParseRequired(@params); + if (string.IsNullOrEmpty(p.SnapshotDir)) + { + throw new RpcException(RpcErrorCode.InvalidParams, "'snapshotDir' is required"); + } + var dir = Path.GetFullPath(p.SnapshotDir); + if (Directory.Exists(dir)) + { + Directory.Delete(dir, recursive: true); + } + return new { ok = true }; + } + + private static bool NeedsKill(SnapshotPolicy policy) + { + foreach (var s in policy.State) + { + // Default: kill for folder + appCommand, not for registry. + var defaultKill = s.Kind switch + { + "registry" => false, + _ => true, + }; + if (s.RequireKill ?? 
defaultKill) + { + return true; + } + } + return false; + } + + private static string ExpandPath(string p) => + Environment.ExpandEnvironmentVariables(p); +} + +internal sealed class SnapshotCaptureParams +{ + [JsonPropertyName("snapshotDir")] public string? SnapshotDir { get; set; } + [JsonPropertyName("policy")] public SnapshotPolicy? Policy { get; set; } +} + +internal sealed class SnapshotDeleteParams +{ + [JsonPropertyName("snapshotDir")] public string? SnapshotDir { get; set; } +} diff --git a/dotnet/uiAutomationHelper/src/Methods/TreeMethods.cs b/dotnet/uiAutomationHelper/src/Methods/TreeMethods.cs new file mode 100644 index 0000000000..04178adb3a --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Methods/TreeMethods.cs @@ -0,0 +1,67 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text.Json.Serialization; +using UiAutomationHelper.Models; +using UiAutomationHelper.Rpc; +using UiAutomationHelper.Uia; + +namespace UiAutomationHelper.Methods; + +internal static class TreeMethods +{ + public static void Register(Dispatch dispatch) + { + dispatch.Register("tree.dump", (p, ct) => Task.FromResult(Dump(p))); + dispatch.Register("tree.fingerprint", (p, ct) => Task.FromResult(Fingerprint(p))); + } + + private static object? Dump(System.Text.Json.JsonElement? @params) + { + var p = RpcParams.ParseRequired(@params); + if (string.IsNullOrEmpty(p.Root)) + { + throw new RpcException(RpcErrorCode.InvalidParams, "'root' is required"); + } + return ComRetry.Run(() => + { + var element = SelectorResolver.ResolveOrThrow(p.Root); + int depth = p.MaxDepth ?? 20; + return (object?)TreeWalker.Walk(element, p.Root, depth); + }); + } + + private static object? Fingerprint(System.Text.Json.JsonElement? 
@params) + { + var p = RpcParams.ParseRequired(@params); + if (string.IsNullOrEmpty(p.Root)) + { + throw new RpcException(RpcErrorCode.InvalidParams, "'root' is required"); + } + return ComRetry.Run(() => + { + var element = SelectorResolver.ResolveOrThrow(p.Root); + var result = FingerprintComputer.Compute(element, p.Root, p.DynamicRules); + return (object?)new + { + hash = result.Hash, + controlCount = result.ControlCount, + activeWindowTitle = result.ActiveWindowTitle, + focusedSelector = result.FocusedSelector, + }; + }); + } +} + +internal sealed class TreeDumpParams +{ + [JsonPropertyName("root")] public string? Root { get; set; } + [JsonPropertyName("maxDepth")] public int? MaxDepth { get; set; } + [JsonPropertyName("filter")] public string? Filter { get; set; } +} + +internal sealed class TreeFingerprintParams +{ + [JsonPropertyName("root")] public string? Root { get; set; } + [JsonPropertyName("dynamicRules")] public DynamicControlRule[]? DynamicRules { get; set; } +} diff --git a/dotnet/uiAutomationHelper/src/Models/DynamicControlRule.cs b/dotnet/uiAutomationHelper/src/Models/DynamicControlRule.cs new file mode 100644 index 0000000000..72be3849a7 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Models/DynamicControlRule.cs @@ -0,0 +1,38 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text.Json.Serialization; + +namespace UiAutomationHelper.Models; + +internal sealed class DynamicControlRule +{ + [JsonPropertyName("id")] public string Id { get; set; } = ""; + [JsonPropertyName("match")] public ControlMatcher Match { get; set; } = new(); + [JsonPropertyName("dynamicProperties")] public string[] DynamicProperties { get; set; } = Array.Empty(); + [JsonPropertyName("semantic")] public string? Semantic { get; set; } + [JsonPropertyName("reason")] public string? 
Reason { get; set; } + [JsonPropertyName("confidence")] public double Confidence { get; set; } + [JsonPropertyName("observations")] public int Observations { get; set; } + [JsonPropertyName("firstSeen")] public string? FirstSeen { get; set; } + [JsonPropertyName("lastConfirmed")] public string? LastConfirmed { get; set; } + [JsonPropertyName("notes")] public string? Notes { get; set; } +} + +internal sealed class ControlMatcher +{ + /// + /// One of: "automationId", "selector", "selectorPattern", "container". + /// + [JsonPropertyName("kind")] public string Kind { get; set; } = ""; + + // automationId / selector / selectorPattern + [JsonPropertyName("value")] public string? Value { get; set; } + [JsonPropertyName("pattern")] public string? Pattern { get; set; } + + // container + [JsonPropertyName("container")] public string? Container { get; set; } + [JsonPropertyName("controlType")] public string? ControlType { get; set; } + [JsonPropertyName("nameRegex")] public string? NameRegex { get; set; } + [JsonPropertyName("classNameRegex")] public string? ClassNameRegex { get; set; } +} diff --git a/dotnet/uiAutomationHelper/src/Models/RpcErrorCode.cs b/dotnet/uiAutomationHelper/src/Models/RpcErrorCode.cs new file mode 100644 index 0000000000..a1e52fd26b --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Models/RpcErrorCode.cs @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+namespace UiAutomationHelper.Models;
+
+internal enum RpcErrorCode
+{
+    ParseError = -32700,
+    InvalidRequest = -32600,
+    MethodNotFound = -32601,
+    InvalidParams = -32602,
+    InternalError = -32603,
+    ElementNotFound = -32001,
+    ElementNotEnabled = -32002,
+    PatternNotSupported = -32003,
+    AppCrashed = -32004,
+    Timeout = -32005,
+    SnapshotPolicyInvalid = -32006,
+    SnapshotMissing = -32007,
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/RpcException.cs b/dotnet/uiAutomationHelper/src/Models/RpcException.cs
new file mode 100644
index 0000000000..3dc45968b3
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/RpcException.cs
@@ -0,0 +1,17 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class RpcException : Exception
+{
+    public RpcErrorCode Code { get; }
+    public object? ErrorData { get; }
+
+    public RpcException(RpcErrorCode code, string message, object? data = null)
+        : base(message)
+    {
+        Code = code;
+        ErrorData = data;
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/RpcNotification.cs b/dotnet/uiAutomationHelper/src/Models/RpcNotification.cs
new file mode 100644
index 0000000000..f900a2bd94
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/RpcNotification.cs
@@ -0,0 +1,13 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json.Serialization;
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class RpcNotification
+{
+    [JsonPropertyName("jsonrpc")] public string JsonRpc { get; init; } = "2.0";
+    [JsonPropertyName("method")] public string Method { get; init; } = "";
+    [JsonPropertyName("params")] public object? Params { get; init; }
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/RpcRequest.cs b/dotnet/uiAutomationHelper/src/Models/RpcRequest.cs
new file mode 100644
index 0000000000..2edc6958ff
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/RpcRequest.cs
@@ -0,0 +1,15 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class RpcRequest
+{
+    [JsonPropertyName("jsonrpc")] public string? JsonRpc { get; set; }
+    [JsonPropertyName("id")] public JsonElement? Id { get; set; }
+    [JsonPropertyName("method")] public string? Method { get; set; }
+    [JsonPropertyName("params")] public JsonElement? Params { get; set; }
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/RpcResponse.cs b/dotnet/uiAutomationHelper/src/Models/RpcResponse.cs
new file mode 100644
index 0000000000..e79dbe5de4
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/RpcResponse.cs
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class RpcResponse
+{
+    [JsonPropertyName("jsonrpc")] public string JsonRpc { get; init; } = "2.0";
+    // JSON-RPC 2.0 requires "id" on every response (null when it could not be
+    // determined), so exempt it from a serializer-wide WhenWritingNull setting.
+    [JsonPropertyName("id"), JsonIgnore(Condition = JsonIgnoreCondition.Never)] public JsonElement? Id { get; init; }
+    [JsonPropertyName("result")] public object? Result { get; init; }
+    [JsonPropertyName("error")] public RpcErrorObject? Error { get; init; }
+
+    public static RpcResponse Success(JsonElement? id, object? result) =>
+        new() { Id = id, Result = result };
+
+    public static RpcResponse Fail(JsonElement? id, RpcErrorCode code, string message, object? data = null) =>
+        new()
+        {
+            Id = id,
+            Error = new RpcErrorObject { Code = (int)code, Message = message, Data = data },
+        };
+}
+
+internal sealed class RpcErrorObject
+{
+    [JsonPropertyName("code")] public int Code { get; set; }
+    [JsonPropertyName("message")] public string Message { get; set; } = "";
+    [JsonPropertyName("data")] public object? Data { get; set; }
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/SelectorPath.cs b/dotnet/uiAutomationHelper/src/Models/SelectorPath.cs
new file mode 100644
index 0000000000..6cf7541dfe
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/SelectorPath.cs
@@ -0,0 +1,13 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Models;
+
+internal sealed record SelectorPath(IReadOnlyList<SelectorSegment> Segments);
+
+internal sealed record SelectorSegment(
+    string ControlType,
+    string? Name = null,
+    string? AutomationId = null,
+    string? ClassName = null,
+    int? Index = null);
diff --git a/dotnet/uiAutomationHelper/src/Models/SnapshotPolicy.cs b/dotnet/uiAutomationHelper/src/Models/SnapshotPolicy.cs
new file mode 100644
index 0000000000..506f6e5008
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/SnapshotPolicy.cs
@@ -0,0 +1,70 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json.Serialization;
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class SnapshotPolicy
+{
+    [JsonPropertyName("version")] public int Version { get; set; } = 1;
+    [JsonPropertyName("integrationName")] public string IntegrationName { get; set; } = "";
+    [JsonPropertyName("detectionStatus")] public string? DetectionStatus { get; set; }
+    [JsonPropertyName("processIdentity")] public ProcessIdentity? ProcessIdentity { get; set; }
+    [JsonPropertyName("state")] public List<SnapshotSource> State { get; set; } = new();
+    [JsonPropertyName("hooks")] public PolicyHooks? Hooks { get; set; }
+}
+
+internal sealed class ProcessIdentity
+{
+    [JsonPropertyName("aumid")] public string? Aumid { get; set; }
+    [JsonPropertyName("processName")] public string? ProcessName { get; set; }
+    [JsonPropertyName("exePath")] public string? ExePath { get; set; }
+}
+
+internal sealed class SnapshotSource
+{
+    [JsonPropertyName("kind")] public string Kind { get; set; } = "";
+    // folder
+    [JsonPropertyName("path")] public string? Path { get; set; }
+    [JsonPropertyName("recursive")] public bool? Recursive { get; set; }
+    [JsonPropertyName("exclude")] public string[]? Exclude { get; set; }
+    // registry
+    [JsonPropertyName("key")] public string? Key { get; set; }
+    // appCommand
+    [JsonPropertyName("capture")] public ScriptHook? Capture { get; set; }
+    [JsonPropertyName("restore")] public ScriptHook? Restore { get; set; }
+    // shared
+    [JsonPropertyName("requireKill")] public bool? RequireKill { get; set; }
+}
+
+internal sealed class ScriptHook
+{
+    [JsonPropertyName("command")] public string Command { get; set; } = "";
+    [JsonPropertyName("args")] public string[]? Args { get; set; }
+    [JsonPropertyName("cwd")] public string? Cwd { get; set; }
+}
+
+internal sealed class PolicyHooks
+{
+    [JsonPropertyName("beforeCapture")] public ScriptHook[]? BeforeCapture { get; set; }
+    [JsonPropertyName("afterRestore")] public ScriptHook[]? AfterRestore { get; set; }
+}
+
+internal sealed class SnapshotManifest
+{
+    [JsonPropertyName("version")] public int Version { get; set; } = 1;
+    [JsonPropertyName("capturedAt")] public string CapturedAt { get; set; } = "";
+    [JsonPropertyName("integrationName")] public string IntegrationName { get; set; } = "";
+    [JsonPropertyName("sources")] public List<SnapshotSourceRecord> Sources { get; set; } = new();
+    [JsonPropertyName("totalBytes")] public long TotalBytes { get; set; }
+}
+
+internal sealed class SnapshotSourceRecord
+{
+    [JsonPropertyName("index")] public int Index { get; set; }
+    [JsonPropertyName("kind")] public string Kind { get; set; } = "";
+    [JsonPropertyName("source")] public string Source { get; set; } = ""; // resolved path/key
+    [JsonPropertyName("storedAt")] public string StoredAt { get; set; } = ""; // relative to snapshot dir
+    [JsonPropertyName("bytes")] public long Bytes { get; set; }
+}
diff --git a/dotnet/uiAutomationHelper/src/Models/TreeNode.cs b/dotnet/uiAutomationHelper/src/Models/TreeNode.cs
new file mode 100644
index 0000000000..9c308e6b94
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Models/TreeNode.cs
@@ -0,0 +1,29 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json.Serialization;
+
+namespace UiAutomationHelper.Models;
+
+internal sealed class TreeNode
+{
+    [JsonPropertyName("selector")] public string Selector { get; set; } = "";
+    [JsonPropertyName("automationId")] public string? AutomationId { get; set; }
+    [JsonPropertyName("name")] public string? Name { get; set; }
+    [JsonPropertyName("controlType")] public string ControlType { get; set; } = "";
+    [JsonPropertyName("className")] public string? ClassName { get; set; }
+    [JsonPropertyName("isEnabled")] public bool IsEnabled { get; set; }
+    [JsonPropertyName("isOffscreen")] public bool IsOffscreen { get; set; }
+    [JsonPropertyName("hasKeyboardFocus")] public bool HasKeyboardFocus { get; set; }
+    [JsonPropertyName("patterns")] public List<string> Patterns { get; set; } = new(); // NOTE(review): element type assumed to be string — confirm against TreeWalker
+    [JsonPropertyName("boundingRect")] public Rect BoundingRect { get; set; } = new(0, 0, 0, 0);
+    [JsonPropertyName("value")] public string? Value { get; set; }
+    [JsonPropertyName("toggleState")] public string? ToggleState { get; set; }
+    [JsonPropertyName("children")] public List<TreeNode> Children { get; set; } = new();
+}
+
+internal sealed record Rect(
+    [property: JsonPropertyName("x")] double X,
+    [property: JsonPropertyName("y")] double Y,
+    [property: JsonPropertyName("width")] double Width,
+    [property: JsonPropertyName("height")] double Height);
diff --git a/dotnet/uiAutomationHelper/src/Program.cs b/dotnet/uiAutomationHelper/src/Program.cs
new file mode 100644
index 0000000000..0ea3f36f55
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Program.cs
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text;
+using UiAutomationHelper.Methods;
+using UiAutomationHelper.Rpc;
+
+namespace UiAutomationHelper;
+
+internal static class Program
+{
+    public static async Task<int> Main(string[] args)
+    {
+        Console.InputEncoding = Encoding.UTF8;
+        Console.OutputEncoding = Encoding.UTF8;
+
+        using var cts = new CancellationTokenSource();
+        Console.CancelKeyPress += (_, e) =>
+        {
+            e.Cancel = true;
+            cts.Cancel();
+        };
+
+        var dispatch = new Dispatch();
+        Methods.Register.All(dispatch);
+
+        var server = new JsonRpcServer(Console.In, Console.Out, dispatch);
+        Notifier.Init(server);
+        await server.RunAsync(cts.Token).ConfigureAwait(false);
+        return 0;
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Rpc/Dispatch.cs b/dotnet/uiAutomationHelper/src/Rpc/Dispatch.cs
new file mode 100644
index 0000000000..597638c055
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Rpc/Dispatch.cs
@@ -0,0 +1,36 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using UiAutomationHelper.Models;
+
+namespace UiAutomationHelper.Rpc;
+
+/// <summary>Handler for one RPC method; the awaited value becomes the response's <c>result</c>.</summary>
+internal delegate Task<object?> RpcMethod(JsonElement? @params, CancellationToken ct);
+
+/// <summary>Case-sensitive registry that maps RPC method names to their handlers.</summary>
+internal sealed class Dispatch
+{
+    private readonly Dictionary<string, RpcMethod> _methods = new(StringComparer.Ordinal);
+
+    /// <exception cref="InvalidOperationException">The name is already registered.</exception>
+    public void Register(string name, RpcMethod method)
+    {
+        // TryAdd checks and inserts with a single lookup.
+        if (!_methods.TryAdd(name, method))
+        {
+            throw new InvalidOperationException($"Method already registered: {name}");
+        }
+    }
+
+    /// <exception cref="RpcException">MethodNotFound when no handler is registered.</exception>
+    public Task<object?> InvokeAsync(string name, JsonElement? @params, CancellationToken ct)
+    {
+        if (!_methods.TryGetValue(name, out var method))
+        {
+            throw new RpcException(RpcErrorCode.MethodNotFound, $"Method not found: {name}");
+        }
+        return method(@params, ct);
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Rpc/JsonRpcServer.cs b/dotnet/uiAutomationHelper/src/Rpc/JsonRpcServer.cs
new file mode 100644
index 0000000000..0073edb4f7
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Rpc/JsonRpcServer.cs
@@ -0,0 +1,171 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using System.Text.Json.Serialization;
+using UiAutomationHelper.Models;
+
+namespace UiAutomationHelper.Rpc;
+
+/// <summary>
+/// Line-delimited JSON-RPC server: reads one request per line from the input
+/// reader, dispatches it, and writes one response per line to the output writer.
+/// </summary>
+internal sealed class JsonRpcServer
+{
+    private readonly TextReader _input;
+    private readonly TextWriter _output;
+    private readonly Dispatch _dispatch;
+    private readonly object _writeLock = new();
+
+    internal static readonly JsonSerializerOptions JsonOpts = new()
+    {
+        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
+        DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull,
+    };
+
+    public JsonRpcServer(TextReader input, TextWriter output, Dispatch dispatch)
+    {
+        _input = input;
+        _output = output;
+        _dispatch = dispatch;
+    }
+
+    /// <summary>
+    /// Send a JSON-RPC notification (server → client, no id). Safe to call
+    /// from background threads (UIA event handler thread, etc.).
+    /// </summary>
+    public void Notify(string method, object? @params)
+    {
+        var msg = new RpcNotification { Method = method, Params = @params };
+        string json;
+        try
+        {
+            json = JsonSerializer.Serialize(msg, JsonOpts);
+        }
+        catch
+        {
+            return; // best-effort
+        }
+        lock (_writeLock)
+        {
+            try
+            {
+                _output.WriteLine(json);
+                _output.Flush();
+            }
+            catch
+            {
+                // Pipe may be closed during shutdown; ignore.
+            }
+        }
+    }
+
+    /// <summary>
+    /// Serves requests until cancellation, end of input, or the output can no
+    /// longer be written. Blank lines are skipped.
+    /// </summary>
+    public async Task RunAsync(CancellationToken ct = default)
+    {
+        while (!ct.IsCancellationRequested)
+        {
+            string? line;
+            try
+            {
+                line = await _input.ReadLineAsync(ct).ConfigureAwait(false);
+            }
+            catch (OperationCanceledException)
+            {
+                break;
+            }
+
+            if (line == null)
+            {
+                break;
+            }
+            if (string.IsNullOrWhiteSpace(line))
+            {
+                continue;
+            }
+
+            var response = await HandleLineAsync(line, ct).ConfigureAwait(false);
+            if (!TryWrite(response))
+            {
+                // The client went away; there is nobody left to serve.
+                break;
+            }
+        }
+    }
+
+    /// <summary>Parses one request line and turns the outcome into a response.</summary>
+    private async Task<RpcResponse> HandleLineAsync(string line, CancellationToken ct)
+    {
+        RpcRequest? request;
+        try
+        {
+            request = JsonSerializer.Deserialize<RpcRequest>(line, JsonOpts);
+        }
+        catch (JsonException ex)
+        {
+            // Only a malformed request line is a parse error. A JsonException thrown
+            // later by a handler is reported as InternalError by the catch-all below.
+            return RpcResponse.Fail(null, RpcErrorCode.ParseError, ex.Message);
+        }
+        if (request == null || string.IsNullOrEmpty(request.Method))
+        {
+            return RpcResponse.Fail(request?.Id, RpcErrorCode.InvalidRequest, "Invalid request");
+        }
+
+        try
+        {
+            var result = await _dispatch.InvokeAsync(request.Method, request.Params, ct)
+                .ConfigureAwait(false);
+            return RpcResponse.Success(request.Id, result);
+        }
+        catch (RpcException ex)
+        {
+            return RpcResponse.Fail(request.Id, ex.Code, ex.Message, ex.ErrorData);
+        }
+        catch (OperationCanceledException)
+        {
+            return RpcResponse.Fail(request.Id, RpcErrorCode.InternalError, "Cancelled");
+        }
+        catch (Exception ex)
+        {
+            return RpcResponse.Fail(request.Id, RpcErrorCode.InternalError, ex.Message);
+        }
+    }
+
+    /// <summary>
+    /// Writes one response line. Returns false when the output is closed, so the
+    /// read loop can stop instead of letting the exception escape RunAsync.
+    /// </summary>
+    private bool TryWrite(RpcResponse response)
+    {
+        string json;
+        try
+        {
+            json = JsonSerializer.Serialize(response, JsonOpts);
+        }
+        catch (Exception ex)
+        {
+            // The result object could not be serialized; report that to the caller.
+            json = JsonSerializer.Serialize(
+                RpcResponse.Fail(response.Id, RpcErrorCode.InternalError, $"Failed to serialize response: {ex.Message}"),
+                JsonOpts);
+        }
+        lock (_writeLock)
+        {
+            try
+            {
+                _output.WriteLine(json);
+                _output.Flush();
+                return true;
+            }
+            catch (Exception ex) when (ex is IOException or ObjectDisposedException)
+            {
+                return false;
+            }
+        }
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Rpc/Notifier.cs b/dotnet/uiAutomationHelper/src/Rpc/Notifier.cs
new file mode 100644
index 0000000000..cb38411476
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Rpc/Notifier.cs
@@ -0,0 +1,23 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Rpc;
+
+/// <summary>
+/// Static facade so event handlers (which aren't connected to dispatch) can
+/// push JSON-RPC notifications back to the client. Initialized in Program.cs.
+/// </summary>
+internal static class Notifier
+{
+    private static JsonRpcServer? _server;
+
+    public static void Init(JsonRpcServer server)
+    {
+        _server = server;
+    }
+
+    public static void Send(string method, object? @params)
+    {
+        _server?.Notify(method, @params);
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Rpc/RpcParams.cs b/dotnet/uiAutomationHelper/src/Rpc/RpcParams.cs
new file mode 100644
index 0000000000..3f868468a0
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Rpc/RpcParams.cs
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Text.Json;
+using UiAutomationHelper.Models;
+
+namespace UiAutomationHelper.Rpc;
+
+internal static class RpcParams
+{
+    public static T Parse<T>(JsonElement? @params) where T : new()
+    {
+        if (@params == null
+            || @params.Value.ValueKind == JsonValueKind.Null
+            || @params.Value.ValueKind == JsonValueKind.Undefined)
+        {
+            return new T();
+        }
+        try
+        {
+            return JsonSerializer.Deserialize<T>(@params.Value, JsonRpcServer.JsonOpts) ?? new T();
+        }
+        catch (JsonException ex)
+        {
+            throw new RpcException(RpcErrorCode.InvalidParams, ex.Message);
+        }
+    }
+
+    public static T ParseRequired<T>(JsonElement? @params) where T : new()
+    {
+        if (@params == null
+            || @params.Value.ValueKind == JsonValueKind.Null
+            || @params.Value.ValueKind == JsonValueKind.Undefined)
+        {
+            throw new RpcException(RpcErrorCode.InvalidParams, "Missing params");
+        }
+        return Parse<T>(@params);
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Snapshot/FolderSnapshotter.cs b/dotnet/uiAutomationHelper/src/Snapshot/FolderSnapshotter.cs
new file mode 100644
index 0000000000..798a96e1db
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Snapshot/FolderSnapshotter.cs
@@ -0,0 +1,123 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Snapshot;
+
+/// <summary>Copies a folder tree into a snapshot slot and back again.</summary>
+internal static class FolderSnapshotter
+{
+    /// <summary>
+    /// Recursively copy <paramref name="sourcePath"/> into <paramref name="destPath"/>.
+    /// Returns total bytes copied, or 0 when the source does not exist. An entry is
+    /// skipped when its relative path contains any <paramref name="exclude"/> fragment
+    /// (case-insensitive substring match, not a glob).
+    /// </summary>
+    public static long Capture(string sourcePath, string destPath, IReadOnlyList<string>? exclude = null)
+    {
+        if (!Directory.Exists(sourcePath))
+        {
+            return 0;
+        }
+        Directory.CreateDirectory(destPath);
+        return CopyRecursive(sourcePath, destPath, sourcePath, exclude);
+    }
+
+    /// <summary>
+    /// Replace <paramref name="targetPath"/>'s contents with what's in
+    /// <paramref name="snapshotPath"/>. Removes anything currently at target,
+    /// then copies snapshot back. Returns bytes restored.
+    /// </summary>
+    public static long Restore(string snapshotPath, string targetPath)
+    {
+        if (!Directory.Exists(snapshotPath))
+        {
+            // Nothing was captured — make target empty (matches "nothing was there at capture time").
+            if (Directory.Exists(targetPath))
+            {
+                Directory.Delete(targetPath, recursive: true);
+            }
+            return 0;
+        }
+
+        if (Directory.Exists(targetPath))
+        {
+            Directory.Delete(targetPath, recursive: true);
+        }
+        Directory.CreateDirectory(targetPath);
+        return CopyRecursive(snapshotPath, targetPath, snapshotPath, exclude: null);
+    }
+
+    /// <summary>
+    /// Copies the files directly under <paramref name="currentSrc"/> and then recurses
+    /// into its subdirectories, mirroring paths relative to <paramref name="srcRoot"/>
+    /// under <paramref name="dstRoot"/>. Files that cannot be copied are skipped.
+    /// </summary>
+    private static long CopyRecursive(
+        string srcRoot,
+        string dstRoot,
+        string currentSrc,
+        IReadOnlyList<string>? exclude)
+    {
+        long bytes = 0;
+        foreach (var file in Directory.EnumerateFiles(currentSrc))
+        {
+            var rel = Path.GetRelativePath(srcRoot, file);
+            if (IsExcluded(rel, exclude))
+            {
+                continue;
+            }
+            var dst = Path.Combine(dstRoot, rel);
+            Directory.CreateDirectory(Path.GetDirectoryName(dst)!);
+            try
+            {
+                File.Copy(file, dst, overwrite: true);
+                bytes += new FileInfo(dst).Length;
+            }
+            catch (IOException)
+            {
+                // File locked or missing — best-effort.
+            }
+            catch (UnauthorizedAccessException)
+            {
+                // Permission issue — skip.
+            }
+        }
+        foreach (var dir in Directory.EnumerateDirectories(currentSrc))
+        {
+            var rel = Path.GetRelativePath(srcRoot, dir);
+            if (IsExcluded(rel, exclude))
+            {
+                continue;
+            }
+            // Create the directory even when it holds no files, so empty folders
+            // survive a capture/restore round trip.
+            Directory.CreateDirectory(Path.Combine(dstRoot, rel));
+            bytes += CopyRecursive(srcRoot, dstRoot, dir, exclude);
+        }
+        return bytes;
+    }
+
+    /// <summary>True when the relative path contains any non-empty exclude fragment.</summary>
+    private static bool IsExcluded(string relativePath, IReadOnlyList<string>? exclude)
+    {
+        if (exclude == null || exclude.Count == 0)
+        {
+            return false;
+        }
+        var norm = relativePath.Replace('\\', '/');
+        foreach (var pattern in exclude)
+        {
+            // An empty fragment is contained in every string and would exclude everything.
+            if (string.IsNullOrEmpty(pattern))
+            {
+                continue;
+            }
+            // Normalize the fragment's separators the same way as the path.
+            if (norm.Contains(pattern.Replace('\\', '/'), StringComparison.OrdinalIgnoreCase))
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Snapshot/ProcessKiller.cs b/dotnet/uiAutomationHelper/src/Snapshot/ProcessKiller.cs
new file mode 100644
index 0000000000..5fad696de5
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Snapshot/ProcessKiller.cs
@@ -0,0 +1,84 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Diagnostics;
+
+namespace UiAutomationHelper.Snapshot;
+
+internal static class ProcessKiller
+{
+    /// <summary>
+    /// Kill all processes matching the given identity. UWP AUMID matching is
+    /// approximated via process-name match against the package name fragment;
+    /// callers passing aumid should also pass processName when known.
+    /// Each process is first asked to close its main window and is force-killed
+    /// (with its process tree) if it has not exited within <paramref name="gracefulMs"/>.
+    /// The helper's own process is never touched. Failures are ignored.
+    /// </summary>
+    public static void KillByIdentity(string? aumid, string? processName, int gracefulMs = 1500)
+    {
+        var processes = new List<Process>();
+        if (!string.IsNullOrEmpty(processName))
+        {
+            try
+            {
+                processes.AddRange(Process.GetProcessesByName(StripExe(processName)));
+            }
+            catch
+            {
+                /* Access denied is fine */
+            }
+        }
+        if (!string.IsNullOrEmpty(aumid))
+        {
+            // AUMID is "<PackageName>_<PublisherId>!<AppId>". Use the package
+            // name as a fuzzy process-name hint (works for built-in apps where
+            // the executable name reflects the package).
+            var pkg = aumid.Split('_', '!')[0];
+            try
+            {
+                processes.AddRange(Process.GetProcessesByName(pkg));
+            }
+            catch
+            {
+                /* Access denied is fine */
+            }
+        }
+
+        // Both lookups can return a separate Process object for the same OS process,
+        // and Process compares by reference, so de-duplicate on the process id.
+        var seen = new HashSet<int>();
+        int self = Environment.ProcessId;
+        foreach (var proc in processes)
+        {
+            try
+            {
+                int pid = proc.Id;
+                if (pid == self || !seen.Add(pid))
+                {
+                    continue; // the finally block still disposes the handle
+                }
+                if (proc.HasExited) continue;
+                try { proc.CloseMainWindow(); } catch { /* ignore */ }
+                if (!proc.WaitForExit(gracefulMs))
+                {
+                    proc.Kill(entireProcessTree: true);
+                    proc.WaitForExit(gracefulMs);
+                }
+            }
+            catch
+            {
+                /* ignore individual failures */
+            }
+            finally
+            {
+                proc.Dispose();
+            }
+        }
+    }
+
+    private static string StripExe(string name) =>
+        name.EndsWith(".exe", StringComparison.OrdinalIgnoreCase)
+            ? name[..^4]
+            : name;
+}
diff --git a/dotnet/uiAutomationHelper/src/Uia/AppRegistry.cs b/dotnet/uiAutomationHelper/src/Uia/AppRegistry.cs
new file mode 100644
index 0000000000..9905e36ab9
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Uia/AppRegistry.cs
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Uia;
+
+/// <summary>
+/// Tracks AUMIDs for apps the helper launched, since UIA exposes no way to
+/// recover an AUMID from a window or process.
+/// </summary>
+internal static class AppRegistry
+{
+    private static readonly Dictionary<int, string> _aumidByPid = new();
+    private static readonly object _lock = new();
+
+    public static void Register(int pid, string? aumid)
+    {
+        if (string.IsNullOrEmpty(aumid))
+        {
+            return;
+        }
+        lock (_lock)
+        {
+            _aumidByPid[pid] = aumid;
+        }
+    }
+
+    public static string? GetAumid(int pid)
+    {
+        lock (_lock)
+        {
+            return _aumidByPid.TryGetValue(pid, out var a) ? a : null;
+        }
+    }
+
+    public static void Forget(int pid)
+    {
+        lock (_lock)
+        {
+            _aumidByPid.Remove(pid);
+        }
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Uia/AutomationHost.cs b/dotnet/uiAutomationHelper/src/Uia/AutomationHost.cs
new file mode 100644
index 0000000000..65d3a4ca80
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Uia/AutomationHost.cs
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using FlaUI.UIA3;
+
+namespace UiAutomationHelper.Uia;
+
+internal static class AutomationHost
+{
+    private static UIA3Automation? _automation;
+    private static readonly object _lock = new();
+
+    public static UIA3Automation Automation
+    {
+        get
+        {
+            lock (_lock)
+            {
+                return _automation ??= new UIA3Automation();
+            }
+        }
+    }
+
+    public static void Dispose()
+    {
+        lock (_lock)
+        {
+            _automation?.Dispose();
+            _automation = null;
+        }
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Uia/ComRetry.cs b/dotnet/uiAutomationHelper/src/Uia/ComRetry.cs
new file mode 100644
index 0000000000..d77ccd9d44
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Uia/ComRetry.cs
@@ -0,0 +1,50 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Runtime.InteropServices;
+
+namespace UiAutomationHelper.Uia;
+
+/// <summary>
+/// Wraps UIA operations that can throw transient COMExceptions when the desktop
+/// tree is mutating mid-enumeration (e.g., immediately after a window closes).
+/// </summary>
+internal static class ComRetry
+{
+    /// <summary>
+    /// Runs <paramref name="op"/>, retrying up to <paramref name="maxRetries"/> more
+    /// times (sleeping <paramref name="delayMs"/> in between) while it fails with a
+    /// transient COM error. The last failure propagates with its original stack trace.
+    /// </summary>
+    public static T Run<T>(Func<T> op, int maxRetries = 2, int delayMs = 100)
+    {
+        for (int attempt = 0; ; attempt++)
+        {
+            try
+            {
+                return op();
+            }
+            // The filter stops matching once the retries are used up, so the final
+            // exception escapes untouched instead of being re-thrown from a saved
+            // variable (which resets the stack trace, and threw null when
+            // maxRetries was negative and the loop never ran).
+            catch (COMException ex) when (attempt < maxRetries && IsTransient(ex))
+            {
+                Thread.Sleep(delayMs);
+            }
+        }
+    }
+
+    private static bool IsTransient(COMException ex)
+    {
+        // Common race-condition HRESULTs observed during teardown / structure-change.
+        return ex.HResult switch
+        {
+            unchecked((int)0x80040201) => true, // UIA_E_ELEMENTNOTAVAILABLE (same value as EVENT_E_ALL_SUBSCRIBERS_FAILED)
+            unchecked((int)0x80040E14) => true, // generic transient
+            unchecked((int)0x80131509) => true, // InvalidOperationException COM-mapped
+            unchecked((int)0x80004005) => true, // E_FAIL — frequently transient on UIA enumeration
+            _ => false,
+        };
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Uia/EventBridge.cs b/dotnet/uiAutomationHelper/src/Uia/EventBridge.cs
new file mode 100644
index 0000000000..3288cb6cbb
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Uia/EventBridge.cs
@@ -0,0 +1,49 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace UiAutomationHelper.Uia;
+
+/// <summary>
+/// Centralized hook for UIA events. Slice 2 only tracks "any focus change" as a
+/// proxy for "UIA had activity"; full event subscription with notifications lands
+/// in slice 5 (record mode).
+/// </summary>
+internal static class EventBridge
+{
+    // Environment.TickCount64 is a monotonic millisecond counter, so the idle
+    // time cannot jump or go negative when the wall clock is adjusted.
+    private static long _lastEventMs = Environment.TickCount64;
+    private static IDisposable? _focusSub;
+    private static readonly object _subLock = new();
+
+    /// <summary>
+    /// Idempotent. Subscribes to global focus changes the first time it's called.
+    /// </summary>
+    public static void EnsureSubscribed()
+    {
+        lock (_subLock)
+        {
+            if (_focusSub != null)
+            {
+                return;
+            }
+            _focusSub = AutomationHost.Automation.RegisterFocusChangedEvent(_ =>
+            {
+                Interlocked.Exchange(ref _lastEventMs, Environment.TickCount64);
+            });
+        }
+    }
+
+    /// <summary>Resets the activity timestamp to "now". Call before measuring idle.</summary>
+    public static void ResetActivityClock()
+    {
+        Interlocked.Exchange(ref _lastEventMs, Environment.TickCount64);
+    }
+
+    /// <summary>Milliseconds since the last observed UIA event.</summary>
+    public static long QuietMs()
+    {
+        long last = Interlocked.Read(ref _lastEventMs);
+        return Environment.TickCount64 - last;
+    }
+}
diff --git a/dotnet/uiAutomationHelper/src/Uia/FingerprintComputer.cs b/dotnet/uiAutomationHelper/src/Uia/FingerprintComputer.cs
new file mode 100644
index 0000000000..dedc210672
--- /dev/null
+++ b/dotnet/uiAutomationHelper/src/Uia/FingerprintComputer.cs
@@ -0,0 +1,176 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Security.Cryptography;
+using System.Text;
+using System.Text.RegularExpressions;
+using FlaUI.Core.AutomationElements;
+using UiAutomationHelper.Models;
+
+namespace UiAutomationHelper.Uia;
+
+internal static class FingerprintComputer
+{
+    public sealed class Result
+    {
+        public string Hash { get; set; } = "";
+        public int ControlCount { get; set; }
+        public string ActiveWindowTitle { get; set; } = "";
+        public string? FocusedSelector { get; set; }
+    }
+
+    public static Result Compute(
+        AutomationElement root,
+        string rootSelector,
+        IReadOnlyList<DynamicControlRule>? rules)
+    {
+        rules ??= Array.Empty<DynamicControlRule>();
+        var sb = new StringBuilder();
+        var ctx = new Context { Rules = rules };
+        WriteNode(root, rootSelector, sb, ctx);
+
+        var bytes = SHA256.HashData(Encoding.UTF8.GetBytes(sb.ToString()));
+        var hex = Convert.ToHexString(bytes).ToLowerInvariant()[..16];
+        return new Result
+        {
+            Hash = hex,
+            ControlCount = ctx.NodeCount,
+            ActiveWindowTitle = root.Properties.Name.ValueOrDefault ?? "",
+            FocusedSelector = ctx.FocusedSelector,
+        };
+    }
+
+    private sealed class Context
+    {
+        public IReadOnlyList<DynamicControlRule> Rules { get; init; } = Array.Empty<DynamicControlRule>();
+        public int NodeCount;
+        public string?
FocusedSelector; + } + + private static void WriteNode(AutomationElement el, string selector, StringBuilder sb, Context ctx) + { + ctx.NodeCount++; + if (el.Properties.HasKeyboardFocus.ValueOrDefault) + { + ctx.FocusedSelector ??= selector; + } + + var dynamicProps = MatchDynamicProps(el, selector, ctx.Rules); + var ct = el.ControlType.ToString(); + var aid = el.Properties.AutomationId.ValueOrDefault ?? ""; + var name = dynamicProps.Contains("name") ? "" : (el.Properties.Name.ValueOrDefault ?? ""); + var cls = el.Properties.ClassName.ValueOrDefault ?? ""; + + string value = ""; + if (!dynamicProps.Contains("value")) + { + try + { + if (el.Patterns.Value.IsSupported) + { + value = el.Patterns.Value.Pattern.Value.ValueOrDefault ?? ""; + } + } + catch { /* ignore pattern access errors */ } + } + + string toggle = ""; + if (!dynamicProps.Contains("toggleState")) + { + try + { + if (el.Patterns.Toggle.IsSupported) + { + toggle = el.Patterns.Toggle.Pattern.ToggleState.ValueOrDefault.ToString(); + } + } + catch { /* ignore */ } + } + + sb.Append('{') + .Append("ct=").Append(ct) + .Append("|aid=").Append(aid) + .Append("|name=").Append(name) + .Append("|class=").Append(cls) + .Append("|val=").Append(value) + .Append("|tog=").Append(toggle) + .Append("|kids=["); + + AutomationElement[] children; + try { children = el.FindAllChildren(); } + catch { children = Array.Empty(); } + + for (int i = 0; i < children.Length; i++) + { + if (i > 0) sb.Append(','); + var childSelector = selector + Selectors.BuildSegment(children[i]); + WriteNode(children[i], childSelector, sb, ctx); + } + sb.Append(']').Append('}'); + } + + private static HashSet MatchDynamicProps( + AutomationElement el, + string selector, + IReadOnlyList rules) + { + var matched = new HashSet(StringComparer.Ordinal); + if (rules.Count == 0) return matched; + + foreach (var rule in rules) + { + if (!RuleMatches(rule, el, selector)) continue; + foreach (var p in rule.DynamicProperties) + { + matched.Add(p); + } + } + 
return matched; + } + + private static bool RuleMatches(DynamicControlRule rule, AutomationElement el, string selector) + { + var m = rule.Match; + switch (m.Kind) + { + case "automationId": + if (string.IsNullOrEmpty(m.Value)) return false; + return string.Equals( + el.Properties.AutomationId.ValueOrDefault ?? "", + m.Value, + StringComparison.Ordinal); + + case "selector": + return string.Equals(selector, m.Value, StringComparison.Ordinal); + + case "selectorPattern": + if (string.IsNullOrEmpty(m.Pattern)) return false; + return Regex.IsMatch(selector, GlobToRegex(m.Pattern)); + + case "container": + if (string.IsNullOrEmpty(m.Container) || string.IsNullOrEmpty(m.ControlType)) return false; + if (!selector.StartsWith(m.Container, StringComparison.Ordinal)) return false; + if (!string.Equals(el.ControlType.ToString(), m.ControlType, StringComparison.Ordinal)) return false; + if (!string.IsNullOrEmpty(m.NameRegex)) + { + var name = el.Properties.Name.ValueOrDefault ?? ""; + if (!Regex.IsMatch(name, m.NameRegex)) return false; + } + if (!string.IsNullOrEmpty(m.ClassNameRegex)) + { + var cls = el.Properties.ClassName.ValueOrDefault ?? ""; + if (!Regex.IsMatch(cls, m.ClassNameRegex)) return false; + } + return true; + + default: + return false; + } + } + + private static string GlobToRegex(string glob) + { + var escaped = Regex.Escape(glob).Replace("\\*", ".*").Replace("\\?", "."); + return "^" + escaped + "$"; + } +} diff --git a/dotnet/uiAutomationHelper/src/Uia/NativeMethods.cs b/dotnet/uiAutomationHelper/src/Uia/NativeMethods.cs new file mode 100644 index 0000000000..088b5f0211 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/NativeMethods.cs @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using System.Runtime.InteropServices; + +namespace UiAutomationHelper.Uia; + +internal static class NativeMethods +{ + public const uint GA_PARENT = 1; + public const uint GA_ROOT = 2; + public const uint GA_ROOTOWNER = 3; + + [DllImport("user32.dll")] + public static extern IntPtr GetAncestor(IntPtr hwnd, uint gaFlags); +} diff --git a/dotnet/uiAutomationHelper/src/Uia/ScreenshotCapturer.cs b/dotnet/uiAutomationHelper/src/Uia/ScreenshotCapturer.cs new file mode 100644 index 0000000000..74995e2ea8 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/ScreenshotCapturer.cs @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Drawing; +using System.Drawing.Imaging; +using System.Runtime.InteropServices; +using UiAutomationHelper.Models; + +namespace UiAutomationHelper.Uia; + +internal static class ScreenshotCapturer +{ + [DllImport("user32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + private static extern bool PrintWindow(IntPtr hWnd, IntPtr hdcBlt, int nFlags); + + [DllImport("user32.dll", SetLastError = true)] + [return: MarshalAs(UnmanagedType.Bool)] + private static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect); + + [StructLayout(LayoutKind.Sequential)] + private struct RECT + { + public int Left; + public int Top; + public int Right; + public int Bottom; + } + + private const int PW_RENDERFULLCONTENT = 0x00000002; + + public static (byte[] PngBytes, Rect Bounds) Capture(IntPtr hwnd) + { + if (!GetWindowRect(hwnd, out var r)) + { + throw new InvalidOperationException("GetWindowRect failed"); + } + int w = r.Right - r.Left; + int h = r.Bottom - r.Top; + if (w <= 0 || h <= 0) + { + throw new InvalidOperationException($"Invalid window size {w}x{h}"); + } + + using var bmp = new Bitmap(w, h, PixelFormat.Format32bppArgb); + using (var g = Graphics.FromImage(bmp)) + { + var hdc = g.GetHdc(); + try + { + if (!PrintWindow(hwnd, hdc, PW_RENDERFULLCONTENT)) + { + throw new 
InvalidOperationException("PrintWindow failed"); + } + } + finally + { + g.ReleaseHdc(hdc); + } + } + + using var ms = new MemoryStream(); + bmp.Save(ms, ImageFormat.Png); + return (ms.ToArray(), new Rect(r.Left, r.Top, w, h)); + } +} diff --git a/dotnet/uiAutomationHelper/src/Uia/SelectorParser.cs b/dotnet/uiAutomationHelper/src/Uia/SelectorParser.cs new file mode 100644 index 0000000000..7c103e14c1 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/SelectorParser.cs @@ -0,0 +1,170 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Text; +using UiAutomationHelper.Models; + +namespace UiAutomationHelper.Uia; + +internal static class SelectorParser +{ + public static SelectorPath Parse(string input) + { + if (string.IsNullOrEmpty(input)) + { + throw new FormatException("Selector cannot be empty"); + } + if (input[0] != '/') + { + throw new FormatException("Selector must start with '/'"); + } + + var segments = new List(); + int i = 0; + while (i < input.Length) + { + if (input[i] != '/') + { + throw new FormatException($"Expected '/' at position {i}"); + } + i++; + + int idStart = i; + if (i >= input.Length || !IsIdentStart(input[i])) + { + throw new FormatException($"Expected identifier at position {i}"); + } + i++; + while (i < input.Length && IsIdentRest(input[i])) + { + i++; + } + string controlType = input.Substring(idStart, i - idStart); + + string? name = null, autoId = null, className = null; + int? 
index = null; + + while (i < input.Length && input[i] == '[') + { + i++; + if (i < input.Length && char.IsDigit(input[i])) + { + int numStart = i; + while (i < input.Length && char.IsDigit(input[i])) + { + i++; + } + if (i >= input.Length || input[i] != ']') + { + throw new FormatException($"Expected ']' after index at position {i}"); + } + index = int.Parse(input.AsSpan(numStart, i - numStart)); + i++; + } + else + { + int keyStart = i; + if (i >= input.Length || !IsIdentStart(input[i])) + { + throw new FormatException($"Expected predicate key at position {i}"); + } + i++; + while (i < input.Length && IsIdentRest(input[i])) + { + i++; + } + string key = input.Substring(keyStart, i - keyStart); + if (i >= input.Length || input[i] != '=') + { + throw new FormatException($"Expected '=' at position {i}"); + } + i++; + if (i >= input.Length || input[i] != '"') + { + throw new FormatException($"Expected '\"' at position {i}"); + } + i++; + + var sb = new StringBuilder(); + while (i < input.Length && input[i] != '"') + { + if (input[i] == '\\' && i + 1 < input.Length) + { + sb.Append(input[i + 1]); + i += 2; + } + else + { + sb.Append(input[i]); + i++; + } + } + if (i >= input.Length) + { + throw new FormatException("Unterminated string value"); + } + i++; + if (i >= input.Length || input[i] != ']') + { + throw new FormatException($"Expected ']' after predicate at position {i}"); + } + i++; + + string value = sb.ToString(); + switch (key) + { + case "Name": name = value; break; + case "AutomationId": autoId = value; break; + case "ClassName": className = value; break; + default: + throw new FormatException($"Unknown predicate key: {key}"); + } + } + } + + segments.Add(new SelectorSegment(controlType, name, autoId, className, index)); + } + + if (segments.Count == 0) + { + throw new FormatException("Selector must have at least one segment"); + } + + return new SelectorPath(segments); + } + + public static string Format(SelectorPath path) + { + var sb = new 
StringBuilder(); + foreach (var seg in path.Segments) + { + sb.Append('/').Append(seg.ControlType); + if (seg.AutomationId != null) + { + sb.Append("[AutomationId=\"").Append(Escape(seg.AutomationId)).Append("\"]"); + } + if (seg.Name != null) + { + sb.Append("[Name=\"").Append(Escape(seg.Name)).Append("\"]"); + } + if (seg.ClassName != null) + { + sb.Append("[ClassName=\"").Append(Escape(seg.ClassName)).Append("\"]"); + } + if (seg.Index.HasValue) + { + sb.Append('[').Append(seg.Index.Value).Append(']'); + } + } + return sb.ToString(); + } + + public static string FormatSegment(SelectorSegment seg) => + Format(new SelectorPath(new[] { seg })); + + private static string Escape(string s) => + s.Replace("\\", "\\\\").Replace("\"", "\\\""); + + private static bool IsIdentStart(char c) => char.IsLetter(c) || c == '_'; + private static bool IsIdentRest(char c) => char.IsLetterOrDigit(c) || c == '_'; +} diff --git a/dotnet/uiAutomationHelper/src/Uia/SelectorResolver.cs b/dotnet/uiAutomationHelper/src/Uia/SelectorResolver.cs new file mode 100644 index 0000000000..e8f0058279 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/SelectorResolver.cs @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using FlaUI.Core.AutomationElements; +using FlaUI.Core.Conditions; +using FlaUI.Core.Definitions; +using UiAutomationHelper.Models; + +namespace UiAutomationHelper.Uia; + +internal static class SelectorResolver +{ + /// + /// Resolve a selector path starting from the desktop. Returns null if any segment fails. + /// + public static AutomationElement? 
Resolve(SelectorPath path) + { + var current = AutomationHost.Automation.GetDesktop(); + var cf = AutomationHost.Automation.ConditionFactory; + + foreach (var seg in path.Segments) + { + var condition = BuildCondition(seg, cf); + var children = current.FindAllChildren(condition); + if (children.Length == 0) + { + return null; + } + if (seg.Index.HasValue) + { + int idx = seg.Index.Value - 1; // 1-based + if (idx < 0 || idx >= children.Length) + { + return null; + } + current = children[idx]; + } + else + { + current = children[0]; + } + } + return current; + } + + public static AutomationElement ResolveOrThrow(string selector) + { + var path = SelectorParser.Parse(selector); + var el = Resolve(path); + if (el == null) + { + throw new RpcException(RpcErrorCode.ElementNotFound, $"Element not found: {selector}"); + } + return el; + } + + private static ConditionBase BuildCondition(SelectorSegment seg, ConditionFactory cf) + { + var conditions = new List(); + if (TryParseControlType(seg.ControlType, out var ct)) + { + conditions.Add(cf.ByControlType(ct)); + } + else + { + throw new RpcException(RpcErrorCode.InvalidParams, $"Unknown control type: {seg.ControlType}"); + } + if (seg.Name != null) + { + conditions.Add(cf.ByName(seg.Name)); + } + if (seg.AutomationId != null) + { + conditions.Add(cf.ByAutomationId(seg.AutomationId)); + } + if (seg.ClassName != null) + { + conditions.Add(cf.ByClassName(seg.ClassName)); + } + return conditions.Count == 1 ? conditions[0] : new AndCondition(conditions.ToArray()); + } + + private static bool TryParseControlType(string name, out ControlType ct) => + Enum.TryParse(name, ignoreCase: false, out ct); +} diff --git a/dotnet/uiAutomationHelper/src/Uia/Selectors.cs b/dotnet/uiAutomationHelper/src/Uia/Selectors.cs new file mode 100644 index 0000000000..0fbdfdd4f0 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/Selectors.cs @@ -0,0 +1,137 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using System.Text; +using FlaUI.Core.AutomationElements; +using UiAutomationHelper.Models; + +namespace UiAutomationHelper.Uia; + +internal static class Selectors +{ + /// + /// Builds a desktop-rooted selector path for the given element by walking + /// up to (but not including) the desktop root. Required when the element + /// might be a non-top-level window (e.g., FlaUI returns UWP CoreWindow as + /// the main window, but it lives under an ApplicationFrameWindow). + /// + public static string BuildAbsolutePath(AutomationElement el) + { + // We need a desktop-rooted selector. For most elements that's just + // BuildSegment. The hard case is a UWP CoreWindow returned by + // app.GetMainWindow — Win32-wise it's a top-level window, but UIA's + // logical tree puts it under an ApplicationFrameWindow (different + // process). Strategies in order: + // 1) `el` IS a desktop child (matches by RuntimeId). + // 2) `el`'s Win32 OWNER is a desktop child. + // 3) Same-named desktop child exists (UWP CoreWindow / frame share name). + // 4) Fallback to single-segment selector. + var desktop = AutomationHost.Automation.GetDesktop(); + var elRid = el.Properties.RuntimeId.ValueOrDefault; + var elName = el.Properties.Name.ValueOrDefault ?? ""; + var elHwnd = (IntPtr)el.Properties.NativeWindowHandle.ValueOrDefault; + + // UWP apps create the ApplicationFrameWindow asynchronously after the + // CoreWindow appears. Poll desktop's children for up to 2 seconds. + for (int attempt = 0; attempt < 10; attempt++) + { + var topLevels = desktop.FindAllChildren(); + + // 1) Identity: el itself is a top-level child. + if (elRid != null) + { + foreach (var top in topLevels) + { + var topRid = top.Properties.RuntimeId.ValueOrDefault; + if (topRid != null && RidEquals(elRid, topRid)) + { + return BuildSegment(top); + } + } + } + + // 2) Win32 owner is a top-level child. 
+ if (elHwnd != IntPtr.Zero) + { + var ownerHwnd = NativeMethods.GetAncestor(elHwnd, NativeMethods.GA_ROOTOWNER); + if (ownerHwnd != IntPtr.Zero && ownerHwnd != elHwnd) + { + foreach (var top in topLevels) + { + if ((IntPtr)top.Properties.NativeWindowHandle.ValueOrDefault == ownerHwnd) + { + return BuildSegment(top); + } + } + } + } + + // 3) Name match (UWP frame and core share the app's display name). + if (!string.IsNullOrEmpty(elName)) + { + foreach (var top in topLevels) + { + if (top.Properties.Name.ValueOrDefault == elName) + { + return BuildSegment(top); + } + } + } + + Thread.Sleep(200); + } + + return BuildSegment(el); + } + + private static bool RidEquals(int[] a, int[] b) + { + if (a.Length != b.Length) return false; + for (int i = 0; i < a.Length; i++) + { + if (a[i] != b[i]) return false; + } + return true; + } + + + /// + /// Builds a "/ControlType[predicate]" segment for an element. + /// Priority: AutomationId → Name → ClassName → bare ControlType. + /// + public static string BuildSegment(AutomationElement el) + { + var ct = el.ControlType.ToString(); + var aid = NullIfEmpty(el.Properties.AutomationId.ValueOrDefault); + var name = NullIfEmpty(el.Properties.Name.ValueOrDefault); + var cls = NullIfEmpty(el.Properties.ClassName.ValueOrDefault); + + // AutomationId on its own is unique enough. Otherwise include Name + + // ClassName as joint identifiers — two siblings can easily share Name + // (UWP wraps app windows in multiple layers, all named after the app). + SelectorSegment seg; + if (aid != null) + { + seg = new SelectorSegment(ct, AutomationId: aid); + } + else if (name != null && cls != null) + { + seg = new SelectorSegment(ct, Name: name, ClassName: cls); + } + else if (name != null) + { + seg = new SelectorSegment(ct, Name: name); + } + else if (cls != null) + { + seg = new SelectorSegment(ct, ClassName: cls); + } + else + { + seg = new SelectorSegment(ct); + } + return SelectorParser.FormatSegment(seg); + } + + private static string? 
NullIfEmpty(string? s) => string.IsNullOrEmpty(s) ? null : s; +} diff --git a/dotnet/uiAutomationHelper/src/Uia/Subscription.cs b/dotnet/uiAutomationHelper/src/Uia/Subscription.cs new file mode 100644 index 0000000000..4284cafff6 --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/Subscription.cs @@ -0,0 +1,70 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using FlaUI.Core.AutomationElements; +using FlaUI.Core.EventHandlers; +using FlaUI.Core.Identifiers; + +namespace UiAutomationHelper.Uia; + +internal sealed class Subscription : IDisposable +{ + public string Id { get; } = Guid.NewGuid().ToString("N"); + public string RootSelector { get; init; } = ""; + public AutomationElement? Root { get; init; } + + public List<(EventId EventId, AutomationEventHandlerBase Handler)> AutomationHandlers { get; } = new(); + public List PropertyChangedHandlers { get; } = new(); + public List StructureChangedHandlers { get; } = new(); + + public void Dispose() + { + // FlaUI's EventHandlerBase implements IDisposable; disposing the + // handler unregisters it. 
+ foreach (var (_, h) in AutomationHandlers) + { + try { h.Dispose(); } catch { } + } + AutomationHandlers.Clear(); + foreach (var h in PropertyChangedHandlers) + { + try { h.Dispose(); } catch { } + } + PropertyChangedHandlers.Clear(); + foreach (var h in StructureChangedHandlers) + { + try { h.Dispose(); } catch { } + } + StructureChangedHandlers.Clear(); + } +} + +internal static class SubscriptionRegistry +{ + private static readonly Dictionary _subs = new(); + private static readonly object _lock = new(); + + public static void Add(Subscription sub) + { + lock (_lock) + { + _subs[sub.Id] = sub; + } + } + + public static bool Remove(string id) + { + lock (_lock) + { + if (!_subs.TryGetValue(id, out var sub)) return false; + sub.Dispose(); + _subs.Remove(id); + return true; + } + } + + public static int Count + { + get { lock (_lock) return _subs.Count; } + } +} diff --git a/dotnet/uiAutomationHelper/src/Uia/TreeWalker.cs b/dotnet/uiAutomationHelper/src/Uia/TreeWalker.cs new file mode 100644 index 0000000000..ffa81df55f --- /dev/null +++ b/dotnet/uiAutomationHelper/src/Uia/TreeWalker.cs @@ -0,0 +1,119 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using FlaUI.Core.AutomationElements; +using FlaUI.Core.Definitions; +using UiAutomationHelper.Models; + +namespace UiAutomationHelper.Uia; + +internal static class TreeWalker +{ + /// + /// Walk an element subtree and produce a TreeNode hierarchy. + /// + /// The element to start walking from. + /// The selector path leading to . + /// This becomes the root node's Selector field; children's selectors extend it. + /// Walk depth: -1 for unlimited, 0 for just the root, N for N levels of descendants. 
+ public static TreeNode Walk(AutomationElement root, string rootSelector, int maxDepth) + { + return WalkInternal(root, rootSelector, maxDepth); + } + + private static TreeNode WalkInternal(AutomationElement el, string selector, int remainingDepth) + { + var node = BuildNode(el); + node.Selector = selector; + if (remainingDepth == 0) + { + return node; + } + AutomationElement[] children; + try + { + children = el.FindAllChildren(); + } + catch + { + // Some elements throw when enumerating children (e.g., transient/disposed). Skip. + return node; + } + foreach (var c in children) + { + var childSelector = selector + Selectors.BuildSegment(c); + node.Children.Add(WalkInternal(c, childSelector, remainingDepth > 0 ? remainingDepth - 1 : -1)); + } + return node; + } + + private static TreeNode BuildNode(AutomationElement el) + { + var rect = el.Properties.BoundingRectangle.ValueOrDefault; + var node = new TreeNode + { + ControlType = el.ControlType.ToString(), + Name = NullIfEmpty(el.Properties.Name.ValueOrDefault), + AutomationId = NullIfEmpty(el.Properties.AutomationId.ValueOrDefault), + ClassName = NullIfEmpty(el.Properties.ClassName.ValueOrDefault), + IsEnabled = el.Properties.IsEnabled.ValueOrDefault, + IsOffscreen = el.Properties.IsOffscreen.ValueOrDefault, + HasKeyboardFocus = el.Properties.HasKeyboardFocus.ValueOrDefault, + BoundingRect = new Rect(rect.X, rect.Y, rect.Width, rect.Height), + Patterns = GetPatterns(el), + Value = TryGetValue(el), + ToggleState = TryGetToggleState(el), + }; + return node; + } + + private static List GetPatterns(AutomationElement el) + { + var patterns = new List(8); + try { if (el.Patterns.Invoke.IsSupported) patterns.Add("Invoke"); } catch { } + try { if (el.Patterns.Toggle.IsSupported) patterns.Add("Toggle"); } catch { } + try { if (el.Patterns.Value.IsSupported) patterns.Add("Value"); } catch { } + try { if (el.Patterns.RangeValue.IsSupported) patterns.Add("RangeValue"); } catch { } + try { if 
(el.Patterns.Selection.IsSupported) patterns.Add("Selection"); } catch { } + try { if (el.Patterns.SelectionItem.IsSupported) patterns.Add("SelectionItem"); } catch { } + try { if (el.Patterns.ExpandCollapse.IsSupported) patterns.Add("ExpandCollapse"); } catch { } + try { if (el.Patterns.Scroll.IsSupported) patterns.Add("Scroll"); } catch { } + try { if (el.Patterns.Window.IsSupported) patterns.Add("Window"); } catch { } + try { if (el.Patterns.Text.IsSupported) patterns.Add("Text"); } catch { } + return patterns; + } + + private static string? TryGetValue(AutomationElement el) + { + try + { + if (el.Patterns.Value.IsSupported) + { + return el.Patterns.Value.Pattern.Value.ValueOrDefault; + } + } + catch { } + return null; + } + + private static string? TryGetToggleState(AutomationElement el) + { + try + { + if (el.Patterns.Toggle.IsSupported) + { + return el.Patterns.Toggle.Pattern.ToggleState.ValueOrDefault switch + { + ToggleState.On => "on", + ToggleState.Off => "off", + ToggleState.Indeterminate => "indeterminate", + _ => null, + }; + } + } + catch { } + return null; + } + + private static string? NullIfEmpty(string? s) => string.IsNullOrEmpty(s) ? null : s; +} diff --git a/dotnet/uiAutomationHelper/test/SelectorParserTests.cs b/dotnet/uiAutomationHelper/test/SelectorParserTests.cs new file mode 100644 index 0000000000..ad983fdb15 --- /dev/null +++ b/dotnet/uiAutomationHelper/test/SelectorParserTests.cs @@ -0,0 +1,138 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using UiAutomationHelper.Models; +using UiAutomationHelper.Uia; +using Xunit; + +namespace UiAutomationHelper.Tests; + +public class SelectorParserTests +{ + [Fact] + public void Parse_SingleWindow_NoPredicates() + { + var p = SelectorParser.Parse("/Window"); + Assert.Single(p.Segments); + var s = p.Segments[0]; + Assert.Equal("Window", s.ControlType); + Assert.Null(s.Name); + Assert.Null(s.AutomationId); + Assert.Null(s.ClassName); + Assert.Null(s.Index); + } + + [Fact] + public void Parse_NamePredicate() + { + var p = SelectorParser.Parse("/Window[Name=\"Clock\"]"); + Assert.Equal("Clock", p.Segments[0].Name); + } + + [Fact] + public void Parse_AutomationIdPredicate() + { + var p = SelectorParser.Parse("/Button[AutomationId=\"StartButton\"]"); + Assert.Equal("StartButton", p.Segments[0].AutomationId); + } + + [Fact] + public void Parse_ClassNamePredicate() + { + var p = SelectorParser.Parse("/Pane[ClassName=\"Microsoft.UI.Xaml.Controls.PaneRoot\"]"); + Assert.Equal("Microsoft.UI.Xaml.Controls.PaneRoot", p.Segments[0].ClassName); + } + + [Fact] + public void Parse_IndexPredicate() + { + var p = SelectorParser.Parse("/ListItem[3]"); + Assert.Equal(3, p.Segments[0].Index); + } + + [Fact] + public void Parse_MultiplePredicates() + { + var p = SelectorParser.Parse("/Button[Name=\"Save\"][AutomationId=\"SaveBtn\"]"); + Assert.Equal("Save", p.Segments[0].Name); + Assert.Equal("SaveBtn", p.Segments[0].AutomationId); + } + + [Fact] + public void Parse_DeepPath() + { + var p = SelectorParser.Parse( + "/Window[Name=\"Clock\"]/Pane/Pivot/PivotItem[Name=\"Timer\"]/Button[AutomationId=\"StartButton\"]"); + Assert.Equal(5, p.Segments.Count); + Assert.Equal("Window", p.Segments[0].ControlType); + Assert.Equal("Clock", p.Segments[0].Name); + Assert.Equal("Pane", p.Segments[1].ControlType); + Assert.Equal("Pivot", p.Segments[2].ControlType); + Assert.Equal("Timer", p.Segments[3].Name); + Assert.Equal("StartButton", p.Segments[4].AutomationId); + } + + [Fact] + public void 
Parse_EscapedQuoteInValue() + { + var p = SelectorParser.Parse("/Edit[Name=\"It\\\"s mine\"]"); + Assert.Equal("It\"s mine", p.Segments[0].Name); + } + + [Fact] + public void Parse_EscapedBackslashInValue() + { + var p = SelectorParser.Parse("/Edit[Name=\"path\\\\to\\\\file\"]"); + Assert.Equal("path\\to\\file", p.Segments[0].Name); + } + + [Fact] + public void Parse_EmptyStringValue() + { + var p = SelectorParser.Parse("/Edit[Name=\"\"]"); + Assert.Equal("", p.Segments[0].Name); + } + + [Theory] + [InlineData("")] + [InlineData("Window")] // missing leading slash + [InlineData("/")] // empty segment + [InlineData("/Window[Name=Clock]")] // unquoted value + [InlineData("/Window[Name=\"Clock\"")] // unterminated bracket + [InlineData("/Window[Bogus=\"x\"]")] // unknown predicate key + [InlineData("/123Window")] // identifier starting with digit + public void Parse_InvalidInputs_Throw(string input) + { + Assert.Throws(() => SelectorParser.Parse(input)); + } + + [Fact] + public void Format_RoundTrip() + { + var input = "/Window[AutomationId=\"win1\"][Name=\"Clock\"]/Button[AutomationId=\"StartButton\"]"; + var path = SelectorParser.Parse(input); + var formatted = SelectorParser.Format(path); + var reparsed = SelectorParser.Parse(formatted); + Assert.Equal(path.Segments.Count, reparsed.Segments.Count); + for (int i = 0; i < path.Segments.Count; i++) + { + Assert.Equal(path.Segments[i], reparsed.Segments[i]); + } + } + + [Fact] + public void Format_PutsAutomationIdFirst() + { + var seg = new SelectorSegment("Button", Name: "X", AutomationId: "Y"); + var formatted = SelectorParser.FormatSegment(seg); + Assert.Equal("/Button[AutomationId=\"Y\"][Name=\"X\"]", formatted); + } + + [Fact] + public void Format_EscapesQuotes() + { + var seg = new SelectorSegment("Edit", Name: "He said \"hi\""); + var formatted = SelectorParser.FormatSegment(seg); + Assert.Equal("/Edit[Name=\"He said \\\"hi\\\"\"]", formatted); + } +} diff --git 
a/dotnet/uiAutomationHelper/test/UiAutomationHelper.Tests.csproj b/dotnet/uiAutomationHelper/test/UiAutomationHelper.Tests.csproj new file mode 100644 index 0000000000..40380730fa --- /dev/null +++ b/dotnet/uiAutomationHelper/test/UiAutomationHelper.Tests.csproj @@ -0,0 +1,19 @@ + + + net8.0-windows + enable + enable + latest + false + bin\$(Configuration) + false + + + + + + + + + + diff --git a/ts/packages/agents/desktop/src/actionHandler.ts b/ts/packages/agents/desktop/src/actionHandler.ts index 145e8febfe..00fff13b22 100644 --- a/ts/packages/agents/desktop/src/actionHandler.ts +++ b/ts/packages/agents/desktop/src/actionHandler.ts @@ -3,12 +3,25 @@ import { ActionContext, + ActionResult, AppAction, AppAgent, SessionContext, } from "@typeagent/agent-sdk"; -import { createActionResult } from "@typeagent/agent-sdk/helpers/action"; import { + ChoiceManager, + createActionResult, + createActionResultFromError, + createActionResultFromTextDisplay, + createYesNoChoiceResult, +} from "@typeagent/agent-sdk/helpers/action"; +import { + displayStatus, + displaySuccess, +} from "@typeagent/agent-sdk/helpers/display"; +import { + AutoShellMissingError, + buildAutoShell, disableDesktopActionContext, DesktopActionContext, runDesktopActions, @@ -20,6 +33,22 @@ export function instantiate(): AppAgent { initializeAgentContext: initializeDesktopContext, updateAgentContext: updateDesktopContext, executeAction: executeDesktopAction, + async handleChoice( + choiceId: string, + response: boolean | number[], + context: ActionContext, + ) { + const state = context.sessionContext.agentContext; + state.pendingChoiceContext = context; + try { + return await state.choiceManager.handleChoice( + choiceId, + response, + ); + } finally { + state.pendingChoiceContext = null; + } + }, }; } @@ -30,6 +59,8 @@ async function initializeDesktopContext(): Promise { backupProgramNameTable: undefined, refreshPromise: undefined, abortRefresh: undefined, + choiceManager: new ChoiceManager(), + 
pendingChoiceContext: null, }; } @@ -45,10 +76,10 @@ async function updateDesktopContext( } } -async function executeDesktopAction( +async function runAction( action: AppAction, context: ActionContext, -) { +): Promise { const result = await runDesktopActions( action as AllDesktopActions, context.sessionContext.agentContext, @@ -66,6 +97,81 @@ async function executeDesktopAction( return { error: result.message }; } +function offerAutoShellBuild( + context: ActionContext, + error: AutoShellMissingError, + action: AppAction, +): ActionResult { + const state = context.sessionContext.agentContext; + return createYesNoChoiceResult( + state.choiceManager, + `The desktop automation helper (autoShell.exe) isn't built yet (expected at ${error.binaryPath}). ` + + `Build it now? This runs 'dotnet build -c Release' on autoShell.sln and may take ~30s.`, + async (confirmed: boolean) => { + if (!confirmed) { + return createActionResultFromTextDisplay( + "autoShell build skipped — action cancelled.", + ); + } + const ctx = state.pendingChoiceContext ?? context; + try { + await displayStatus( + "Building desktop automation helper (dotnet build -c Release)...", + ctx, + ); + let lastUpdate = 0; + await buildAutoShell({ + onProgress: (chunk) => { + const now = Date.now(); + if (now - lastUpdate < 1000) return; + lastUpdate = now; + const last = chunk + .split(/\r?\n/) + .map((l) => l.trim()) + .filter(Boolean) + .pop(); + if (last) { + void displayStatus( + `Building desktop helper: ${last.slice(0, 100)}`, + ctx, + ); + } + }, + }); + } catch (buildError) { + return createActionResultFromError( + `autoShell build failed: ${buildError instanceof Error ? buildError.message : String(buildError)}`, + ); + } + await displaySuccess( + "Desktop helper built. Running your action...", + ctx, + ); + try { + return await runAction(action, ctx); + } catch (e) { + return createActionResultFromError( + e instanceof Error ? 
e.message : String(e), + ); + } + }, + ); +} + +async function executeDesktopAction( + action: AppAction, + context: ActionContext, +) { + try { + return await runAction(action, context); + } catch (e) { + if (e instanceof AutoShellMissingError) { + return offerAutoShellBuild(context, e, action); + } + throw e; + } +} + function formatResultDisplay( actionName: string, message: string, diff --git a/ts/packages/agents/desktop/src/connector.ts b/ts/packages/agents/desktop/src/connector.ts index 2c680297aa..e956c4bf47 100644 --- a/ts/packages/agents/desktop/src/connector.ts +++ b/ts/packages/agents/desktop/src/connector.ts @@ -4,7 +4,8 @@ import child_process from "node:child_process"; import { fileURLToPath } from "node:url"; import { ProgramNameIndex, loadProgramNameIndex } from "./programNameIndex.js"; -import { Storage } from "@typeagent/agent-sdk"; +import { ActionContext, Storage } from "@typeagent/agent-sdk"; +import { ChoiceManager } from "@typeagent/agent-sdk/helpers/action"; import registerDebug from "debug"; import { AllDesktopActions } from "./allActionsSchema.js"; import fs from "node:fs"; @@ -23,6 +24,8 @@ export type DesktopActionContext = { backupProgramNameTable: string[] | undefined; refreshPromise: Promise | undefined; abortRefresh: AbortController | undefined; + choiceManager: ChoiceManager; + pendingChoiceContext: ActionContext | null; }; interface ActionResult { @@ -40,13 +43,83 @@ const pendingRequests = new Map< // Buffer for incomplete lines from stdout let stdoutBuffer = ""; -const autoShellPath = resolveAutoShellPath(); +const AUTOSHELL_SLN_RELATIVE = "dotnet/autoShell/autoShell.sln"; -function resolveAutoShellPath(): URL { +function resolveAutoShellSln(): string { + return path.resolve( + path.dirname(fileURLToPath(import.meta.url)), + "../../../../..", + AUTOSHELL_SLN_RELATIVE, + ); +} + +/** + * Thrown when the autoShell.exe helper is missing. 
Callers can catch this + * to offer to build the helper interactively instead of surfacing a raw + * spawn error. + */ +export class AutoShellMissingError extends Error { + readonly binaryPath: string; + readonly slnPath: string; + + constructor(binaryPath: string, slnPath: string) { + super( + `autoShell binary not found at ${binaryPath}. ` + + `Build it via: dotnet build -c Release ${slnPath}`, + ); + this.name = "AutoShellMissingError"; + this.binaryPath = binaryPath; + this.slnPath = slnPath; + } +} + +/** + * Builds the autoShell .NET helper via `dotnet build -c Release`. + * Resolves on success; rejects with stderr output on failure. + */ +export async function buildAutoShell( + opts: { onProgress?: (line: string) => void } = {}, +): Promise { + const slnPath = resolveAutoShellSln(); + if (!fs.existsSync(slnPath)) { + throw new Error(`autoShell solution not found at ${slnPath}`); + } + return new Promise((resolve, reject) => { + const child = child_process.spawn( + "dotnet", + ["build", "-c", "Release", slnPath], + { + stdio: ["ignore", "pipe", "pipe"], + windowsHide: true, + }, + ); + const stderrChunks: string[] = []; + child.stdout!.on("data", (data: Buffer) => { + opts.onProgress?.(data.toString()); + }); + child.stderr!.on("data", (data: Buffer) => { + stderrChunks.push(data.toString()); + }); + child.on("error", reject); + child.on("exit", (code) => { + if (code === 0) { + resolve(); + } else { + reject( + new Error( + `dotnet build exited with code ${code}: ${stderrChunks.join("").trim()}`, + ), + ); + } + }); + }); +} + +function resolveAutoShellPath(): string { // Allow override via environment variable const envPath = process.env.AUTOSHELL_PATH; if (envPath) { - return new URL(`file://${path.resolve(envPath)}`); + return path.resolve(envPath); } // Search relative to the compiled JS output (dist/) @@ -57,15 +130,13 @@ function resolveAutoShellPath(): URL { for (const config of ["Debug", "Release"]) { const candidate = path.join(baseDir, config, 
"autoShell.exe"); if (fs.existsSync(candidate)) { - return new URL(`file://${candidate}`); + return candidate; } } - // Fallback to Debug path (will fail at spawn time with a clear error) - return new URL( - "../../../../../dotnet/autoShell/bin/Debug/autoShell.exe", - import.meta.url, - ); + // No build present — return the Debug path so callers can report it via + // AutoShellMissingError without performing another fs lookup. + return path.join(baseDir, "Debug", "autoShell.exe"); } // Load known action names from .pas.json schema files for runtime validation. @@ -110,8 +181,12 @@ function loadKnownActionNames(): Set { } async function spawnAutomationProcess() { + const binaryPath = resolveAutoShellPath(); + if (!fs.existsSync(binaryPath)) { + throw new AutoShellMissingError(binaryPath, resolveAutoShellSln()); + } return new Promise((resolve, reject) => { - const child = child_process.spawn(fileURLToPath(autoShellPath)); + const child = child_process.spawn(binaryPath); child.on("error", (err) => { reject(err); }); @@ -500,7 +575,17 @@ export async function setupDesktopActionContext( agentContext: DesktopActionContext, storage?: Storage, ) { - await ensureAutomationProcess(agentContext); + try { + await ensureAutomationProcess(agentContext); + } catch (e) { + if (e instanceof AutoShellMissingError) { + // Defer reporting — the next executeAction will surface the + // typed error and prompt the user to build the helper. 
+ debug(`autoShell missing during setup: ${e.message}`); + return; + } + throw e; + } return ensureProgramNameIndex(agentContext, storage); } @@ -532,7 +617,17 @@ async function refreshInstalledApps( agentContext.abortRefresh = abortRefresh; debug("Refreshing installed apps"); - const desktopProcess = await ensureAutomationProcess(agentContext); + let desktopProcess: child_process.ChildProcess; + try { + desktopProcess = await ensureAutomationProcess(agentContext); + } catch (e) { + if (e instanceof AutoShellMissingError) { + debug(`autoShell missing during refresh: ${e.message}`); + agentContext.abortRefresh = undefined; + return; + } + throw e; + } const programs = await fetchInstalledApps(desktopProcess); if (programs) { for (const element of programs) { diff --git a/ts/packages/agents/onboarding/README.md b/ts/packages/agents/onboarding/README.md index b2e7da9254..de3e0f5cde 100644 --- a/ts/packages/agents/onboarding/README.md +++ b/ts/packages/agents/onboarding/README.md @@ -65,6 +65,12 @@ This spawns the binary with `--help` (falling back to `-h`), parses subcommands, When the scaffolder detects CLI-sourced actions, it auto-generates a working handler with `buildArgs()` and `runCli()` functions instead of a stub. +### UI Automation crawling (experimental) + +For Windows desktop apps that have no documented API surface, an **experimental** UI-driven crawler can discover actions by autonomously driving the app via Microsoft's UI Automation framework. It produces the same `discoveredActions.json` shape as the API-based crawlers, plus per-action playback recipes that can be replayed at runtime. + +This path is not yet wired into the standard `startOnboarding` flow — surface, helper RPC, and synthesis prompts are still changing. See [`src/uiCapture/README.md`](./src/uiCapture/README.md) for status, pipeline overview, and how to run it. 
+ ## Workspace layout ``` diff --git a/ts/packages/agents/onboarding/USER_GUIDE.md b/ts/packages/agents/onboarding/USER_GUIDE.md index f6146e5992..290b45ecff 100644 --- a/ts/packages/agents/onboarding/USER_GUIDE.md +++ b/ts/packages/agents/onboarding/USER_GUIDE.md @@ -530,3 +530,11 @@ AI: I'll kick off the Jira onboarding. Let me start by discovering the API surfa **Test failures drive improvement.** A 70% pass rate on first run is typical. Two rounds of `proposeRepair` → `runTests` usually gets to 90%+. The LLM is good at diagnosing pattern mismatches. **Re-use grows over time.** The second integration you onboard will reuse the doc crawler, phrase generator, and schema generator — only the integration-specific configuration changes. + +--- + +## Experimental: UI Automation onboarding + +For Windows desktop apps without a public API, there is an **experimental** UI-crawling path that drives the app autonomously via Microsoft's UI Automation framework and synthesizes a `discoveredActions.json` with replayable playback recipes. It is not wired into the natural-language flow above and is not yet stable — surface, prompts, and on-disk artifact shapes may change. + +If you want to evaluate it, see [`src/uiCapture/README.md`](./src/uiCapture/README.md) for the pipeline overview and the smoke tests used to drive it end-to-end. 
diff --git a/ts/packages/agents/onboarding/package.json b/ts/packages/agents/onboarding/package.json index 5673c92590..2807dbd1d1 100644 --- a/ts/packages/agents/onboarding/package.json +++ b/ts/packages/agents/onboarding/package.json @@ -14,7 +14,8 @@ "type": "module", "exports": { "./agent/manifest": "./src/onboardingManifest.json", - "./agent/handlers": "./dist/onboardingActionHandler.js" + "./agent/handlers": "./dist/onboardingActionHandler.js", + "./uiCapture": "./dist/uiCapture/index.js" }, "scripts": { "agc:discovery": "agc -i ./src/discovery/discoverySchema.agr -o ./dist/discoverySchema.ag.json", @@ -34,7 +35,7 @@ "asc:schemagen": "asc -i ./src/schemaGen/schemaGenSchema.ts -o ./dist/schemaGenSchema.pas.json -t SchemaGenActions", "asc:testing": "asc -i ./src/testing/testingSchema.ts -o ./dist/testingSchema.pas.json -t TestingActions", "build": "concurrently npm:tsc npm:asc:* npm:agc:*", - "postbuild": "copyfiles -u 1 \"src/**/discoveryLlmSchema.ts\" dist", + "postbuild": "copyfiles -u 1 \"src/**/discoveryLlmSchema.ts\" \"src/**/exploreLlmSchema.ts\" \"src/**/synthesisLlmSchema.ts\" \"src/**/reconLlmSchema.ts\" \"src/**/iterativeReconLlmSchema.ts\" dist", "clean": "rimraf --glob dist *.tsbuildinfo *.done.build.log", "prettier": "prettier --check . --ignore-path ../../../.prettierignore", "prettier:fix": "prettier --write . --ignore-path ../../../.prettierignore", diff --git a/ts/packages/agents/onboarding/src/lib/llm.ts b/ts/packages/agents/onboarding/src/lib/llm.ts index 0cc6e5c9b3..7d7c364e00 100644 --- a/ts/packages/agents/onboarding/src/lib/llm.ts +++ b/ts/packages/agents/onboarding/src/lib/llm.ts @@ -47,3 +47,31 @@ export function getPackagingModel(endpoint?: string): ChatModel { "onboarding:packaging", ]); } + +export function getExploreModel(endpoint?: string): ChatModel { + // Default to GPT-5 — exploration benefits from reasoning when picking + // the next frontier action and recognizing modal vs. neutral states. 
+ return openai.createChatModel(endpoint ?? "GPT_5", undefined, undefined, [ + "onboarding:explore", + ]); +} + +export function getSynthesisModel(endpoint?: string): ChatModel { + // Synthesis (neutral classification, chunk clustering, per-cluster + // action emission, validation) is structural reasoning over a large + // graph — a reasoning model produces dramatically better aggregation. + return openai.createChatModel(endpoint ?? "GPT_5", undefined, undefined, [ + "onboarding:synthesis", + ]); +} + +export function getReconModel(endpoint?: string): ChatModel { + // Reconnaissance is vision-driven (sends screenshots) so we must use a + // multimodal-capable deployment. GPT-v is the dedicated vision endpoint + // in this Azure config. (GPT-5 deployments here returned "API version + // not supported" for image_url content; GPT-4o uses a /v1/ URL shape + // that aiclient doesn't construct correctly.) + return openai.createChatModel(endpoint ?? "GPT_v", undefined, undefined, [ + "onboarding:recon", + ]); +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/README.md b/ts/packages/agents/onboarding/src/uiCapture/README.md new file mode 100644 index 0000000000..845e82d349 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/README.md @@ -0,0 +1,317 @@ +# UI Capture — autonomous UI Automation onboarding (experimental) + +> **Status: Experimental.** This UI-crawling approach is under active development. Selectors, helper RPC surface, synthesis prompts, and on-disk artifact shapes are not yet stable and may change without notice. Use it for evaluation and feedback, not as a production onboarding path. + +This subsystem turns a Windows desktop app into a TypeAgent-replayable action set by **driving the app autonomously and observing what it does**. 
Given an app's AUMID (or executable path), it produces a `discoveredActions.json` file containing user-meaningful actions with parameters and step-by-step playback recipes that can be replayed against a fresh instance of the app. + +It's the experimental alternative to API-based onboarding (`crawlDocUrl`, `parseOpenApiSpec`, `crawlCliHelp`): when the app has no public API surface but does have a UI, we crawl the UI directly via Microsoft's UI Automation framework. + +## Status (2026-05-05) + +**Working end-to-end through TypeAgent.** The full pipeline ships — helper, exploration, snapshot capture/restore, dynamic-controls calibration, record mode, synthesis with merge-into-workspace, validation pass, vision-driven reconnaissance, playback executor, scaffolder for runtime agents, and dispatcher integration. + +Verified on Windows Clock with a 4-tab focused-crawl run: + +- 35 candidate actions surfaced by vision recon +- 4 per-tab crawls (alarm, stopwatch, worldclock, focus) merged into 14 actions in `discoveredActions.json` +- Scaffolded into `packages/agents/windowsClock/`, registered in dispatcher +- `run request "in windows clock, set an alarm for 8:30 named morning"` → real "morning" alarm at 8:30 AM in Clock's tree + +**What's still left (by priority):** + +1. **createAlarm assumes the right tab is already active.** Synthesis correctly extracted the alarm-creation flow but discarded the tab-navigate prefix (it became its own `navigateToAlarmTab` action). Multi-step user requests through TypeAgent need to chain `navigateToAlarmTab` → `createAlarm`. Real fix: the runtime handler should auto-call `navigateToTab` matching the action's `tabOrSection` if the app isn't already there. Synthesizer could also inject the prefix step explicitly. +2. 
**Toggle boolean parameter examples are nonsense.** Auto-merged actions like `setStopwatchRunning(running: boolean)` get `examples: ['stopwatch']` instead of `[true, false]` — `applyMergeRecommendations` falls back to `collectExamples` which derives from the action-name suffix. Recipes still execute correctly because the boolean isn't referenced in the 1-step toggle playback, but the schema example values are misleading. Fix: pass `[true, false]` for boolean enum params during merge. +3. **Dispatcher construction cache misroutes common phrasings.** Phrases like _"create an alarm"_ hit the onboarding-agent's `scaffoldAgent`, _"go to X"_ hits `excel.navigateToCell`, _"switch to"_ hits `player.selectDevice`. Workaround during testing: include "windows clock" in the request to force fresh translation. Real fix: clear the construction cache after adding a new agent OR write explicit grammar (`.agr`) for windowsClock so its phrases populate the cache with the right routing. +4. **TypeAgent integration via the scaffolder is now the canonical path.** Synthesizer prompts haven't been iterated against the recon-driven richer input — some sub-step actions (`nameAlarm`, `setAlarmTime`) emitted by recon should roll up into one `createAlarm(name, hour, minute)`. The current synthesis pass mostly handles this but a focused review is warranted. +5. **Selector decay through dynamic ancestors** (e.g., `Group[Name="Stopwatch, Paused, 12 seconds"]` ancestors invalidating once the stopwatch starts). `Selectors.BuildSegment` adds ClassName for disambiguation when AutomationId is missing, but a Group with ONLY a dynamic Name can't be salvaged that way. Future work: a selector-relative resolver that searches descendants from the nearest stable ancestor. Showed up in the multi-tab crawl as the only repeated failure (`recordLap` and `setStopwatchRunning` had stale selectors after the stopwatch started ticking). +6. 
**Selector fallback for action playback.** Single-identifier selectors break when an app version changes a Name or AutomationId. A multi-identifier selector format (record AutomationId AND Name AND ClassName at capture; resolver tries them in order) would harden replay. +7. **Helper bundling for shipped agents.** The runtime agent currently resolves the helper binary via a repo-relative dev path. For an agent that ships independently, the helper exe needs to be bundled into the agent's `bin/` and the resolver updated. + +## Pipeline overview + +``` + ┌─ phase 0: snapshot baseline (UWP folders / registry / etc.) + │ + ▼ optional vision-LLM phase: + ┌────────┐ ┌────────────────┐ per-tab survey or iterative +AUMID ►│ helper │ ─► │ reconnaissance │ ─► ExpectedAction[] (TODO list) + │ │ │ (vision LLM) │ │ + └────────┘ └────────────────┘ ▼ + │ ┌──────────────────┐ + └─────────────────────────► │ explore loop │ + │ (LLM oracle — │ + │ drives the TODO │ + │ list, observes) │ + └────────┬─────────┘ + ▼ + state graph (states.jsonl + + transitions.jsonl + per-state + TreeNode JSON) + │ + ▼ + ┌──────────────────────┐ + │ synthesis (GPT-5) │ + │ neutral-classify → │ + │ chunk → cluster → │ + │ synthesize → │ + │ validate (auto- │ + │ merge duplicates) │ + └────────┬─────────────┘ + ▼ + discoveredActions.json + ▲ (parameters + playback recipes, + │ merged into workspace-level file) + │ │ + └─ phase N: snapshot restore ◄──────┤ + ▼ + playback executor replays any + action with new parameter values +``` + +## Components + +### Helper (`dotnet/uiAutomationHelper/`, C#) + +A long-lived child process that exposes a JSON-RPC stdio surface backed by **FlaUI / UIA3**. The TS side (`HelperClient`) drives it. Why a separate process: UIA's COM apartment + Windows-only types are easier to handle in .NET than via Node N-API, and isolating the helper means a UIA crash doesn't take down the explorer. 
+ +Methods (one-line summaries — see `helperClient.ts` for full types): + +| Surface | Methods | +| --------- | ------------------------------------------------------------------------------------- | +| Lifecycle | `app.launch / attach / list / kill` | +| Capture | `tree.dump`, `tree.fingerprint`, `screenshot` | +| Drive | `do.invoke / toggle / setValue / select / expand / scroll / focus / click / sendKeys` | +| Find | `find` (with optional polling timeout) | +| Idle | `events.idle` (debounce on UIA focus events) | +| Record | `events.subscribe / unsubscribe` (server-pushed `event.fired` notifications) | +| Snapshot | `snapshot.capture / restore / delete` (folder copy + replace-not-merge) | +| Health | `health.ping` | + +Selectors are a custom XPath-like DSL: `/Window[Name="Clock"][ClassName="ApplicationFrameWindow"]/Window[Name="Clock"][ClassName="Windows.UI.Core.CoreWindow"]/Custom[AutomationId="NavView"]/...` + +At capture time, the helper picks the most stable identifier in priority order (AutomationId → Name + ClassName → ClassName → bare type) and resolves the path as a UIA descendant chain. + +### Reconnaissance (`tabReconnaissance.ts`, `iterativeReconnaissance.ts`) + +Optional but highly effective: send screenshots + filtered control trees to a vision-capable LLM and have it enumerate the actions each tab/screen supports BEFORE running the autonomous explorer. The output (`ExpectedAction[]`) becomes a numbered TODO list that's fed into the explore loop's goal — much more deterministic coverage than free-form exploration. + +Two flavors: + +- **`tabReconnaissance.reconnoiterApp`** — deterministic tab discovery (largest cluster of sibling ListItems with SelectionItem pattern), then ONE vision call per tab. Fast (~1–2 min), shallow (only sees top-level tab content). + +- **`iterativeReconnaissance.iterativeReconnoiter`** — multi-turn loop.
Per turn the vision LLM sees the current screenshot + filtered tree + already-discovered list, returns `newDiscoveries[]` plus a `click` / `back` / `done` decision. **Drills INTO modals/dialogs** to enumerate their fields, then clicks Cancel to back out. Way richer — Clock test with 20 turns produced 34 distinct actions across 5 tabs including secondary features (`keepTimerOnTop`, `linkSpotify`, `repeatAlarm` with days-of-week enum, `setAlarmSound` with sound enum). Correctly flagged `resetStopwatch` as destructive. + +Vision model selection: `getReconModel()` defaults to **GPT-v** (the dedicated vision deployment in this Azure config). GPT-5 deployments here returned "API version not supported" for `image_url` content; GPT-4o uses a `/openai/v1/...` URL shape that aiclient doesn't construct correctly. GPT-v on the standard `/openai/deployments/...` path works directly. + +TypeChat wiring for vision: image content goes in `promptHistory` as a prior user message; the schema-bearing text prompt goes via `translate(request)` so TypeChat's standard "respond with this JSON schema" wrapper is appended. The markdown agent uses the same pattern. + +`renderIterativeReconAsGoal(recon)` turns the discovered list into a TODO-style goal string for the explore loop. The explorer then drives each action concretely (with the recon's example parameter values), producing one chunk per intent — clustering and synthesis become easier downstream. + +### Explorer (`explorer.ts` + `llmOracle.ts`) + +Deterministic outer loop with a **pluggable `DecisionOracle`**. Per iteration: + +1. `events.idle` (wait for UIA to settle) +2. `tree.dump` + `tree.fingerprint` → register state (or dedupe against existing) +3. Compute frontier (`frontier.ts`): every actionable, on-screen control gets a FrontierItem with available verbs (`invoke`, `toggle`, `setValue`, `select`, `expand`, `scroll`, …) and a destructive heuristic +4. 
Oracle decides: `act{frontierId, verb, value, expectedDelta, rationale}` | `stop` | `restore` +5. Execute the chosen verb against the chosen selector +6. Re-capture, append a transition (`source: "agent"`) +7. Persist `states.jsonl` + `transitions.jsonl` + `states/state-NNN.json` incrementally — runs are crash-recoverable + +Budget knobs: `maxIterations` / `maxWallClockMs` / `maxStates` / `convergenceThreshold` (iterations since last new state). + +The default `LlmOracle` uses **GPT-5** via TypeChat with a structured output schema (`exploreLlmSchema.ts`). The system prompt instructs breadth-over-depth exploration, popup/modal dismissal, avoidance of destructive actions, and committing to multi-step task completion. Prompt-caching-friendly structure: system prompt + goal stay constant; per-turn input is the current state's frontier + recent history. + +A `StubOracle` exists for tests — it picks deterministically without calling an LLM. + +### Synthesis (`synthesizer.ts`) + +Five-stage pipeline that converts the raw graph into discovered actions: + +1. **Neutral classification** — one GPT-5 call. For each captured state, judge: is this a settled rest point (user could start a new task) or mid-flow (modal, wizard, animation)? Modals / popups / flyouts / "Save"-bearing states are hard rules: NEVER neutral. + +2. **Chunking** — deterministic. Split the transition log at neutral-state boundaries. A chunk is a path from one neutral state to the next neutral state; mid-flow transitions stay together inside one chunk. + +3. **Clustering** — one GPT-5 call covering all chunks. Group by user-meaningful intent. Strict rules: + + - Aggressively merge multi-step task flows. `open dialog → fill fields → click Save` is ONE intent, not three. + - Parameterize by variation. Same selector pattern with different values across chunks → same cluster. + - Toggle-aware. 
The same Play/Pause button being clicked alternately is two clusters (`startStopwatch` and `pauseStopwatch`), not one cluster of nine clicks. + - Don't emit fragments. No `setNameField` cluster — that's a sub-step of `createAlarm`. + +4. **Per-cluster synthesis** — one GPT-5 call per cluster. Build the canonical playback by taking the LONGEST chunk in the cluster (so we don't drop intermediate steps), then for each step: if values vary across chunks → `valueRef "${paramName}"`, else `valueLiteral`. Detect destructive intents (delete/remove/reset/clear). + +5. **Validation pass** — one GPT-5 call. Re-read the full action set and flag fragments / duplicates / broken / ambiguous actions. If duplicates are found (e.g., three `navigateToTabAlarm`/`Timer`/`Clock` actions doing the same thing differently), emit a `MergeRecommendation` with a proposed combined name (`navigateToTab`) and a parameter (`tab: "alarm"|"timer"|"clock"|...`). Apply merges automatically. + +The output is a `discoveredActions.json` with the same outer shape that `crawlDocUrl` etc. 
produce, plus a `playback` field that's specific to the UI-capture path: + +```jsonc +{ + "actionName": "createAlarm", + "description": "Create a new alarm with a specified name and time.", + "parameters": [ + { "name": "name", "type": "string", "examples": ["Morning Alarm"] }, + { "name": "minutes", "type": "number", "examples": [30] }, + ], + "playback": [ + { + "selector": "/.../Button[AutomationId=\"AddAlarmButton\"]", + "verb": "invoke", + }, + { + "selector": "/.../Edit[ClassName=\"TextBox\"]", + "verb": "setValue", + "valueRef": "${name}", + }, + { + "selector": "/.../Custom[AutomationId=\"MinutePicker\"]", + "verb": "setValue", + "valueRef": "${minutes}", + }, + { + "selector": "/.../Button[AutomationId=\"PrimaryButton\"]", + "verb": "invoke", + }, + ], + "preconditions": { + "neutralState": "alarmTab", + "description": "On the Alarm tab", + }, + "postconditions": { "description": "New alarm appears in the alarm list" }, + "destructive": false, +} +``` + +### Playback executor (`playbackExecutor.ts`) + +Generic. Takes a `SynthesizedAction` + `params: Record` + helper client → executes the playback. Resolves `valueRef` against `params`, dispatches each step's verb to the appropriate `do.*` RPC, waits for UIA idle between steps that mutate structure (`invoke` / `select`), and returns a per-step success/failure log. + +The executor is the same machinery used by both: + +- The explorer's per-iteration action execution (via `runExploration`) +- A future runtime agent that exposes the discovered actions to TypeAgent + +So whatever the explorer captured during the crawl is by construction replayable at runtime. + +### Snapshot policy (`snapshotPolicy.ts`) + +Per-integration safety net so a crawl that creates alarms / timers / cities can be reverted. 
Auto-detects UWP storage via `Get-AppxPackage`-derived `PackageFamilyName`, producing a candidate `SnapshotPolicy`: + +```jsonc +{ + "version": 1, + "integrationName": "windowsClock", + "detectionStatus": "auto-candidate", + "processIdentity": { "aumid": "Microsoft.WindowsAlarms_8wekyb3d8bbwe!App" }, + "state": [ + { + "kind": "folder", + "path": "%LOCALAPPDATA%\\Packages\\Microsoft.WindowsAlarms_8wekyb3d8bbwe\\LocalState", + "recursive": true, + }, + { + "kind": "folder", + "path": "%LOCALAPPDATA%\\Packages\\Microsoft.WindowsAlarms_8wekyb3d8bbwe\\Settings", + "recursive": true, + }, + { + "kind": "folder", + "path": "%LOCALAPPDATA%\\Packages\\Microsoft.WindowsAlarms_8wekyb3d8bbwe\\RoamingState", + "recursive": true, + }, + ], +} +``` + +Capture: kill the process if any source needs file locks released, then copy each source. Restore: kill the process, **delete** the target paths (replace-not-merge so files added during the crawl actually disappear), then copy the snapshot back. + +The user is expected to review the auto-detected policy before approving it (`detectionStatus: "user-confirmed"`). A `markStateless` option is available for apps with no persisted state. + +### Dynamic-controls calibration (`dynamicControls.ts`) + +Some UIA controls change value without any user input — clock faces, running timer text, animations. If those leak into the state fingerprint, every fingerprint is unique and dedup breaks. + +A calibration pass takes 3 tree dumps spaced 3 seconds apart with no input, diffs by selector, and emits `DynamicControlRule[]` flagging which controls' `value` / `name` / `toggleState` are unstable. The fingerprint computer (in the C# helper) skips those properties when matched. + +Rules accumulate over time — the explorer can mark new dynamic controls as it observes drift it didn't explicitly cause (`reason: "explore-drift"`). 
+ +### Record mode (`recorder.ts`) + +A separate path, mostly for **augmenting** an autonomous crawl with user-driven gestures the LLM didn't think to try. Subscribe to UIA's `InvokedEvent` / `PropertyChangedEvent` / `StructureChangedEvent` on a target window, and every event becomes a `transitions.jsonl` line. + +Caveat: UIA's `InvokedEvent` doesn't propagate to in-process listeners for UWP apps under same-process synthetic-input invocation — `StructureChangedEvent` works but `InvokedEvent` only reliably fires for true user-driven events from outside the helper's process. + +## On-disk layout per integration + +``` +~/.typeagent/onboarding/<integrationName>/ + snapshotPolicy.json ← persisted from inferSnapshotPolicy + user approval + dynamicControls.json ← from calibrateDynamicControls + accumulating drift + discoveredActions.json ← canonical merged action set (input to phraseGen / runtime) + snapshots/<snapshotId>/ ← baseline + per-state if state-keyed strategy used + manifest.json + sources/<sourceId>/ ← captured folder contents + recordings/<recordingId>/ + transitions.jsonl ← from record mode + runs/<runId>/ ← one per exploration run + states.jsonl ← state metadata index + transitions.jsonl ← edge log + states/state-NNN.json ← full TreeNode per state + screenshots/state-NNN.png ← optional + discoveredActions.json ← THIS RUN's contribution (merged into the workspace one) + synthesisReport.md ← human review, fed into the approve-actions phase + metrics.json ← iteration count, walltime, stop reason, etc.
+``` + +## Running a crawl + +The shipping smoke tests under `test/` exercise each phase: + +| Smoke | What it does | +| ------------------------------------- | ------------------------------------------------------------------------------- | +| `clockSmoke.ts` | helper basics: launch → tree.dump → screenshot → invoke → kill (slice 1+2) | +| `snapshotSmoke.ts` | infer + capture + dirty + restore round-trip on a synthetic state dir (slice 3) | +| `calibrateSmoke.ts` | dynamic-controls calibration on a running stopwatch (slice 4) | +| `recorderSmoke.ts` | event subscription + JSONL recording (slice 5) | +| `exploreSmoke.ts` | autonomous explore loop with the deterministic StubOracle (slice 6a) | +| `llmExploreSmoke.ts` | autonomous explore loop with the LLM oracle (slice 6b) | +| `synthesizeSmoke.ts` | explore + synthesize → discoveredActions.json (slice 7) | +| `clockCrawl.ts` / `clockFullCrawl.ts` | full crawl with snapshot baseline + restore at end | +| `clockAgentDemo.ts` | replay a crawled action with new parameters + verify in the UI | +| `resynthesize.ts` | re-run synthesis on an existing `runs/<runId>/` without re-crawling | +| `clockIterativeRecon.ts` | iterative vision recon only (fast iteration on the recon prompt) | +| `clockReconCrawl.ts` | recon → goal-from-recon → crawl → synthesize → restore (the "best" pipeline) | + +Run any of them with `node packages/agents/onboarding/dist/uiCapture/test/<smokeName>.js` after `pnpm --filter onboarding-agent run build`. + +For a real crawl, env vars `AZURE_OPENAI_API_KEY_GPT_5` + `AZURE_OPENAI_ENDPOINT_GPT_5` must be set in `ts/.env`. Note that aiclient's env reading short-circuits on its empty-string default and doesn't fall back from `_GPT_5`-suffixed vars to base vars, so for non-default settings (timeouts etc.) the suffixed variant must also be set explicitly. The smoke tests handle this in their preamble.
+ +## Quality observations from the Clock crawl + +The pipeline produces real, replay-ready actions, but a few quality patterns are worth knowing: + +- **GPT-5 is the difference for synthesis.** The same crawl data goes from "12 fragmented actions" with the default model to "7 well-shaped actions" with GPT-5 + tightened prompts + validation. The reasoning model is doing structural work that smaller models can't. +- **Vision recon is the difference for coverage.** Free-form explore alone catches ~7-8 actions because the LLM oracle gravitates to obvious primary buttons. Vision-driven iterative recon caught 34 actions on Clock by drilling into Add-X dialogs and noticing secondary features (`keepTimerOnTop`, `linkSpotify`, `repeatAlarm` enum, etc.). Recon is now the recommended starting point. +- **Single-chunk clusters can't parameterize.** If only one chunk in the cluster has `setValue "New York"`, the synthesizer can't guess that the city should be a parameter. A second crawl that exercises the same intent with a different city fixes it. The validator flags these (`ambiguous` verdict, "consider parameterizing X"). +- **Modal-name selectors can decay mid-flow.** Some UWP Group containers embed running state into their `Name` (e.g., `Stopwatch, Paused, 12 seconds 23 centiseconds`). Selectors built on those Names go stale immediately when the state changes. The current selector grammar handles this by adding ClassName disambiguation when no AutomationId is present, but ancestors with dynamic names remain a real issue. +- **UWP InvokedEvent doesn't fire in-process.** The autonomous loop doesn't depend on this (it re-dumps the tree after each action), but record-mode against a same-process driver only catches StructureChanged events. +- **Snapshot-restore is critical.** Without the baseline, every crawl leaves alarms / timers / cities behind. Make sure `snapshotPolicy.detectionStatus === "user-confirmed"` before any non-trivial run. 
+- **API-version + URL-shape gotchas in aiclient.** GPT-5 and GPT-v deployments work via standard Azure paths. GPT-4o uses `/openai/v1/chat/completions` (Responses-API style), which aiclient doesn't construct correctly — the request returns "API version not supported." Stick to GPT-5 for synthesis and GPT-v for vision; revisit if we need GPT-4o specifically. +- **Endpoint-suffixed env vars don't fall back.** aiclient's `getEnvSetting` short-circuits on its empty-string default and doesn't fall back from `_GPT_5`-suffixed vars to base vars when an endpoint suffix is set. Smoke tests have to set BOTH `AZURE_OPENAI_MAX_TIMEOUT` and `AZURE_OPENAI_MAX_TIMEOUT_GPT_5` (and `_GPT_v`) explicitly. + +## Adding a new integration + +For a UWP app with a Microsoft AUMID and a recognizable English UI, the recommended sequence is: + +1. `inferSnapshotPolicy({ aumid })` → review the candidate folders, set `detectionStatus: "user-confirmed"` +2. Optional: `calibrateDynamicControls()` if the app has live time / progress / animation that would otherwise pollute the state fingerprint +3. **`iterativeReconnoiter({ appHint, maxIterations: 20–25 })`** — vision LLM enumerates actions per tab including modals. This is the new step that dramatically lifts coverage. +4. `runExploration({ goal: renderIterativeReconAsGoal(recon), budget: { maxIterations: 30–60 } })` — the explorer drives the recon's TODO list; its LLM oracle picks moves to complete each action. +5. `synthesize({ runDir, integrationName, workspaceDir })` — neutral classify → chunk → cluster → synthesize → validate; merges into the workspace-level `discoveredActions.json`. +6. Inspect `discoveredActions.json` and `synthesisReport.md`. If validation flagged duplicates / fragments, the merge step already auto-fixed obvious ones; the rest are notes for human review. +7. If gaps remain, re-run `runExploration` with a _focused_ goal naming only the missing area; synthesis merges automatically.
Per-tab focused crawls produce cleaner output than one mega-crawl. + +The `clockReconCrawl.ts` smoke runs this whole sequence end-to-end against Windows Clock — read it as a reference implementation. + +For non-UWP apps: snapshot auto-detection won't find folders, so the user fills in the policy manually (or sets `markStateless` if there's no persisted state). Reconnaissance + exploration work the same way. diff --git a/ts/packages/agents/onboarding/src/uiCapture/dynamicControls.ts b/ts/packages/agents/onboarding/src/uiCapture/dynamicControls.ts new file mode 100644 index 0000000000..f351269265 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/dynamicControls.ts @@ -0,0 +1,268 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import path from "node:path"; + +import { HelperClient } from "./helperClient.js"; +import type { + ControlMatcher, + DynamicControlRule, + DynamicControlsFile, + DynamicProperty, + TreeNode, +} from "./types.js"; + +/** + * Run a calibration pass: take N tree dumps with delays between them, + * compare for value/name drift, and emit per-element rules with + * `reason: "calibration-drift"`. + * + * The app should be in a stable, observable state — no agent or user input + * during calibration. Anything that changes value/name is presumed dynamic. + */ +export async function calibrateDynamicControls(opts: { + client: HelperClient; + rootSelector: string; + integrationName: string; + dumpCount?: number; + delayMs?: number; + maxDepth?: number; +}): Promise { + const dumpCount = opts.dumpCount ?? 3; + const delayMs = opts.delayMs ?? 3000; + const maxDepth = opts.maxDepth ?? 
8; + const startedAt = Date.now(); + + const dumps: TreeNode[] = []; + for (let i = 0; i < dumpCount; i++) { + if (i > 0) { + await sleep(delayMs); + } + dumps.push( + await opts.client.treeDump({ + root: opts.rootSelector, + maxDepth, + }), + ); + } + + const rules = diffDumpsToRules(dumps); + const now = new Date().toISOString(); + for (const r of rules) { + r.firstSeen = now; + r.lastConfirmed = now; + } + + return { + version: 1, + integrationName: opts.integrationName, + calibration: { + lastRun: now, + durationMs: Date.now() - startedAt, + dumpsCompared: dumps.length, + }, + rules, + }; +} + +/** + * Index a tree by selector for cross-dump comparison. + */ +function indexBySelector(node: TreeNode, out: Map): void { + out.set(node.selector, node); + for (const c of node.children) { + indexBySelector(c, out); + } +} + +function diffDumpsToRules(dumps: TreeNode[]): DynamicControlRule[] { + if (dumps.length < 2) { + return []; + } + + const indexed: Map[] = dumps.map((d) => { + const m = new Map(); + indexBySelector(d, m); + return m; + }); + + // Find selectors present in ALL dumps. + const common: string[] = []; + for (const sel of indexed[0]!.keys()) { + if (indexed.every((m) => m.has(sel))) { + common.push(sel); + } + } + + const rules: DynamicControlRule[] = []; + let id = 1; + for (const sel of common) { + const nodes = indexed.map((m) => m.get(sel)!); + const dynProps = detectDynamicProperties(nodes); + if (dynProps.length === 0) { + continue; + } + const exemplar = nodes[0]!; + const matcher = chooseMatcher(exemplar); + const transitions = countTransitions(nodes, dynProps); + rules.push({ + id: `cal-${id++}`, + match: matcher, + dynamicProperties: dynProps, + ...(deriveSemantic(exemplar) !== undefined + ? 
{ semantic: deriveSemantic(exemplar) as string } + : {}), + reason: "calibration-drift", + confidence: Math.min(1, transitions / (dumps.length - 1)), + observations: transitions, + firstSeen: "", + lastConfirmed: "", + }); + } + return rules; +} + +function detectDynamicProperties(nodes: TreeNode[]): DynamicProperty[] { + const props: DynamicProperty[] = []; + if ( + nodes.some( + (n, i) => i > 0 && (nodes[i - 1]!.value ?? "") !== (n.value ?? ""), + ) + ) { + props.push("value"); + } + if ( + nodes.some( + (n, i) => i > 0 && (nodes[i - 1]!.name ?? "") !== (n.name ?? ""), + ) + ) { + props.push("name"); + } + if ( + nodes.some( + (n, i) => + i > 0 && + (nodes[i - 1]!.toggleState ?? "") !== (n.toggleState ?? ""), + ) + ) { + props.push("toggleState"); + } + return props; +} + +function countTransitions(nodes: TreeNode[], props: DynamicProperty[]): number { + let transitions = 0; + for (let i = 1; i < nodes.length; i++) { + for (const p of props) { + if ( + (getProp(nodes[i - 1]!, p) ?? "") !== + (getProp(nodes[i]!, p) ?? "") + ) { + transitions++; + break; + } + } + } + return transitions; +} + +function getProp(n: TreeNode, p: DynamicProperty): string | undefined { + return p === "value" ? n.value : p === "name" ? n.name : n.toggleState; +} + +function chooseMatcher(n: TreeNode): ControlMatcher { + if (n.automationId) { + return { kind: "automationId", value: n.automationId }; + } + return { kind: "selector", value: n.selector }; +} + +function deriveSemantic(n: TreeNode): string | undefined { + if (n.automationId) { + return n.automationId; + } + if (n.name) { + return n.name.length > 40 ? 
n.name.slice(0, 40) : n.name; + } + return undefined; +} + +async function sleep(ms: number): Promise { + await new Promise((res) => setTimeout(res, ms)); +} + +/* persistence */ + +export function loadDynamicControls( + workspaceDir: string, +): DynamicControlsFile | null { + const file = path.join(workspaceDir, "dynamicControls.json"); + if (!existsSync(file)) { + return null; + } + return JSON.parse(readFileSync(file, "utf8")) as DynamicControlsFile; +} + +export function saveDynamicControls( + workspaceDir: string, + file: DynamicControlsFile, +): void { + mkdirSync(workspaceDir, { recursive: true }); + writeFileSync( + path.join(workspaceDir, "dynamicControls.json"), + JSON.stringify(file, null, 2), + ); +} + +/** + * Merge new rules into an existing file, deduping by matcher equivalence. + * Bumps observations/lastConfirmed on hits. + */ +export function mergeDynamicControls( + base: DynamicControlsFile, + incoming: DynamicControlRule[], +): DynamicControlsFile { + const now = new Date().toISOString(); + const mergedRules: DynamicControlRule[] = [...base.rules]; + for (const inc of incoming) { + const existing = mergedRules.find((r) => + sameMatcher(r.match, inc.match), + ); + if (existing) { + existing.observations += inc.observations; + existing.lastConfirmed = now; + for (const p of inc.dynamicProperties) { + if (!existing.dynamicProperties.includes(p)) { + existing.dynamicProperties.push(p); + } + } + } else { + mergedRules.push({ ...inc, lastConfirmed: now }); + } + } + return { ...base, rules: mergedRules }; +} + +function sameMatcher(a: ControlMatcher, b: ControlMatcher): boolean { + if (a.kind !== b.kind) { + return false; + } + switch (a.kind) { + case "automationId": + return a.value === (b as typeof a).value; + case "selector": + return a.value === (b as typeof a).value; + case "selectorPattern": + return a.pattern === (b as typeof a).pattern; + case "container": { + const cb = b as typeof a; + return ( + a.container === cb.container && + 
a.controlType === cb.controlType && + (a.nameRegex ?? "") === (cb.nameRegex ?? "") && + (a.classNameRegex ?? "") === (cb.classNameRegex ?? "") + ); + } + } +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/exploreGraph.ts b/ts/packages/agents/onboarding/src/uiCapture/exploreGraph.ts new file mode 100644 index 0000000000..cabe28afa0 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/exploreGraph.ts @@ -0,0 +1,186 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { + appendFileSync, + createWriteStream, + existsSync, + mkdirSync, + readFileSync, + writeFileSync, + WriteStream, +} from "node:fs"; +import path from "node:path"; + +import type { CapturedState, CapturedTransition } from "./exploreTypes.js"; +import type { TreeNode } from "./types.js"; + +/** + * On-disk layout for one exploration run: + * /states.jsonl metadata index + * /transitions.jsonl edges + * /states/state-NNN.json full TreeNode per state + * /screenshots/state-NNN.png optional + * /run.json run config + status + * + * Append-only JSONL allows in-place resume after a crash; the in-memory + * fingerprint→stateId map is rebuilt from states.jsonl on load. + */ +export class ExploreGraph { + private readonly statesByFp = new Map(); + private readonly statesById = new Map(); + private readonly transitions: CapturedTransition[] = []; + private nextStateNum: number; + private nextTransitionNum: number; + + private readonly statesStream: WriteStream; + private readonly transitionsStream: WriteStream; + + constructor(public readonly runDir: string) { + mkdirSync(path.join(runDir, "states"), { recursive: true }); + mkdirSync(path.join(runDir, "screenshots"), { recursive: true }); + + // Restore prior state if files already exist. 
+ const statesFile = path.join(runDir, "states.jsonl"); + const transitionsFile = path.join(runDir, "transitions.jsonl"); + if (existsSync(statesFile)) { + for (const line of readFileSync(statesFile, "utf8") + .split("\n") + .filter((l) => l.length > 0)) { + const s = JSON.parse(line) as CapturedState; + this.statesByFp.set(s.fingerprint, s.id); + this.statesById.set(s.id, s); + } + } + if (existsSync(transitionsFile)) { + for (const line of readFileSync(transitionsFile, "utf8") + .split("\n") + .filter((l) => l.length > 0)) { + this.transitions.push(JSON.parse(line) as CapturedTransition); + } + } + this.nextStateNum = this.statesById.size + 1; + this.nextTransitionNum = this.transitions.length + 1; + this.statesStream = createWriteStream(statesFile, { flags: "a" }); + this.transitionsStream = createWriteStream(transitionsFile, { + flags: "a", + }); + } + + get stateCount(): number { + return this.statesById.size; + } + + get transitionCount(): number { + return this.transitions.length; + } + + get successfulTransitionCount(): number { + let n = 0; + for (const t of this.transitions) if (t.success) n++; + return n; + } + + get failedTransitionCount(): number { + return this.transitions.length - this.successfulTransitionCount; + } + + findStateByFingerprint(fingerprint: string): CapturedState | undefined { + const id = this.statesByFp.get(fingerprint); + return id ? this.statesById.get(id) : undefined; + } + + listStateSummaries(): Array<{ + id: string; + label?: string; + fingerprint: string; + }> { + const out: Array<{ id: string; label?: string; fingerprint: string }> = + []; + for (const s of this.statesById.values()) { + const item: { id: string; label?: string; fingerprint: string } = { + id: s.id, + fingerprint: s.fingerprint, + }; + if (s.label !== undefined) item.label = s.label; + out.push(item); + } + return out; + } + + /** + * Register a new state if its fingerprint is novel; otherwise return the + * existing state record. 
The full tree JSON is persisted on disk. + */ + upsertState(opts: { + fingerprint: string; + windowTitle: string; + tree: TreeNode; + screenshotPngBase64?: string; + label?: string; + }): { state: CapturedState; isNew: boolean } { + const existingId = this.statesByFp.get(opts.fingerprint); + if (existingId) { + return { state: this.statesById.get(existingId)!, isNew: false }; + } + const id = `state-${this.nextStateNum.toString().padStart(3, "0")}`; + this.nextStateNum++; + const treeFile = path.join("states", `${id}.json`); + writeFileSync( + path.join(this.runDir, treeFile), + JSON.stringify(opts.tree, null, 2), + ); + let screenshotFile: string | undefined; + if (opts.screenshotPngBase64) { + const sf = path.join("screenshots", `${id}.png`); + writeFileSync( + path.join(this.runDir, sf), + Buffer.from(opts.screenshotPngBase64, "base64"), + ); + screenshotFile = sf; + } + const state: CapturedState = { + id, + fingerprint: opts.fingerprint, + capturedAt: Date.now(), + windowTitle: opts.windowTitle, + treeFile, + ...(screenshotFile !== undefined ? { screenshotFile } : {}), + ...(opts.label !== undefined ? 
{ label: opts.label } : {}), + }; + this.statesByFp.set(state.fingerprint, state.id); + this.statesById.set(state.id, state); + this.statesStream.write(JSON.stringify(state) + "\n"); + return { state, isNew: true }; + } + + addTransition(t: Omit): CapturedTransition { + const id = `trans-${this.nextTransitionNum.toString().padStart(4, "0")}`; + this.nextTransitionNum++; + const full: CapturedTransition = { ...t, id }; + this.transitions.push(full); + this.transitionsStream.write(JSON.stringify(full) + "\n"); + return full; + } + + recentTransitions(n: number): CapturedTransition[] { + return this.transitions.slice(Math.max(0, this.transitions.length - n)); + } + + async close(): Promise { + await Promise.all([ + new Promise((res) => this.statesStream.end(() => res())), + new Promise((res) => this.transitionsStream.end(() => res())), + ]); + } + + /** + * Append a JSON line to a sibling file (used for run.json snapshots). + */ + writeRunMeta(name: string, content: object): void { + appendFileSync( + path.join(this.runDir, name), + JSON.stringify(content) + "\n", + ); + } +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/exploreLlmSchema.ts b/ts/packages/agents/onboarding/src/uiCapture/exploreLlmSchema.ts new file mode 100644 index 0000000000..db08d0f6a8 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/exploreLlmSchema.ts @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// TypeChat schema for the autonomous-explore decision oracle. +// Loaded as text by TypeChat — keep self-contained, no runtime imports. + +/** One decision per iteration: act / stop / restore. */ +export type ExploreDecision = ActDecision | StopDecision | RestoreDecision; + +/** Take an action against a control on the current frontier. */ +export type ActDecision = { + /** Always "act". */ + kind: "act"; + /** ID of a frontier item from the input (e.g. "F-007"). Must be one shown to you. 
*/ + frontierId: string; + /** Verb to apply. Must be one of the verbs declared on the chosen frontier item. */ + verb: + | "invoke" + | "toggle" + | "setValue" + | "select" + | "expand" + | "scroll" + | "focus" + | "click"; + /** + * Target value for setValue / toggle / select. Omit for verbs that take no value + * (invoke, focus, click, scroll, expand without an explicit boolean). + */ + value?: string | number | boolean; + /** + * Short prediction of how the app state will change after this action. + * Compared against the observed delta on the next iteration. + */ + expectedDelta: string; + /** One sentence: why this action advances the goal. */ + rationale: string; +}; + +/** End exploration. */ +export type StopDecision = { + /** Always "stop". */ + kind: "stop"; + /** Why exploration is complete (e.g. "all observed states have empty frontier"). */ + reason: string; +}; + +/** + * Reset to baseline. Use when current branch is exhausted or the app is in + * an unhelpful state and a clean slate is needed. + */ +export type RestoreDecision = { + /** Always "restore". */ + kind: "restore"; + /** One sentence: why a restore is preferable to acting in the current state. */ + rationale: string; +}; diff --git a/ts/packages/agents/onboarding/src/uiCapture/exploreTypes.ts b/ts/packages/agents/onboarding/src/uiCapture/exploreTypes.ts new file mode 100644 index 0000000000..6c8e05cf43 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/exploreTypes.ts @@ -0,0 +1,109 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import type { ActionVerb } from "./types.js"; + +export type CapturedState = { + id: string; + fingerprint: string; + capturedAt: number; + windowTitle: string; + treeFile: string; + screenshotFile?: string; + label?: string; + notes?: string; +}; + +export type TransitionSource = "agent" | "user" | "external"; + +export type CapturedTransition = { + id: string; + iteration: number; + fromStateId: string; + toStateId: string; + trigger: { + selector: string; + verb: ActionVerb; + value?: string | number | boolean; + }; + rationale?: string; + expectedDelta?: string; + observedDeltaSummary?: string; + source: TransitionSource; + timestamp: number; + success: boolean; + errorMessage?: string; +}; + +export type FrontierVerb = { + verb: ActionVerb; + valueShape?: "free-text" | "range" | "selection" | "boolean" | "none"; + rangeMeta?: { min: number; max: number; step?: number }; + selectionItems?: string[]; +}; + +export type FrontierItem = { + id: string; + selector: string; + controlType: string; + name?: string; + automationId?: string; + className?: string; + verbs: FrontierVerb[]; + destructiveHint: boolean; + boundingRect?: { x: number; y: number; width: number; height: number }; +}; + +export type ExploreDecisionAct = { + kind: "act"; + frontierId: string; + verb: ActionVerb; + value?: string | number | boolean; + expectedDelta: string; + rationale: string; +}; + +export type ExploreDecisionStop = { kind: "stop"; reason: string }; +export type ExploreDecisionRestore = { kind: "restore"; rationale: string }; +export type ExploreDecisionUserPause = { kind: "userPause"; rationale: string }; + +export type ExploreDecision = + | ExploreDecisionAct + | ExploreDecisionStop + | ExploreDecisionRestore + | ExploreDecisionUserPause; + +export type DecisionInput = { + iteration: number; + state: CapturedState; + frontier: FrontierItem[]; + visitedStates: Array<{ id: string; label?: string; fingerprint: string }>; + recentTransitions: CapturedTransition[]; + budget: { 
remainingIterations: number; remainingMs: number }; +}; + +export interface DecisionOracle { + decide(input: DecisionInput): Promise; +} + +export type ExploreBudget = { + maxIterations?: number; + maxWallClockMs?: number; + maxStates?: number; + convergenceThreshold?: number; + historyTailSize?: number; +}; + +export type ExploreRunMetrics = { + runId: string; + startedAt: string; + endedAt: string; + walltimeMs: number; + iterations: number; + statesDiscovered: number; + transitionsRecorded: number; + successfulTransitions: number; + failedTransitions: number; + stopReason: string; + convergenceIterations: number; +}; diff --git a/ts/packages/agents/onboarding/src/uiCapture/explorer.ts b/ts/packages/agents/onboarding/src/uiCapture/explorer.ts new file mode 100644 index 0000000000..d712d96964 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/explorer.ts @@ -0,0 +1,382 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { mkdirSync, writeFileSync } from "node:fs"; +import path from "node:path"; + +import type { + CapturedState, + CapturedTransition, + DecisionInput, + DecisionOracle, + ExploreBudget, + ExploreDecision, + ExploreRunMetrics, + FrontierItem, +} from "./exploreTypes.js"; +import { ExploreGraph } from "./exploreGraph.js"; +import { computeFrontier } from "./frontier.js"; +import type { HelperClient } from "./helperClient.js"; +import type { DynamicControlRule } from "./types.js"; + +const DEFAULT_BUDGET: Required = { + maxIterations: 200, + maxWallClockMs: 30 * 60_000, + maxStates: 50, + convergenceThreshold: 15, + historyTailSize: 5, +}; + +export type ExploreOptions = { + client: HelperClient; + oracle: DecisionOracle; + workspaceDir: string; + rootSelector: string; + runId?: string; + dynamicRules?: DynamicControlRule[]; + captureScreenshots?: boolean; + treeMaxDepth?: number; + idleDebounceMs?: number; + idleMaxWaitMs?: number; + budget?: ExploreBudget; + onIteration?: (info: { + iteration: number; 
+ state: CapturedState; + decision: ExploreDecision; + }) => void; +}; + +/** + * Deterministic outer loop: capture state → ask oracle → execute → capture + * post-state → record transition → persist. The oracle decides; we just + * orchestrate. + * + * Snapshot/restore integration is intentionally NOT here yet — slice 6 + * focuses on the loop+graph mechanics. Wiring snapshot capture/restore + * around runs lands in a follow-up. + */ +export async function runExploration( + opts: ExploreOptions, +): Promise { + const budget = { ...DEFAULT_BUDGET, ...(opts.budget ?? {}) }; + const runId = opts.runId ?? new Date().toISOString().replace(/[:.]/g, "-"); + const runDir = path.join(opts.workspaceDir, "runs", runId); + mkdirSync(runDir, { recursive: true }); + + const graph = new ExploreGraph(runDir); + + const startedAt = new Date(); + const startTime = Date.now(); + const idleDebounceMs = opts.idleDebounceMs ?? 500; + const idleMaxWaitMs = opts.idleMaxWaitMs ?? 4000; + + let iteration = 0; + let stopReason = "loop-completed"; + let lastNewStateIteration = 0; + + try { + // Pre-loop: capture initial state. 
+ await opts.client.eventsIdle({ + debounceMs: idleDebounceMs, + maxWaitMs: idleMaxWaitMs, + }); + let { state, frontier } = await captureState( + opts, + graph, + opts.rootSelector, + ); + if (graph.stateCount === 1) { + lastNewStateIteration = 0; + } + + while (true) { + const elapsed = Date.now() - startTime; + const remainingIterations = budget.maxIterations - iteration; + const remainingMs = budget.maxWallClockMs - elapsed; + + if (iteration >= budget.maxIterations) { + stopReason = "max-iterations"; + break; + } + if (elapsed >= budget.maxWallClockMs) { + stopReason = "max-walltime"; + break; + } + if (graph.stateCount >= budget.maxStates) { + stopReason = "max-states"; + break; + } + if ( + iteration - lastNewStateIteration >= + budget.convergenceThreshold + ) { + stopReason = "converged"; + break; + } + + iteration++; + const input: DecisionInput = { + iteration, + state, + frontier, + visitedStates: graph.listStateSummaries(), + recentTransitions: graph.recentTransitions( + budget.historyTailSize, + ), + budget: { remainingIterations, remainingMs }, + }; + + const decision = await opts.oracle.decide(input); + opts.onIteration?.({ iteration, state, decision }); + + if (decision.kind === "stop") { + stopReason = `oracle-stop: ${decision.reason}`; + break; + } + if (decision.kind === "userPause") { + // For slice 6a: no implementation; treat as a soft idle. + await sleep(2000); + ({ state, frontier } = await captureState( + opts, + graph, + opts.rootSelector, + )); + continue; + } + if (decision.kind === "restore") { + // Slice 6a stub: just re-capture (no snapshot integration yet). + ({ state, frontier } = await captureState( + opts, + graph, + opts.rootSelector, + )); + continue; + } + + // decision.kind === "act" + const item = frontier.find((f) => f.id === decision.frontierId); + if (!item) { + // Oracle picked a stale frontier id; record + skip. 
+ graph.addTransition({ + iteration, + fromStateId: state.id, + toStateId: state.id, + trigger: { selector: "(invalid)", verb: decision.verb }, + rationale: decision.rationale, + expectedDelta: decision.expectedDelta, + source: "agent", + timestamp: Date.now(), + success: false, + errorMessage: `unknown frontier id ${decision.frontierId}`, + }); + continue; + } + + const transition = await executeAction(opts, graph, { + iteration, + fromState: state, + item, + decision, + rootSelector: opts.rootSelector, + idleDebounceMs, + idleMaxWaitMs, + }); + + // Re-capture for next iteration. + const next = await captureState(opts, graph, opts.rootSelector); + if ( + graph.findStateByFingerprint(next.state.fingerprint)?.id === + next.state.id && + next.state.id !== state.id + ) { + lastNewStateIteration = iteration; + } + state = next.state; + frontier = next.frontier; + // ensure graph has transition's toState resolved for callers reading it + void transition; + } + } finally { + await graph.close(); + } + + const endedAt = new Date(); + const metrics: ExploreRunMetrics = { + runId, + startedAt: startedAt.toISOString(), + endedAt: endedAt.toISOString(), + walltimeMs: endedAt.getTime() - startedAt.getTime(), + iterations: iteration, + statesDiscovered: graph.stateCount, + transitionsRecorded: graph.transitionCount, + successfulTransitions: graph.successfulTransitionCount, + failedTransitions: graph.failedTransitionCount, + stopReason, + convergenceIterations: lastNewStateIteration, + }; + writeFileSync( + path.join(runDir, "metrics.json"), + JSON.stringify(metrics, null, 2), + ); + return metrics; +} + +async function captureState( + opts: ExploreOptions, + graph: ExploreGraph, + rootSelector: string, +): Promise<{ state: CapturedState; frontier: FrontierItem[] }> { + const fp = await opts.client.treeFingerprint({ + root: rootSelector, + ...(opts.dynamicRules !== undefined + ? 
{ dynamicRules: opts.dynamicRules } + : {}), + }); + const tree = await opts.client.treeDump({ + root: rootSelector, + maxDepth: opts.treeMaxDepth ?? 12, + }); + let screenshotPngBase64: string | undefined; + if (opts.captureScreenshots) { + try { + const shot = await opts.client.screenshot({ root: rootSelector }); + screenshotPngBase64 = shot.pngBase64; + } catch { + /* screenshots are best-effort */ + } + } + const { state } = graph.upsertState({ + fingerprint: fp.hash, + windowTitle: fp.activeWindowTitle, + tree, + ...(screenshotPngBase64 !== undefined ? { screenshotPngBase64 } : {}), + }); + const frontier = computeFrontier(tree); + return { state, frontier }; +} + +async function executeAction( + opts: ExploreOptions, + graph: ExploreGraph, + args: { + iteration: number; + fromState: CapturedState; + item: FrontierItem; + decision: { + verb: string; + value?: string | number | boolean; + expectedDelta: string; + rationale: string; + }; + rootSelector: string; + idleDebounceMs: number; + idleMaxWaitMs: number; + }, +): Promise { + const { client } = opts; + const { item, decision } = args; + const verb = decision.verb; + + let success = true; + let errorMessage: string | undefined; + try { + switch (verb) { + case "invoke": + await client.doInvoke({ selector: item.selector }); + break; + case "toggle": + await client.doToggle({ + selector: item.selector, + ...(typeof decision.value === "boolean" + ? { value: decision.value } + : {}), + }); + break; + case "select": + await client.doSelect({ + selector: item.selector, + ...(decision.value !== undefined + ? { item: decision.value as string | number } + : {}), + }); + break; + case "expand": + await client.doExpand({ + selector: item.selector, + expand: decision.value !== false, + }); + break; + case "setValue": + await client.doSetValue({ + selector: item.selector, + value: decision.value ?? 
"", + }); + break; + case "scroll": + await client.doScroll({ + selector: item.selector, + direction: "down", + }); + break; + case "focus": + await client.doFocus({ selector: item.selector }); + break; + case "click": + await client.doClick({ selector: item.selector }); + break; + default: + success = false; + errorMessage = `unsupported verb ${verb}`; + } + } catch (e) { + success = false; + errorMessage = e instanceof Error ? e.message : String(e); + } + + await client.eventsIdle({ + debounceMs: args.idleDebounceMs, + maxWaitMs: args.idleMaxWaitMs, + }); + + // Capture post-state to link transition. + const fp = await client.treeFingerprint({ + root: args.rootSelector, + ...(opts.dynamicRules !== undefined + ? { dynamicRules: opts.dynamicRules } + : {}), + }); + const tree = await client.treeDump({ + root: args.rootSelector, + maxDepth: opts.treeMaxDepth ?? 12, + }); + const { state: toState } = graph.upsertState({ + fingerprint: fp.hash, + windowTitle: fp.activeWindowTitle, + tree, + }); + + return graph.addTransition({ + iteration: args.iteration, + fromStateId: args.fromState.id, + toStateId: toState.id, + trigger: { + selector: item.selector, + verb: verb as any, + ...(decision.value !== undefined ? { value: decision.value } : {}), + }, + rationale: decision.rationale, + expectedDelta: decision.expectedDelta, + observedDeltaSummary: + args.fromState.id === toState.id + ? "no observable state change" + : `${args.fromState.id} → ${toState.id}`, + source: "agent", + timestamp: Date.now(), + success, + ...(errorMessage !== undefined ? 
{ errorMessage } : {}), + }); +} + +async function sleep(ms: number): Promise { + await new Promise((res) => setTimeout(res, ms)); +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/frontier.ts b/ts/packages/agents/onboarding/src/uiCapture/frontier.ts new file mode 100644 index 0000000000..c66cb6b464 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/frontier.ts @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import type { FrontierItem, FrontierVerb } from "./exploreTypes.js"; +import type { Pattern, TreeNode } from "./types.js"; + +const DESTRUCTIVE_RE = + /\b(delete|remove|reset|clear|erase|destroy|trash|discard)\b/i; + +/** + * Walk a tree dump and emit one FrontierItem per actionable element. + * "Actionable" = supports a pattern that maps to one of our verbs AND + * is enabled and on-screen. + */ +export function computeFrontier(root: TreeNode): FrontierItem[] { + const items: FrontierItem[] = []; + let counter = 1; + walk(root, items, () => `F-${(counter++).toString().padStart(3, "0")}`); + sortByPriority(items); + return items; +} + +function walk(node: TreeNode, out: FrontierItem[], nextId: () => string): void { + if (node.isEnabled && !node.isOffscreen) { + const verbs = verbsFor(node); + if (verbs.length > 0) { + const item: FrontierItem = { + id: nextId(), + selector: node.selector, + controlType: node.controlType, + verbs, + destructiveHint: isDestructive(node), + boundingRect: node.boundingRect, + }; + if (node.name !== undefined) item.name = node.name; + if (node.automationId !== undefined) + item.automationId = node.automationId; + if (node.className !== undefined) item.className = node.className; + out.push(item); + } + } + for (const c of node.children) { + walk(c, out, nextId); + } +} + +function verbsFor(node: TreeNode): FrontierVerb[] { + const verbs: FrontierVerb[] = []; + const has = (p: Pattern) => node.patterns.includes(p); + + if (has("Invoke")) { + verbs.push({ verb: 
"invoke", valueShape: "none" }); + } + if (has("Toggle")) { + verbs.push({ verb: "toggle", valueShape: "boolean" }); + } + // SelectionItem first (the item itself can be selected) — overrides Selection container if both present. + if (has("SelectionItem")) { + verbs.push({ verb: "select", valueShape: "none" }); + } else if (has("Selection")) { + verbs.push({ verb: "select", valueShape: "selection" }); + } + if (has("ExpandCollapse")) { + verbs.push({ verb: "expand", valueShape: "boolean" }); + } + if (has("Value")) { + verbs.push({ verb: "setValue", valueShape: "free-text" }); + } + if (has("RangeValue") && !has("Value")) { + verbs.push({ verb: "setValue", valueShape: "range" }); + } + if (has("Scroll")) { + verbs.push({ verb: "scroll", valueShape: "none" }); + } + return verbs; +} + +function isDestructive(node: TreeNode): boolean { + const text = `${node.name ?? ""} ${node.automationId ?? ""}`; + return DESTRUCTIVE_RE.test(text); +} + +function sortByPriority(items: FrontierItem[]): void { + items.sort((a, b) => priority(a) - priority(b)); +} + +function priority(item: FrontierItem): number { + // Lower is better. + let p = 0; + if (item.destructiveHint) { + p += 1000; // push destructive items to the back + } + // High-signal control types come first. + const ct = item.controlType; + if (ct === "Button" || ct === "MenuItem" || ct === "ListItem") { + p += 0; + } else if (ct === "Edit" || ct === "ComboBox" || ct === "CheckBox") { + p += 10; + } else { + p += 50; + } + // Stable identifiers preferred. + if (!item.automationId) { + p += 5; + } + return p; +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/helperClient.ts b/ts/packages/agents/onboarding/src/uiCapture/helperClient.ts new file mode 100644 index 0000000000..98a05c09ea --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/helperClient.ts @@ -0,0 +1,466 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+
+import { ChildProcess, spawn } from "node:child_process";
+import { existsSync } from "node:fs";
+import path from "node:path";
+import { createInterface, Interface } from "node:readline";
+import { fileURLToPath } from "node:url";
+
+import type {
+    DynamicControlRule,
+    FingerprintResult,
+    Rect,
+    Screenshot,
+    SnapshotPolicy,
+    TreeNode,
+    WindowInfo,
+} from "./types.js";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const HELPER_SLN_RELATIVE = "dotnet/uiAutomationHelper/UiAutomationHelper.sln";
+
+/** Settlement callbacks for one in-flight JSON-RPC request (stored by request id). */
+type Pending = {
+    resolve: (value: unknown) => void;
+    reject: (error: Error) => void;
+};
+
+/** Error built from a JSON-RPC error response; `code` carries the RPC error code. */
+export type HelperRpcError = Error & { code?: number };
+
+export type EventType =
+    | "Invoked"
+    | "ValueChanged"
+    | "ToggleStateChanged"
+    | "StructureChanged";
+
+export type ControlSnapshot = {
+    controlType: string;
+    name?: string;
+    automationId?: string;
+    className?: string;
+    value?: string;
+    toggleState?: string;
+};
+
+/** Payload of an `event.fired` notification sent by the helper. */
+export type CapturedEvent = {
+    subscriptionId: string;
+    eventType: string;
+    selector: string;
+    controlSnapshot?: ControlSnapshot;
+    newValue?: string;
+    changeType?: string;
+    timestamp: string;
+};
+
+export type EventHandler = (evt: CapturedEvent) => void;
+
+export interface HelperClientOptions {
+    binaryPath?: string;
+    debug?: boolean;
+}
+
+/**
+ * Resolves the helper binary path. Order:
+ *   1. opts.binaryPath
+ *   2. TYPEAGENT_UIA_HELPER env var
+ *   3. Repo-relative dev path (for local development)
+ */
+function resolveBinary(opts: HelperClientOptions): string {
+    if (opts.binaryPath) {
+        return opts.binaryPath;
+    }
+    if (process.env.TYPEAGENT_UIA_HELPER) {
+        return process.env.TYPEAGENT_UIA_HELPER;
+    }
+    // From dist/uiCapture/helperClient.js, repo root is six levels up.
+    const repoRelative = path.resolve(
+        __dirname,
+        "../../../../../..",
+        "dotnet/uiAutomationHelper/bin/Release/UiAutomationHelper.exe",
+    );
+    return repoRelative;
+}
+
+/** Absolute path of the helper's .sln, resolved the same way as the dev binary path. */
+function resolveHelperSln(): string {
+    return path.resolve(__dirname, "../../../../../..", HELPER_SLN_RELATIVE);
+}
+
+/**
+ * Thrown by HelperClient.start() when the helper executable is missing.
+ * Callers can catch this to offer to build the helper interactively
+ * instead of surfacing a raw error.
+ */
+export class HelperBinaryMissingError extends Error {
+    readonly binaryPath: string;
+    readonly slnPath: string;
+
+    constructor(binaryPath: string, slnPath: string) {
+        super(
+            `Helper binary not found at ${binaryPath}. ` +
+                `Build it via: dotnet build -c Release ${slnPath}`,
+        );
+        this.name = "HelperBinaryMissingError";
+        this.binaryPath = binaryPath;
+        this.slnPath = slnPath;
+    }
+}
+
+/**
+ * Builds the .NET UI Automation helper via `dotnet build -c Release`.
+ *
+ * @param opts.onProgress Called with each raw stdout chunk as it arrives.
+ * @returns Resolves when the build exits with code 0.
+ * @throws Error if the solution file is missing, if `dotnet` cannot be
+ *   spawned, or if the build exits non-zero. In the last case the message
+ *   contains stderr, or the tail of stdout when stderr is empty.
+ */
+export async function buildHelperBinary(
+    opts: { onProgress?: (line: string) => void } = {},
+): Promise<void> {
+    const slnPath = resolveHelperSln();
+    if (!existsSync(slnPath)) {
+        throw new Error(`Helper solution not found at ${slnPath}`);
+    }
+    return new Promise<void>((resolve, reject) => {
+        const child = spawn("dotnet", ["build", "-c", "Release", slnPath], {
+            stdio: ["ignore", "pipe", "pipe"],
+            windowsHide: true,
+        });
+        // Only the tail of stdout is retained so a chatty build cannot grow
+        // this buffer without bound.
+        const maxStdoutChunks = 200;
+        const stdoutChunks: string[] = [];
+        const stderrChunks: string[] = [];
+        child.stdout!.on("data", (data: Buffer) => {
+            const text = data.toString();
+            stdoutChunks.push(text);
+            if (stdoutChunks.length > maxStdoutChunks) {
+                stdoutChunks.shift();
+            }
+            opts.onProgress?.(text);
+        });
+        child.stderr!.on("data", (data: Buffer) => {
+            stderrChunks.push(data.toString());
+        });
+        child.on("error", reject);
+        // "close" (unlike "exit") fires only after the stdio pipes have been
+        // drained, so the captured output is complete when we build the error.
+        child.on("close", (code) => {
+            if (code === 0) {
+                resolve();
+                return;
+            }
+            // dotnet build prints compiler errors on stdout; fall back to it
+            // when stderr is empty so the failure reason is not lost.
+            const stderrText = stderrChunks.join("").trim();
+            const detail = stderrText || stdoutChunks.join("").trim();
+            reject(
+                new Error(`dotnet build exited with code ${code}: ${detail}`),
+            );
+        });
+    });
+}
+
+/**
+ * JSON-RPC 2.0 client over stdio for the .NET UIA helper.
+ *
+ * Requests are written to the helper's stdin as one JSON object per line and
+ * matched to responses by numeric id, so several requests may be in flight at
+ * once. Lines without a numeric id are treated as notifications; only
+ * `event.fired` is dispatched (see onEvent).
+ */
+export class HelperClient {
+    private nextId = 1;
+    private readonly pending = new Map<number, Pending>();
+    private readonly eventHandlers = new Set<EventHandler>();
+    private exited = false;
+    private exitCode: number | null = null;
+
+    private constructor(
+        private readonly child: ChildProcess,
+        private readonly stdoutLines: Interface,
+        private readonly debug: boolean,
+    ) {}
+
+    /**
+     * Spawns the helper and verifies it answers `health.ping`.
+     *
+     * @throws HelperBinaryMissingError if the resolved binary does not exist.
+     * @throws Error if the helper fails to answer the initial ping; the child
+     *   process is disposed before the error is rethrown.
+     */
+    static async start(opts: HelperClientOptions = {}): Promise<HelperClient> {
+        const binary = resolveBinary(opts);
+        if (!existsSync(binary)) {
+            throw new HelperBinaryMissingError(binary, resolveHelperSln());
+        }
+        const child = spawn(binary, [], {
+            stdio: ["pipe", "pipe", "pipe"],
+            windowsHide: true,
+        });
+        const stdoutLines = createInterface({
+            input: child.stdout!,
+            crlfDelay: Infinity,
+        });
+        const client = new HelperClient(
+            child,
+            stdoutLines,
+            opts.debug ?? false,
+        );
+        client.attach();
+        // Verify the helper is actually responding before returning.
+        try {
+            await client.ping();
+        } catch (e) {
+            // Don't leave an orphaned helper behind when startup fails.
+            await client.dispose();
+            throw e;
+        }
+        return client;
+    }
+
+    /** Wires stdout line parsing, exit/error handling and stderr forwarding. */
+    private attach(): void {
+        this.stdoutLines.on("line", (line) => this.handleLine(line));
+        this.child.on("exit", (code) => {
+            this.exited = true;
+            this.exitCode = code;
+            this.rejectAllPending(
+                (id) =>
+                    new Error(
+                        `Helper exited (code ${code}) before responding to request ${id}`,
+                    ),
+            );
+        });
+        // Without a listener, a spawn failure is an unhandled "error" event
+        // and in-flight requests would never settle.
+        this.child.on("error", (err) => {
+            if (this.debug) {
+                process.stderr.write(
+                    `[uia-helper process-error] ${err.message}\n`,
+                );
+            }
+            this.rejectAllPending(
+                (id) =>
+                    new Error(
+                        `Helper process error before responding to request ${id}: ${err.message}`,
+                    ),
+            );
+        });
+        // Write failures (e.g. EPIPE after the helper dies) are reported to
+        // the caller through the write callback in call(); this listener only
+        // keeps the stream's "error" event from being unhandled.
+        this.child.stdin!.on("error", (err) => {
+            if (this.debug) {
+                process.stderr.write(
+                    `[uia-helper stdin-error] ${err.message}\n`,
+                );
+            }
+        });
+        this.child.stderr!.on("data", (data: Buffer) => {
+            if (this.debug) {
+                process.stderr.write(`[uia-helper] ${data.toString()}`);
+            }
+        });
+    }
+
+    /** Rejects every in-flight request with an error built per request id. */
+    private rejectAllPending(makeError: (id: number) => Error): void {
+        for (const [id, p] of this.pending.entries()) {
+            p.reject(makeError(id));
+        }
+        this.pending.clear();
+    }
+
+    /**
+     * Parses one stdout line. Blank and non-JSON lines are ignored (logged in
+     * debug mode); id-less messages are notifications; anything else settles
+     * the matching pending request.
+     */
+    private handleLine(line: string): void {
+        if (!line.trim()) {
+            return;
+        }
+        let msg: {
+            id?: number | string | null;
+            method?: string;
+            params?: unknown;
+            result?: unknown;
+            error?: { code: number; message: string; data?: unknown };
+        };
+        try {
+            msg = JSON.parse(line);
+        } catch {
+            if (this.debug) {
+                process.stderr.write(`[uia-helper bad-json] ${line}\n`);
+            }
+            return;
+        }
+        const id = typeof msg.id === "number" ? msg.id : null;
+        if (id == null) {
+            // JSON-RPC notification (server → client).
+            if (msg.method === "event.fired" && msg.params) {
+                for (const h of this.eventHandlers) {
+                    try {
+                        h(msg.params as CapturedEvent);
+                    } catch (e) {
+                        // One throwing handler must not starve the others.
+                        if (this.debug) {
+                            process.stderr.write(
+                                `[uia-helper handler-throw] ${e}\n`,
+                            );
+                        }
+                    }
+                }
+            }
+            return;
+        }
+        const p = this.pending.get(id);
+        if (!p) {
+            return;
+        }
+        this.pending.delete(id);
+        if (msg.error) {
+            const err = new Error(
+                `[${msg.error.code}] ${msg.error.message}`,
+            ) as HelperRpcError;
+            err.code = msg.error.code;
+            p.reject(err);
+        } else {
+            p.resolve(msg.result);
+        }
+    }
+
+    /**
+     * Register a callback for `event.fired` notifications. Returns a
+     * disposer that removes the handler.
+     */
+    onEvent(handler: EventHandler): () => void {
+        this.eventHandlers.add(handler);
+        return () => {
+            this.eventHandlers.delete(handler);
+        };
+    }
+
+    /**
+     * Sends one JSON-RPC request and resolves with its `result`.
+     * Rejects if the helper has already exited, if the write fails, if the
+     * helper exits before answering, or with a HelperRpcError when the
+     * response carries an `error`.
+     */
+    private call<T>(method: string, params?: unknown): Promise<T> {
+        if (this.exited) {
+            return Promise.reject(
+                new Error(`Helper has exited (code ${this.exitCode})`),
+            );
+        }
+        const id = this.nextId++;
+        const req = { jsonrpc: "2.0", id, method, params };
+        return new Promise<T>((resolve, reject) => {
+            this.pending.set(id, {
+                resolve: resolve as (value: unknown) => void,
+                reject,
+            });
+            this.child.stdin!.write(JSON.stringify(req) + "\n", (err) => {
+                if (err) {
+                    this.pending.delete(id);
+                    reject(err);
+                }
+            });
+        });
+    }
+
+    ping(): Promise<{ ok: true; version: string }> {
+        return this.call("health.ping");
+    }
+
+    appLaunch(p: {
+        aumid?: string;
+        exePath?: string;
+        args?: string[];
+    }): Promise<{ pid: number; mainWindow: string }> {
+        return this.call("app.launch", p);
+    }
+
+    appAttach(p: {
+        pid?: number;
+        windowTitle?: string;
+    }): Promise<{ pid: number; mainWindow: string }> {
+        return this.call("app.attach", p);
+    }
+
+    appList(): Promise<WindowInfo[]> {
+        return this.call("app.list");
+    }
+
+    appKill(p: { pid: number }): Promise<{ ok: true }> {
+        return this.call("app.kill", p);
+    }
+
+    treeDump(p: {
+        root: string;
+        maxDepth?: number;
+        filter?: "actionable" | "all";
+    }): Promise<TreeNode> {
+        return this.call("tree.dump", p);
+    }
+
+    treeFingerprint(p: {
+        root: string;
+        dynamicRules?: DynamicControlRule[];
+    }): Promise<FingerprintResult> {
+        return this.call("tree.fingerprint", p);
+    }
+
+    screenshot(p: { root: string }): Promise<Screenshot> {
+        return this.call("screenshot", p);
+    }
+
+    doInvoke(p: { selector: string }): Promise<{ ok: true }> {
+        return this.call("do.invoke", p);
+    }
+
+    doToggle(p: {
+        selector: string;
+        value?: boolean;
+    }): Promise<{ ok: true; toggleState: string }> {
+        return this.call("do.toggle", p);
+    }
+
+    doSetValue(p: {
+        selector: string;
+        value: string | number | boolean;
+    }): Promise<{ ok: true }> {
+        return this.call("do.setValue", p);
+    }
+
+    doSelect(p: {
+        selector: string;
+        item?: string | number;
+    }): Promise<{ ok: true }> {
+        return this.call("do.select", p);
+    }
+
+    doExpand(p: { selector: string; expand: boolean }): Promise<{ ok: true }> {
+        return this.call("do.expand", p);
+    }
+
+    doScroll(p: {
+        selector: string;
+        direction: "up" | "down" | "left" | "right";
+        amount?: "small" | "large";
+    }): Promise<{ ok: true }> {
+        return this.call("do.scroll", p);
+    }
+
+    doFocus(p: { selector: string }): Promise<{ ok: true }> {
+        return this.call("do.focus", p);
+    }
+
+    doClick(p: {
+        selector: string;
+        button?: "left" | "right";
+        position?: { x?: number; y?: number };
+    }): Promise<{ ok: true }> {
+        return this.call("do.click", p);
+    }
+
+    doSendKeys(p: { selector?: string; keys: string }): Promise<{ ok: true }> {
+        return this.call("do.sendKeys", p);
+    }
+
+    find(p: {
+        selector: string;
+        timeoutMs?: number;
+    }): Promise<{ found: boolean; resolved?: string }> {
+        return this.call("find", p);
+    }
+
+    eventsIdle(
+        p: { debounceMs?: number; maxWaitMs?: number } = {},
+    ): Promise<{ ok: true; idle: boolean; waitedMs: number }> {
+        return this.call("events.idle", p);
+    }
+
+    snapshotCapture(p: {
+        snapshotDir: string;
+        policy: SnapshotPolicy;
+    }): Promise<{ snapshotId: string; bytes: number; sourceCount: number }> {
+        return this.call("snapshot.capture", p);
+    }
+
+    snapshotRestore(p: {
+        snapshotDir: string;
+        policy: SnapshotPolicy;
+    }): Promise<{ ok: true; bytes: number }> {
+        return this.call("snapshot.restore", p);
+    }
+
+    snapshotDelete(p: { snapshotDir: string }): Promise<{ ok: true }> {
+        return this.call("snapshot.delete", p);
+    }
+
+    eventsSubscribe(p: {
+        root: string;
+        eventTypes: EventType[];
+    }): Promise<{ subscriptionId: string }> {
+        return this.call("events.subscribe", p);
+    }
+
+    eventsUnsubscribe(p: { subscriptionId: string }): Promise<{ ok: boolean }> {
+        return this.call("events.unsubscribe", p);
+    }
+
+    /**
+     * Close the helper's stdin and wait up to `timeoutMs` for graceful exit.
+     * If it doesn't exit, send SIGKILL.
+     */
+    async dispose(timeoutMs = 2000): Promise<void> {
+        if (this.exited) {
+            return;
+        }
+        let timer: ReturnType<typeof setTimeout> | undefined;
+        // Subscribe before closing stdin so a fast exit cannot be missed.
+        const exited = new Promise<void>((res) =>
+            this.child.once("exit", () => res()),
+        );
+        const timeout = new Promise<void>((res) => {
+            timer = setTimeout(res, timeoutMs);
+        });
+        this.child.stdin!.end();
+        try {
+            await Promise.race([exited, timeout]);
+        } finally {
+            // A live timer would keep the event loop (and so the host
+            // process) alive for the rest of timeoutMs after a clean exit.
+            if (timer !== undefined) {
+                clearTimeout(timer);
+            }
+        }
+        if (!this.exited) {
+            this.child.kill("SIGKILL");
+        }
+    }
+}
+
+export type {
+    DynamicControlRule,
+    FingerprintResult,
+    Rect,
+    Screenshot,
+    SnapshotPolicy,
+    TreeNode,
+    WindowInfo,
+};
diff --git a/ts/packages/agents/onboarding/src/uiCapture/index.ts b/ts/packages/agents/onboarding/src/uiCapture/index.ts
new file mode 100644
index 0000000000..d1e662acc4
--- /dev/null
+++ b/ts/packages/agents/onboarding/src/uiCapture/index.ts
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// Public re-exports for runtime agents that consume uiCapture artifacts
+// (discoveredActions.json + the helper binary). Generated agents import
+// from this entry point: `import { ... } from "onboarding-agent/uiCapture"`.
+
+export {
+    HelperClient,
+    HelperBinaryMissingError,
+    buildHelperBinary,
+} from "./helperClient.js";
+export type {
+    CapturedEvent,
+    ControlSnapshot,
+    EventHandler,
+    EventType,
+    HelperClientOptions,
+    HelperRpcError,
+} from "./helperClient.js";
+
+export { executePlayback } from "./playbackExecutor.js";
+export type {
+    PlaybackExecutorOptions,
+    PlaybackParams,
+    PlaybackResult,
+    PlaybackStepResult,
+} from "./playbackExecutor.js";
+
+export type {
+    ActionVerb,
+    DynamicControlRule,
+    FingerprintResult,
+    Pattern,
+    Rect,
+    Screenshot,
+    SnapshotPolicy,
+    SnapshotSource,
+    ToggleState,
+    TreeNode,
+    WindowInfo,
+} from "./types.js";
+
+export type {
+    ParamSpec,
+    PlaybackStep,
+    SynthesizedAction,
+} from "./synthesisLlmSchema.js";
+
+export {
+    inferSnapshotPolicy,
+    loadSnapshotPolicy,
+    saveSnapshotPolicy,
+} from "./snapshotPolicy.js";
diff --git a/ts/packages/agents/onboarding/src/uiCapture/iterativeReconLlmSchema.ts b/ts/packages/agents/onboarding/src/uiCapture/iterativeReconLlmSchema.ts
new file mode 100644
index 0000000000..27bb414a6d
--- /dev/null
+++ b/ts/packages/agents/onboarding/src/uiCapture/iterativeReconLlmSchema.ts
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+// TypeChat schema for the ITERATIVE recon loop. The vision LLM gets a
+// screenshot + the actionable controls + the discoveries-so-far list at
+// every step, and decides where to drill next or when to stop.
+//
+// Loaded as text by TypeChat — keep self-contained, no runtime imports.
+
+/** Vision LLM's per-step output during iterative reconnaissance. */
+export type IterativeReconStep = {
+    /** Short label for the screen currently visible (e.g. "Alarm tab — list view",
+     * "Add alarm dialog", "Stopwatch tab — running"). Used in the recon log. */
+    currentScreenLabel: string;
+    /**
+     * Actions OBSERVED on the current screen that are not already in the
+     * `alreadyDiscovered` list. List EVERYTHING that's plausibly a user
+     * action here — including secondary features, settings panels, etc.
+     */
+    newDiscoveries: ReconAction[];
+    /** What to do next. */
+    decision: ReconDecision;
+};
+
+/**
+ * One of three next-step decisions: drill into a control, back out, or stop.
+ */
+export type ReconDecision =
+    | {
+          /** Always "click". Drill into / activate a control to see its effect. */
+          kind: "click";
+          /** Selector of the control to click. Must come from the actionable controls list shown in the input. */
+          selector: string;
+          /** "invoke" for buttons / menu items / etc; "select" for ListItems with SelectionItem pattern (tabs, list rows). */
+          verb: "invoke" | "select";
+          /** One sentence: what you expect to learn or see by clicking this. */
+          rationale: string;
+      }
+    | {
+          /**
+           * Always "back". Use to dismiss a modal/dialog/popup and return to
+           * the previous screen. PROVIDE a selector for a Cancel / Close /
+           * Back / X button visible on the current screen.
+           */
+          kind: "back";
+          /** Selector of a Cancel/Close/Back/X button visible on the current screen. Must be invokable. */
+          cancelSelector: string;
+          /** One sentence: why we're backing out (typically: "I've cataloged this dialog's fields"). */
+          rationale: string;
+      }
+    | {
+          /** Always "done". Use when you've cataloged enough — primary actions of all major sections. */
+          kind: "done";
+          /** One sentence: why exploration is complete. */
+          rationale: string;
+      };
+
+/** A user action observed during reconnaissance (cataloged, not necessarily executed). */
+export type ReconAction = {
+    /** camelCase verb-noun: createAlarm, startStopwatch, addCity, dismissNotification, signIn, etc. */
+    intentName: string;
+    /** One-sentence user-facing description of the outcome. */
+    description: string;
+    /** Parameters the user supplies. */
+    parameters: ReconParam[];
+    /** Plain-English example invocation. */
+    exampleInvocation: string;
+    /** Which tab / section of the app this action lives in. */
+    tabOrSection: string;
+    /** Whether this is the main intent of its tab ("primary") or an adjacent feature ("secondary"). */
+    priority: "primary" | "secondary";
+    /** True for delete/remove/reset/clear actions. */
+    destructive: boolean;
+};
+
+export type ReconParam = {
+    name: string;
+    type: "string" | "number" | "boolean" | "enum";
+    enumValues?: string[];
+    /** Plausible example value drawn from the visible UI (e.g., 7 for hour, "Wake up" for name). */
+    example: string | number | boolean;
+    description: string;
+};
diff --git a/ts/packages/agents/onboarding/src/uiCapture/iterativeReconnaissance.ts b/ts/packages/agents/onboarding/src/uiCapture/iterativeReconnaissance.ts
new file mode 100644
index 0000000000..f4086ce986
--- /dev/null
+++ b/ts/packages/agents/onboarding/src/uiCapture/iterativeReconnaissance.ts
@@ -0,0 +1,319 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+import { ChatModel } from "aiclient";
+import { loadSchema } from "typeagent";
+import {
+    createJsonTranslator,
+    MultimodalPromptContent,
+    TypeChatJsonTranslator,
+} from "typechat";
+import { createTypeScriptJsonValidator } from "typechat/ts";
+
+import { getReconModel } from "../lib/llm.js";
+import type { HelperClient } from "./helperClient.js";
+import type {
+    IterativeReconStep,
+    ReconAction,
+} from "./iterativeReconLlmSchema.js";
+import type { TreeNode } from "./types.js";
+
+export type IterativeReconResult = {
+    appHint: string;
+    expectedActions: ReconAction[];
+    iterationsUsed: number;
+    /** History of currentScreenLabel values per turn, in order. */
+    screenLog: string[];
+    /** Reasons the LLM gave for each decision (debugging). */
+    decisions: string[];
+};
+
+export type IterativeReconOptions = {
+    client: HelperClient;
+    rootSelector: string;
+    appHint: string;
+    model?: ChatModel;
+    maxIterations?: number;
+    /** Wait between actions; UWP NavView in particular is slow. */
+    settleMs?: number;
+};
+
+/**
+ * Multi-turn vision-driven catalog: the LLM looks at each screen, lists
+ * actions visible from it, then picks the NEXT control to drill into (or
+ * a Cancel button to back out, or "done" to stop). The reconner just
+ * executes the LLM's decision and feeds the next screenshot back.
+ *
+ * Output is a deduped ReconAction[] suitable for feeding the explore
+ * loop's goal as a TODO list.
+ *
+ * Defaults: model from getReconModel(), settleMs 1000, maxIterations 25.
+ * Failures of eventsIdle and of the chosen click/back action are tolerated;
+ * failures of treeDump or screenshot propagate to the caller.
+ */
+export async function iterativeReconnoiter(
+    opts: IterativeReconOptions,
+): Promise<IterativeReconResult> {
+    const model = opts.model ?? getReconModel();
+    const settleMs = opts.settleMs ?? 1000;
+    const maxIterations = opts.maxIterations ?? 25;
+
+    const discovered: ReconAction[] = [];
+    const screenLog: string[] = [];
+    const decisions: string[] = [];
+
+    let iter = 0;
+    for (; iter < maxIterations; iter++) {
+        try {
+            await opts.client.eventsIdle({
+                debounceMs: 500,
+                maxWaitMs: 3000,
+            });
+        } catch {
+            /* idle is best-effort */
+        }
+        const tree = await opts.client.treeDump({
+            root: opts.rootSelector,
+            maxDepth: 8,
+        });
+        const screenshot = await opts.client.screenshot({
+            root: opts.rootSelector,
+        });
+
+        const step = await askVisionStep(model, {
+            screenshot: screenshot.pngBase64,
+            tree,
+            discovered,
+            screenLog,
+            appHint: opts.appHint,
+            iteration: iter + 1,
+            budget: maxIterations,
+        });
+
+        if (!step) {
+            // Translation failed — record it and retry on the next loop turn
+            // with a fresh tree and screenshot. Each failed turn still
+            // consumes one unit of the iteration budget.
+            decisions.push(
+                "(LLM translation failed, attempting one more turn)",
+            );
+            continue;
+        }
+
+        screenLog.push(step.currentScreenLabel);
+        decisions.push(`${step.decision.kind}: ${step.decision.rationale}`);
+
+        // Merge discoveries (dedupe by intentName).
+        for (const a of step.newDiscoveries) {
+            if (!discovered.some((x) => x.intentName === a.intentName)) {
+                discovered.push(a);
+            }
+        }
+
+        process.stderr.write(
+            `[recon] iter ${iter + 1}: '${step.currentScreenLabel}' — ` +
+                `+${step.newDiscoveries.length} discoveries (total ${discovered.length}), ` +
+                `next: ${step.decision.kind}\n`,
+        );
+
+        if (step.decision.kind === "done") {
+            iter++; // count the final iteration
+            break;
+        }
+
+        try {
+            if (step.decision.kind === "click") {
+                if (step.decision.verb === "select") {
+                    await opts.client.doSelect({
+                        selector: step.decision.selector,
+                    });
+                } else {
+                    await opts.client.doInvoke({
+                        selector: step.decision.selector,
+                    });
+                }
+            } else if (step.decision.kind === "back") {
+                await opts.client.doInvoke({
+                    selector: step.decision.cancelSelector,
+                });
+            }
+        } catch (e) {
+            process.stderr.write(
+                `[recon] iter ${iter + 1} action failed: ${e instanceof Error ? e.message : e}\n`,
+            );
+            // Continue — the next turn will see whatever state the app is in.
+        }
+        if (settleMs > 0) {
+            await sleep(settleMs);
+        }
+    }
+
+    return {
+        appHint: opts.appHint,
+        expectedActions: discovered,
+        iterationsUsed: iter,
+        screenLog,
+        decisions,
+    };
+}
+
+/**
+ * Runs one LLM turn: sends the screenshot as a prior user message and the
+ * text prompt as the request. Returns null (after logging to stderr) when
+ * TypeChat translation fails.
+ */
+async function askVisionStep(
+    model: ChatModel,
+    args: {
+        screenshot: string;
+        tree: TreeNode;
+        discovered: ReconAction[];
+        screenLog: string[];
+        appHint: string;
+        iteration: number;
+        budget: number;
+    },
+): Promise<IterativeReconStep | null> {
+    const translator = makeIterativeReconTranslator(model);
+    const text = buildIterativePrompt(args);
+    const dataUrl = `data:image/png;base64,${args.screenshot}`;
+    // Put the screenshot in promptHistory as a prior user message so the
+    // model can see it. Pass the text prompt as `request` so TypeChat
+    // appends its standard schema-aware instruction wrapper.
+    const imageOnlyContent: MultimodalPromptContent[] = [
+        {
+            type: "text",
+            text: "Screenshot of the app's current screen for the next request:",
+        },
+        {
+            type: "image_url",
+            image_url: { url: dataUrl },
+        } as MultimodalPromptContent,
+    ];
+    const result = await translator.translate(text, [
+        { role: "user", content: imageOnlyContent },
+    ]);
+    if (!result.success) {
+        process.stderr.write(
+            `[recon] iter ${args.iteration} translation failed: ${result.message}\n`,
+        );
+        return null;
+    }
+    return result.data;
+}
+
+/**
+ * Builds the per-turn text prompt: strategy, the last 10 visited screens,
+ * the actions discovered so far, and the current actionable controls.
+ */
+function buildIterativePrompt(args: {
+    tree: TreeNode;
+    discovered: ReconAction[];
+    screenLog: string[];
+    appHint: string;
+    iteration: number;
+    budget: number;
+}): string {
+    const lines: string[] = [];
+    lines.push(
+        "You are cataloging a Windows desktop application's user-facing actions by clicking through it screen by screen. At each step you decide what to drill into next.",
+    );
+    lines.push("");
+    lines.push(`App: ${args.appHint}`);
+    lines.push(`Iteration: ${args.iteration} of ${args.budget}. Be efficient.`);
+    lines.push("");
+    lines.push("Strategy:");
+    lines.push(
+        "- For each screen you visit, list the user actions IT supports (createAlarm, startStopwatch, etc.).",
+    );
+    lines.push(
+        "- Drill into representative buttons to see what dialogs they open. Cataloging the FIELDS of a dialog (hour, minute, name, snooze...) means you don't actually have to commit it — back out via Cancel.",
+    );
+    lines.push(
+        "- After a tab is cataloged, navigate to the next tab. Don't waste turns repeating yourself on a screen you've already covered.",
+    );
+    lines.push(
+        "- Stop ('done') when you've cataloged the primary actions of every section the app offers — usually after visiting all top-level tabs and drilling into one creation flow per tab.",
+    );
+    lines.push(
+        "- For 'back' decisions, you must specify a cancelSelector. Look for a Cancel / Close / X / Back button on the current screen.",
+    );
+    lines.push("");
+    if (args.screenLog.length > 0) {
+        lines.push(
+            `Screens visited so far: ${args.screenLog.slice(-10).join(" → ")}`,
+        );
+        lines.push("");
+    }
+    if (args.discovered.length > 0) {
+        lines.push(`Already discovered (${args.discovered.length} action(s)):`);
+        for (const a of args.discovered) {
+            const params = a.parameters.map((p) => p.name).join(", ");
+            lines.push(
+                ` - ${a.intentName}(${params}) [${a.tabOrSection}, ${a.priority}]`,
+            );
+        }
+        lines.push("");
+    }
+    lines.push(
+        "Actionable controls on the CURRENT screen (selector, type, name, patterns):",
+    );
+    lines.push(summarizeActionableControls(args.tree));
+    lines.push("");
+    lines.push("Return an IterativeReconStep.");
+    return lines.join("\n");
+}
+
+/**
+ * Lists, one per line, every enabled on-screen node that exposes at least one
+ * pattern and has a name or automationId. Output is capped at 80 lines.
+ */
+function summarizeActionableControls(root: TreeNode): string {
+    const lines: string[] = [];
+    function walk(n: TreeNode, depth: number): void {
+        if (
+            n.patterns.length > 0 &&
+            n.isEnabled &&
+            !n.isOffscreen &&
+            (n.name || n.automationId)
+        ) {
+            // `||`, not `??`: an empty-string name must fall through to the
+            // automationId, otherwise the control is listed with a blank label.
+            const label = n.name || n.automationId || "";
+            lines.push(
+                `${" ".repeat(depth)}${n.controlType} '${truncate(label, 50)}' [${n.patterns.join(",")}] sel=${n.selector}`,
+            );
+        }
+        for (const c of n.children) walk(c, depth + 1);
+    }
+    walk(root, 0);
+    // Limit but don't truncate selectors — the LLM needs them whole.
+    return lines.slice(0, 80).join("\n");
+}
+
+/** Shortens `s` to at most `n` characters, ending in "…" when cut. */
+function truncate(s: string, n: number): string {
+    return s.length > n ? s.slice(0, n - 1) + "…" : s;
+}
+
+/** Creates a TypeChat translator validated against the IterativeReconStep schema. */
+function makeIterativeReconTranslator(
+    model: ChatModel,
+): TypeChatJsonTranslator<IterativeReconStep> {
+    const schema = loadSchema(["iterativeReconLlmSchema.ts"], import.meta.url);
+    const validator = createTypeScriptJsonValidator<IterativeReconStep>(
+        schema,
+        "IterativeReconStep",
+    );
+    return createJsonTranslator<IterativeReconStep>(model, validator);
+}
+
+async function sleep(ms: number): Promise<void> {
+    await new Promise<void>((res) => setTimeout(res, ms));
+}
+
+/**
+ * Render iterative recon output as a goal string for the explore loop.
+ */
+export function renderIterativeReconAsGoal(
+    recon: IterativeReconResult,
+): string {
+    const lines: string[] = [];
+    lines.push(
+        `Drive ${recon.appHint} through these specific user actions discovered during reconnaissance. Work through them in order; multi-step tasks (open dialog, fill fields, save) are normal. Skip and move on if a task gets stuck. Avoid actions marked DESTRUCTIVE.`,
+    );
+    lines.push("");
+    let i = 1;
+    for (const a of recon.expectedActions) {
+        const params = a.parameters
+            .map((p) => `${p.name}=${JSON.stringify(p.example)}`)
+            .join(", ");
+        const dest = a.destructive ? " [DESTRUCTIVE — skip]" : "";
+        lines.push(
+            `${i}. ${a.intentName}(${params}) on ${a.tabOrSection} — ${a.description}${dest}`,
+        );
+        i++;
+    }
+    lines.push("");
+    lines.push(
+        "After each action, observe the result and move to the next. If you've completed all of these, choose 'stop'.",
+    );
+    return lines.join("\n");
+}
diff --git a/ts/packages/agents/onboarding/src/uiCapture/llmOracle.ts b/ts/packages/agents/onboarding/src/uiCapture/llmOracle.ts
new file mode 100644
index 0000000000..5a6f72635c
--- /dev/null
+++ b/ts/packages/agents/onboarding/src/uiCapture/llmOracle.ts
@@ -0,0 +1,161 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+ +import { ChatModel } from "aiclient"; +import { loadSchema } from "typeagent"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { getExploreModel } from "../lib/llm.js"; +import type { + DecisionInput, + DecisionOracle, + ExploreDecision, + FrontierItem, +} from "./exploreTypes.js"; +import type { ExploreDecision as LlmExploreDecision } from "./exploreLlmSchema.js"; + +const SYSTEM_PROMPT = `You are exploring a Windows desktop application's UI to discover the user-facing actions it offers. Your goal is to drive the app via the controls on its current frontier and observe how the app's state changes. + +Ground rules: +- Pick the action most likely to reveal a NEW state (one we haven't visited). +- Avoid destructive actions (delete/remove/reset/clear) unless the goal requires it. +- Avoid window-management actions (close, minimize, maximize, app-switch). +- Prefer Buttons/MenuItems/ListItems over scrollbars or text fields unless the goal targets them. +- If the same frontier has been picked over and explored without yielding new states, choose "stop" or "restore". +- expectedDelta should be a short prediction in plain English; we'll compare it against what actually happens. + +Modal/popup handling: +- If the current state appears to be a modal dialog or popup (controls in a Popup/Dialog/Flyout container, an overlay, or a fixed-position card), recognize it. +- When a popup is in the way of your goal, dismiss it: look for Cancel / Close / "X" / Back buttons in the popup's controls, and click them. +- When a popup IS the goal (e.g., a "Save alarm" dialog where you've set fields), commit it via Save / OK / Confirm rather than Cancel. 
+- Don't get stuck repeating the same setValue on a control that didn't change state — try a sibling control, or dismiss and re-approach.`; + +export type LlmOracleOptions = { + goal: string; + model?: ChatModel; + /** + * Maximum number of consecutive translation failures before giving up + * with a "stop" decision. + */ + maxRetries?: number; +}; + +export class LlmOracle implements DecisionOracle { + private readonly translator: TypeChatJsonTranslator; + private readonly goal: string; + private readonly maxRetries: number; + private consecutiveFailures = 0; + + constructor(opts: LlmOracleOptions) { + this.goal = opts.goal; + this.maxRetries = opts.maxRetries ?? 2; + const model = opts.model ?? getExploreModel(); + const schema = loadSchema(["exploreLlmSchema.ts"], import.meta.url); + const validator = createTypeScriptJsonValidator( + schema, + "ExploreDecision", + ); + this.translator = createJsonTranslator( + model, + validator, + ); + } + + async decide(input: DecisionInput): Promise { + const prompt = this.buildPrompt(input); + const result = await this.translator.translate(prompt); + if (!result.success) { + this.consecutiveFailures++; + if (this.consecutiveFailures >= this.maxRetries) { + return { + kind: "stop", + reason: `LLM oracle: ${this.consecutiveFailures} consecutive translation failures (last: ${result.message})`, + }; + } + // Soft fallback: pick the first non-destructive frontier item. 
+ const fallback = input.frontier.find( + (f) => !f.destructiveHint && f.verbs.length > 0, + ); + if (!fallback) { + return { + kind: "stop", + reason: `LLM translation failed and no fallback available: ${result.message}`, + }; + } + const verb = fallback.verbs[0]!.verb; + return { + kind: "act", + frontierId: fallback.id, + verb, + expectedDelta: "(fallback after LLM failure)", + rationale: `fallback: ${result.message}`, + }; + } + this.consecutiveFailures = 0; + return result.data as ExploreDecision; + } + + private buildPrompt(input: DecisionInput): string { + const lines: string[] = []; + lines.push(SYSTEM_PROMPT); + lines.push(""); + lines.push(`Goal: ${this.goal}`); + lines.push(""); + lines.push( + `Iteration: ${input.iteration} (remaining: ${input.budget.remainingIterations}; budget ${input.budget.remainingMs}ms)`, + ); + lines.push( + `Active state: ${input.state.id} '${input.state.windowTitle}'`, + ); + lines.push( + `Visited states: ${input.visitedStates.length} (ids: ${input.visitedStates + .slice(-8) + .map((s) => s.id) + .join(", ")})`, + ); + lines.push(""); + lines.push("Frontier:"); + if (input.frontier.length === 0) { + lines.push(" (empty — no actionable controls in this state)"); + } else { + for (const f of input.frontier.slice(0, 60)) { + lines.push(" " + renderFrontierItem(f)); + } + if (input.frontier.length > 60) { + lines.push(` ... and ${input.frontier.length - 60} more`); + } + } + lines.push(""); + if (input.recentTransitions.length > 0) { + lines.push("Recent actions:"); + for (const t of input.recentTransitions) { + const arrow = t.success ? "→" : "✗"; + const noChange = + t.fromStateId === t.toStateId ? " (no change)" : ""; + lines.push( + ` iter ${t.iteration}: ${t.fromStateId} ${arrow} ${t.trigger.verb} ${t.trigger.selector.split("/").pop()} → ${t.toStateId}${noChange}`, + ); + } + lines.push(""); + } + lines.push( + "Decide your next action. 
Output strictly matches the ExploreDecision schema.", + ); + return lines.join("\n"); + } +} + +function renderFrontierItem(f: FrontierItem): string { + const id = `[${f.id}]`; + const ct = f.controlType; + const label = f.name ?? f.automationId ?? f.className ?? ""; + const aid = f.automationId ? ` aid=${f.automationId}` : ""; + const verbs = f.verbs.map((v) => v.verb).join(","); + const dest = f.destructiveHint ? " (destructive!)" : ""; + return `${id} ${ct} '${truncate(label, 40)}'${aid} verbs:${verbs}${dest}`; +} + +function truncate(s: string, n: number): string { + return s.length > n ? s.slice(0, n - 1) + "…" : s; +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/playbackExecutor.ts b/ts/packages/agents/onboarding/src/uiCapture/playbackExecutor.ts new file mode 100644 index 0000000000..a3ca8567f8 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/playbackExecutor.ts @@ -0,0 +1,182 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import type { HelperClient } from "./helperClient.js"; +import type { PlaybackStep, SynthesizedAction } from "./synthesisLlmSchema.js"; + +export type PlaybackParams = Record; + +export type PlaybackStepResult = { + stepIndex: number; + selector: string; + verb: string; + value?: string | number | boolean; + success: boolean; + errorMessage?: string; + durationMs: number; +}; + +export type PlaybackResult = { + actionName: string; + success: boolean; + steps: PlaybackStepResult[]; + failedAtStep?: number; +}; + +export type PlaybackExecutorOptions = { + client: HelperClient; + /** Wait between steps that don't have waitForIdle set explicitly. */ + defaultIdleDebounceMs?: number; + defaultIdleMaxWaitMs?: number; + /** Stop on first failed step. Default true. */ + stopOnError?: boolean; +}; + +/** + * Replay a synthesized action against the helper. + * + * Resolves `valueRef` references against the supplied `params` map. 
/**
 * Replay a synthesized action against the helper.
 *
 * Resolves `valueRef` references against the supplied `params` map. For each
 * step, dispatches the verb to the appropriate `do.*` RPC. Optionally waits
 * for UIA idle between steps.
 *
 * A step whose value cannot be resolved (missing parameter) is recorded as a
 * failed step, exactly like a failed RPC, so the caller receives a
 * `PlaybackResult` with `failedAtStep` set instead of a rejected promise.
 *
 * @param action Synthesized action whose `playback` steps are replayed in order.
 * @param params Values substituted for `valueRef` placeholders.
 * @param opts Helper client plus idle-wait and stop-on-error settings.
 * @returns Per-step results; `success` is false as soon as any step failed.
 */
export async function executePlayback(
    action: SynthesizedAction,
    params: PlaybackParams,
    opts: PlaybackExecutorOptions,
): Promise<PlaybackResult> {
    const stopOnError = opts.stopOnError ?? true;
    const debounceMs = opts.defaultIdleDebounceMs ?? 600;
    const maxWaitMs = opts.defaultIdleMaxWaitMs ?? 4000;
    const stepResults: PlaybackStepResult[] = [];
    let success = true;
    let failedAtStep: number | undefined;

    for (let i = 0; i < action.playback.length; i++) {
        const step = action.playback[i]!;
        const start = Date.now();
        let value: string | number | boolean | undefined;
        let stepSuccess = true;
        let errorMessage: string | undefined;

        try {
            // Value resolution lives inside the try so that a missing
            // parameter is reported through the step result as well.
            value = resolveValue(step, params);
            await executeStep(opts.client, step, value);
        } catch (e) {
            stepSuccess = false;
            errorMessage = e instanceof Error ? e.message : String(e);
        }

        const stepResult: PlaybackStepResult = {
            stepIndex: i,
            selector: step.selector,
            verb: step.verb,
            ...(value !== undefined ? { value } : {}),
            success: stepSuccess,
            ...(errorMessage !== undefined ? { errorMessage } : {}),
            durationMs: Date.now() - start,
        };
        stepResults.push(stepResult);

        if (!stepSuccess) {
            success = false;
            if (failedAtStep === undefined) failedAtStep = i;
            if (stopOnError) break;
        }

        // Wait for idle after this step unless explicitly disabled.
        const wait = step.waitForIdle ?? defaultWait(step.verb);
        if (wait) {
            try {
                await opts.client.eventsIdle({ debounceMs, maxWaitMs });
            } catch {
                /* idle is best-effort */
            }
        }
    }

    return {
        actionName: action.actionName,
        success,
        steps: stepResults,
        ...(failedAtStep !== undefined ? { failedAtStep } : {}),
    };
}

/** Default idle-wait policy for a step that does not set `waitForIdle`. */
function defaultWait(verb: string): boolean {
    // Verbs that typically open/close dialogs or transition panels need an idle wait.
    return verb === "invoke" || verb === "select";
}

/**
 * Compute the concrete value for a step.
 *
 * `valueRef` takes precedence over `valueLiteral`; returns undefined when the
 * step carries neither.
 *
 * @throws Error when `valueRef` names a parameter that is absent from `params`.
 */
function resolveValue(
    step: PlaybackStep,
    params: PlaybackParams,
): string | number | boolean | undefined {
    if (step.valueRef !== undefined) {
        // Accept either "${name}" or bare "name".
        const m = step.valueRef.match(/^\$\{(.+)\}$/);
        const key = m ? m[1]! : step.valueRef;
        const v = params[key];
        if (v === undefined) {
            throw new Error(`Missing parameter '${key}' for valueRef`);
        }
        return v;
    }
    if (step.valueLiteral !== undefined) {
        return step.valueLiteral;
    }
    return undefined;
}

/**
 * Dispatch one playback step to the matching `do*` call on the helper client.
 *
 * @throws Error for `setValue` without a value and for verbs with no mapping.
 */
async function executeStep(
    client: HelperClient,
    step: PlaybackStep,
    value: string | number | boolean | undefined,
): Promise<void> {
    switch (step.verb) {
        case "invoke":
            await client.doInvoke({ selector: step.selector });
            break;
        case "click":
            await client.doClick({ selector: step.selector });
            break;
        case "focus":
            await client.doFocus({ selector: step.selector });
            break;
        case "toggle":
            // Only a boolean value is forwarded; anything else is dropped.
            await client.doToggle({
                selector: step.selector,
                ...(typeof value === "boolean" ? { value } : {}),
            });
            break;
        case "expand":
            // Non-boolean or missing value means "expand".
            await client.doExpand({
                selector: step.selector,
                expand: typeof value === "boolean" ? value : true,
            });
            break;
        case "select":
            await client.doSelect({
                selector: step.selector,
                ...(value !== undefined && typeof value !== "boolean"
                    ? { item: value as string | number }
                    : {}),
            });
            break;
        case "setValue":
            if (value === undefined) {
                throw new Error(`setValue step requires a value`);
            }
            await client.doSetValue({
                selector: step.selector,
                value,
            });
            break;
        case "scroll":
            // The scroll direction is fixed; the step value is not consulted.
            await client.doScroll({
                selector: step.selector,
                direction: "down",
            });
            break;
        default:
            throw new Error(`Unsupported playback verb: ${step.verb}`);
    }
}

// ---- ts/packages/agents/onboarding/src/uiCapture/reconLlmSchema.ts (new file) ----
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

// TypeChat schema for vision-driven reconnaissance: the LLM looks at a
// screenshot of one tab/screen + its filtered control tree and produces
// a description of what the screen does, plus a list of plausible user
// actions it offers. Loaded as text by TypeChat — keep self-contained,
// no runtime imports.

/** Reconnaissance findings for one tab/screen of the app. */
export type TabRecon = {
    /** Short descriptive label for this tab/screen, e.g. "Alarm", "Stopwatch". */
    tabLabel: string;
    /** One-sentence summary of what this tab does. */
    purpose: string;
    /** User-facing actions the tab supports. List the PLAUSIBLE actions —
     * things a user is likely to want to do here, regardless of whether
     * the screenshot shows the action mid-execution. */
    expectedActions: ExpectedAction[];
    /** Optional: things that look interesting but you're unsure about
     * (ambiguous controls, settings buttons whose effect isn't obvious). */
    uncertain?: string[];
};

export type ExpectedAction = {
    /** camelCase verb-noun, e.g. "createAlarm", "startStopwatch", "addCity". */
    intentName: string;
    /** One-sentence description of the user-facing outcome. */
    description: string;
    /** Parameters the user supplies. Empty array if the action takes none. */
    parameters: ExpectedParam[];
    /** Plain-English example invocation, e.g. "Create alarm 'Morning' at 7:00 AM". */
    exampleInvocation: string;
    /** "primary" = main reason for this tab; "secondary" = adjacent feature. */
    priority: "primary" | "secondary";
    /** Is this destructive (delete/clear/reset)? */
    destructive: boolean;
};

export type ExpectedParam = {
    /** camelCase, e.g. "name", "minutes", "city". */
    name: string;
    /** "string" | "number" | "boolean" | "enum". */
    type: "string" | "number" | "boolean" | "enum";
    /** When type is "enum", allowed values. */
    enumValues?: string[];
    /** Plausible example value (your best guess from the screenshot). */
    example: string | number | boolean;
    /** One short sentence describing what this parameter controls. */
    description: string;
};

// ---- ts/packages/agents/onboarding/src/uiCapture/recorder.ts (new file) ----
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import { createWriteStream, mkdirSync, WriteStream } from "node:fs";
import path from "node:path";

import type { CapturedEvent, EventType, HelperClient } from "./helperClient.js";

const DEFAULT_EVENT_TYPES: EventType[] = [
    "Invoked",
    "ValueChanged",
    "ToggleStateChanged",
    "StructureChanged",
];

/**
 * Subscribes to UIA events on a target window and writes each captured event
 * as one JSONL line to `<workspaceDir>/recordings/<sessionId>/transitions.jsonl`.
 *
 * No source-attribution (agent vs. user) here — slice 5 just captures
 * everything. Tagging by initiator comes with the explore loop in slice 6.
 */
+ */ +export class Recorder { + private readonly writer: WriteStream; + private readonly removeHandler: () => void; + private subscriptionId: string | null = null; + private eventCount = 0; + private stopped = false; + + private constructor( + private readonly client: HelperClient, + public readonly sessionDir: string, + ) { + mkdirSync(sessionDir, { recursive: true }); + this.writer = createWriteStream( + path.join(sessionDir, "transitions.jsonl"), + { flags: "a" }, + ); + this.removeHandler = client.onEvent((evt) => this.handle(evt)); + } + + static async start(opts: { + client: HelperClient; + workspaceDir: string; + sessionId?: string; + root: string; + eventTypes?: EventType[]; + }): Promise { + const sessionId = + opts.sessionId ?? new Date().toISOString().replace(/[:.]/g, "-"); + const sessionDir = path.join( + opts.workspaceDir, + "recordings", + sessionId, + ); + const recorder = new Recorder(opts.client, sessionDir); + const sub = await opts.client.eventsSubscribe({ + root: opts.root, + eventTypes: opts.eventTypes ?? 
DEFAULT_EVENT_TYPES, + }); + recorder.subscriptionId = sub.subscriptionId; + return recorder; + } + + private handle(evt: CapturedEvent): void { + if (this.stopped) { + return; + } + if (this.subscriptionId && evt.subscriptionId !== this.subscriptionId) { + return; + } + this.eventCount++; + this.writer.write(JSON.stringify(evt) + "\n"); + } + + get count(): number { + return this.eventCount; + } + + async stop(): Promise<{ eventCount: number; sessionDir: string }> { + if (this.stopped) { + return { eventCount: this.eventCount, sessionDir: this.sessionDir }; + } + this.stopped = true; + if (this.subscriptionId) { + try { + await this.client.eventsUnsubscribe({ + subscriptionId: this.subscriptionId, + }); + } catch { + /* helper may already be down */ + } + } + this.removeHandler(); + await new Promise((res) => this.writer.end(() => res())); + return { eventCount: this.eventCount, sessionDir: this.sessionDir }; + } +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/scaffoldUiAgent.ts b/ts/packages/agents/onboarding/src/uiCapture/scaffoldUiAgent.ts new file mode 100644 index 0000000000..6a02350ca9 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/scaffoldUiAgent.ts @@ -0,0 +1,423 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +// Scaffolder: takes a workspace's discoveredActions.json and emits a runtime +// TypeAgent agent package that can replay each action via the helper. + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import type { SynthesizedAction } from "./synthesisLlmSchema.js"; + +export type ScaffoldOptions = { + /** Source discoveredActions.json (typically /discoveredActions.json). */ + discoveredActionsPath: string; + /** Output package directory (typically `ts/packages/agents//`). */ + targetDir: string; + /** camelCase integration name (e.g., "windowsClock"). 
*/ + integrationName: string; + /** Human-readable description for the agent manifest. */ + description?: string; + /** Emoji shown next to the agent in the shell. */ + emoji?: string; + /** AUMID (UWP) or absolute exePath for app auto-launch. */ + appLaunch: { aumid?: string; exePath?: string }; + /** + * Substring to match against window titles when probing for an + * already-running instance of the app (UWP apps can't be launched twice). + * Defaults to the integrationName. + */ + appTitleMatch?: string; +}; + +export function scaffoldUiAgent(opts: ScaffoldOptions): void { + if (!existsSync(opts.discoveredActionsPath)) { + throw new Error( + `discoveredActions.json not found at ${opts.discoveredActionsPath}`, + ); + } + const discovered = JSON.parse( + readFileSync(opts.discoveredActionsPath, "utf8"), + ) as { actions: SynthesizedAction[] }; + if (!Array.isArray(discovered.actions) || discovered.actions.length === 0) { + throw new Error("discoveredActions.json has no actions"); + } + + const name = opts.integrationName; + const cap = capitalize(name); + const description = + opts.description ?? `${cap} agent — UI Automation playback.`; + const emoji = opts.emoji ?? "⚙️"; + + mkdirSync(path.join(opts.targetDir, "src"), { recursive: true }); + mkdirSync(path.join(opts.targetDir, "data"), { recursive: true }); + + // Copy discoveredActions.json into data/. + writeFileSync( + path.join(opts.targetDir, "data", "discoveredActions.json"), + JSON.stringify(discovered, null, 2), + ); + + // Schema (TS action union + per-action types). + writeFileSync( + path.join(opts.targetDir, "src", `${name}Schema.ts`), + renderSchema(name, cap, discovered.actions), + ); + + // Manifest. + writeFileSync( + path.join(opts.targetDir, "src", `${name}Manifest.json`), + renderManifest(name, cap, description, emoji), + ); + + // ActionHandler. + const appTitleMatch = opts.appTitleMatch ?? 
name; + writeFileSync( + path.join(opts.targetDir, "src", `${name}ActionHandler.ts`), + renderActionHandler(name, cap, opts.appLaunch, appTitleMatch), + ); + + // package.json + tsconfigs. + writeFileSync( + path.join(opts.targetDir, "package.json"), + renderPackageJson(name), + ); + writeFileSync( + path.join(opts.targetDir, "tsconfig.json"), + renderRootTsconfig(), + ); + writeFileSync( + path.join(opts.targetDir, "src", "tsconfig.json"), + renderSrcTsconfig(), + ); +} + +function renderSchema( + name: string, + cap: string, + actions: SynthesizedAction[], +): string { + const lines: string[] = []; + lines.push("// Copyright (c) Microsoft Corporation."); + lines.push("// Licensed under the MIT License."); + lines.push(""); + lines.push(`export type ${cap}Action =`); + for (let i = 0; i < actions.length; i++) { + const last = i === actions.length - 1; + lines.push(` | ${actionTypeName(actions[i]!)}${last ? ";" : ""}`); + } + lines.push(""); + + for (const a of actions) { + // The action-schema-compiler doesn't accept /** ... */ blocks; use //. + for (const dl of a.description.split("\n")) { + lines.push(`// ${dl}`); + } + lines.push(`export type ${actionTypeName(a)} = {`); + lines.push(` actionName: "${a.actionName}";`); + if (a.parameters.length === 0) { + lines.push(` parameters: {};`); + } else { + lines.push(` parameters: {`); + for (const p of a.parameters) { + if (p.description) { + lines.push( + ` // ${p.description.replace(/\n/g, " ")}`, + ); + } + lines.push( + ` ${p.name}${p.examples.length === 0 ? "?" 
: ""}: ${tsType(p)};`, + ); + } + lines.push(` };`); + } + lines.push(`};`); + lines.push(""); + } + return lines.join("\n"); +} + +function actionTypeName(a: SynthesizedAction): string { + return `${capitalize(a.actionName)}Action`; +} + +function tsType(p: SynthesizedAction["parameters"][number]): string { + if (p.type === "enum" && p.enumValues && p.enumValues.length > 0) { + return p.enumValues.map((v) => JSON.stringify(v)).join(" | "); + } + switch (p.type) { + case "string": + return "string"; + case "number": + return "number"; + case "boolean": + return "boolean"; + default: + return "string"; + } +} + +function renderManifest( + name: string, + cap: string, + description: string, + emoji: string, +): string { + return JSON.stringify( + { + emojiChar: emoji, + description, + schema: { + description, + originalSchemaFile: `./${name}Schema.ts`, + schemaFile: `../dist/${name}Schema.pas.json`, + schemaType: { + action: `${cap}Action`, + }, + }, + }, + null, + 2, + ); +} + +function renderActionHandler( + name: string, + cap: string, + appLaunch: ScaffoldOptions["appLaunch"], + appTitleMatch: string, +): string { + const launchArg = appLaunch.aumid + ? `{ aumid: ${JSON.stringify(appLaunch.aumid)} }` + : `{ exePath: ${JSON.stringify(appLaunch.exePath ?? "")} }`; + const titleMatchLit = JSON.stringify(appTitleMatch); + return `// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +import { + ActionContext, + AppAgent, + SessionContext, + TypeAgentAction, +} from "@typeagent/agent-sdk"; +import { + createActionResultFromError, + createActionResultFromTextDisplay, +} from "@typeagent/agent-sdk/helpers/action"; +import { + executePlayback, + HelperClient, + SynthesizedAction, +} from "onboarding-agent/uiCapture"; +import { readFileSync } from "node:fs"; +import path from "node:path"; +import { fileURLToPath } from "node:url"; + +import { ${cap}Action } from "./${name}Schema.js"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const APP_TITLE_MATCH = ${titleMatchLit}; + +type DiscoveredActionsFile = { actions: SynthesizedAction[] }; + +let cachedActions: DiscoveredActionsFile | null = null; +function loadDiscoveredActions(): DiscoveredActionsFile { + if (cachedActions) return cachedActions; + // From dist/${name}ActionHandler.js, the data dir is two levels up + /data/. + const file = path.resolve( + __dirname, + "..", + "data", + "discoveredActions.json", + ); + cachedActions = JSON.parse( + readFileSync(file, "utf8"), + ) as DiscoveredActionsFile; + return cachedActions; +} + +type AgentState = { + client: HelperClient | null; + appPid: number | null; + appMainWindow: string | null; +}; + +async function ensureClient(state: AgentState): Promise { + if (!state.client) { + state.client = await HelperClient.start(); + } + return state.client; +} + +async function ensureAppRunning(state: AgentState): Promise { + const client = await ensureClient(state); + if (state.appPid !== null) { + // Verify still running. + const list = await client.appList(); + const found = list.find((w) => w.pid === state.appPid); + if (found) { + state.appMainWindow = found.mainWindow; + return; + } + state.appPid = null; + state.appMainWindow = null; + } + // The app may already be running from a prior invocation (each CLI / agent + // call gets a fresh AgentState). 
Probe app.list for an existing window + // matching APP_TITLE_MATCH before launching — UWP apps can't be launched + // twice and FlaUI returns "no main window" when they are. + const existing = await client.appList(); + const match = existing.find((w) => + w.title.toLowerCase().includes(APP_TITLE_MATCH.toLowerCase()), + ); + if (match) { + state.appPid = match.pid; + state.appMainWindow = match.mainWindow; + await client.eventsIdle({ debounceMs: 600, maxWaitMs: 3000 }); + return; + } + const launch = await client.appLaunch(${launchArg}); + state.appPid = launch.pid; + state.appMainWindow = launch.mainWindow; + await client.eventsIdle({ debounceMs: 800, maxWaitMs: 5000 }); +} + +export function instantiate(): AppAgent { + return { + async initializeAgentContext() { + return { + client: null, + appPid: null, + appMainWindow: null, + } as AgentState; + }, + async updateAgentContext( + _enable: boolean, + _context: SessionContext, + _schemaName: string, + ) { + // No per-session work needed; the helper is launched lazily. + }, + async executeAction( + action: TypeAgentAction<${cap}Action>, + context: ActionContext, + ) { + const state = context.sessionContext.agentContext; + const def = loadDiscoveredActions().actions.find( + (a) => a.actionName === action.actionName, + ); + if (!def) { + return createActionResultFromError( + \`No discovered action named '\${action.actionName}'\`, + ); + } + try { + await ensureAppRunning(state); + const client = await ensureClient(state); + const result = await executePlayback( + def, + (action.parameters ?? {}) as Record< + string, + string | number | boolean + >, + { + client, + defaultIdleDebounceMs: 700, + defaultIdleMaxWaitMs: 4000, + }, + ); + if (!result.success) { + const failed = result.steps[result.failedAtStep ?? 0]; + return createActionResultFromError( + \`Playback failed at step \${(result.failedAtStep ?? 0) + 1}: \${failed?.errorMessage ?? 
"unknown"}\`, + ); + } + return createActionResultFromTextDisplay( + \`Done: \${action.actionName} (\${result.steps.length} steps)\`, + ); + } catch (e) { + return createActionResultFromError( + e instanceof Error ? e.message : String(e), + ); + } + }, + async closeAgentContext(context: SessionContext) { + const state = context.agentContext; + if (state.client) { + await state.client.dispose(); + state.client = null; + } + }, + }; +} +`; +} + +function renderPackageJson(name: string): string { + return JSON.stringify( + { + name: `${name}-agent`, + version: "0.0.1", + private: true, + description: `${name} TypeAgent — UI Automation playback`, + type: "module", + exports: { + "./agent/manifest": `./src/${name}Manifest.json`, + "./agent/handlers": `./dist/${name}ActionHandler.js`, + }, + scripts: { + "asc:main": `asc -i ./src/${name}Schema.ts -o ./dist/${name}Schema.pas.json -t ${capitalize(name)}Action`, + build: `concurrently npm:tsc npm:asc:*`, + clean: "rimraf --glob dist *.tsbuildinfo *.done.build.log", + tsc: "tsc -b", + }, + dependencies: { + "@typeagent/agent-sdk": "workspace:*", + "onboarding-agent": "workspace:*", + }, + devDependencies: { + "@typeagent/action-schema-compiler": "workspace:*", + concurrently: "^9.1.2", + rimraf: "^6.0.1", + typescript: "~5.4.5", + }, + engines: { + node: ">=20", + }, + }, + null, + 2, + ); +} + +function renderRootTsconfig(): string { + return JSON.stringify( + { + extends: "../../../tsconfig.base.json", + compilerOptions: { composite: true }, + include: [], + references: [{ path: "./src" }], + }, + null, + 2, + ); +} + +function renderSrcTsconfig(): string { + return JSON.stringify( + { + extends: "../../../../tsconfig.base.json", + compilerOptions: { + composite: true, + rootDir: ".", + outDir: "../dist", + }, + include: ["./**/*"], + }, + null, + 2, + ); +} + +function capitalize(s: string): string { + return s.length === 0 ? 
s : s[0]!.toUpperCase() + s.slice(1); +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/snapshotPolicy.ts b/ts/packages/agents/onboarding/src/uiCapture/snapshotPolicy.ts new file mode 100644 index 0000000000..6cc02f5477 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/snapshotPolicy.ts @@ -0,0 +1,153 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { execFile } from "node:child_process"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import { promisify } from "node:util"; + +import type { + DetectionStatus, + ProcessIdentity, + SnapshotPolicy, + SnapshotSource, +} from "./types.js"; + +const execFileAsync = promisify(execFile); + +/** + * Resolve PackageFamilyName for a UWP package name (the part of the AUMID + * before the underscore). Returns null if the package isn't installed. + */ +async function getPackageFamilyName( + packageName: string, +): Promise { + try { + const { stdout } = await execFileAsync( + "powershell.exe", + [ + "-NoProfile", + "-NonInteractive", + "-Command", + `(Get-AppxPackage -Name '${packageName}' | Select-Object -First 1).PackageFamilyName`, + ], + { timeout: 10_000 }, + ); + const pfn = stdout.trim(); + return pfn.length > 0 ? pfn : null; + } catch { + return null; + } +} + +/** + * Auto-detect a snapshot policy for a UWP app by AUMID. + * AUMID format: `_!` — + * e.g., `Microsoft.WindowsAlarms_8wekyb3d8bbwe!App`. + */ +export async function inferSnapshotPolicy(opts: { + integrationName: string; + aumid?: string; + exePath?: string; +}): Promise { + const policy: SnapshotPolicy = { + version: 1, + integrationName: opts.integrationName, + detectionStatus: "auto-candidate", + processIdentity: { + ...(opts.aumid !== undefined ? { aumid: opts.aumid } : {}), + ...(opts.exePath !== undefined ? 
{ exePath: opts.exePath } : {}), + }, + state: [], + }; + + if (opts.aumid) { + const packageName = opts.aumid.split("_")[0]!; + const pfn = await getPackageFamilyName(packageName); + if (pfn) { + const localAppData = process.env.LOCALAPPDATA ?? ""; + const baseDir = path.join(localAppData, "Packages", pfn); + for (const sub of ["LocalState", "Settings", "RoamingState"]) { + const candidate = path.join(baseDir, sub); + if (existsSync(candidate)) { + const folderSource: SnapshotSource = { + kind: "folder", + path: candidate, + recursive: true, + }; + policy.state.push(folderSource); + } + } + const processName = inferProcessName(packageName); + if (processName !== undefined) { + policy.processIdentity.processName = processName; + } + } + } else if (opts.exePath) { + // Win32 fallback. We don't currently auto-discover state directories + // for Win32 apps; the user is expected to fill in the policy. + policy.processIdentity.processName = path.basename(opts.exePath); + } + + if (policy.state.length === 0) { + policy.detectionStatus = "auto-candidate"; + } + return policy; +} + +/** + * Best-effort process-name guess for known packages. The package name doesn't + * always match the executable name; this table is small for now and grows as + * we onboard real apps. 
/**
 * Best-effort process-name guess for known packages. The package name doesn't
 * always match the executable name; this table is small for now and grows as
 * we onboard real apps.
 *
 * @param packageName UWP package name (the part of the AUMID before the underscore).
 * @returns The executable name, or undefined when the package is not in the table.
 */
function inferProcessName(packageName: string): string | undefined {
    // A Map rather than an object literal, so that names such as
    // "constructor" or "toString" cannot resolve to Object.prototype members.
    const known = new Map<string, string>([
        ["Microsoft.WindowsAlarms", "Time.exe"],
        ["Microsoft.WindowsCalculator", "CalculatorApp.exe"],
    ]);
    return known.get(packageName);
}

/**
 * Read `snapshotPolicy.json` from the workspace directory.
 *
 * @returns The parsed policy, or null when the file does not exist.
 */
export function loadSnapshotPolicy(
    workspaceDir: string,
): SnapshotPolicy | null {
    const file = path.join(workspaceDir, "snapshotPolicy.json");
    if (!existsSync(file)) {
        return null;
    }
    return JSON.parse(readFileSync(file, "utf8")) as SnapshotPolicy;
}

/**
 * Write the policy to `snapshotPolicy.json` in the workspace directory,
 * creating the directory first if needed.
 */
export function saveSnapshotPolicy(
    workspaceDir: string,
    policy: SnapshotPolicy,
): void {
    mkdirSync(workspaceDir, { recursive: true });
    writeFileSync(
        path.join(workspaceDir, "snapshotPolicy.json"),
        JSON.stringify(policy, null, 2),
    );
}

/**
 * Mark a policy as confirmed by user review.
 *
 * Returns a shallow copy; the input policy object is not modified.
 */
export function approveSnapshotPolicy(policy: SnapshotPolicy): SnapshotPolicy {
    return { ...policy, detectionStatus: "user-confirmed" as DetectionStatus };
}

/**
 * Build an empty policy declaring an integration has no persisted state.
 */
export function makeStatelessPolicy(
    integrationName: string,
    processIdentity: ProcessIdentity = {},
): SnapshotPolicy {
    return {
        version: 1,
        integrationName,
        detectionStatus: "no-state",
        processIdentity,
        state: [],
    };
}

// ---- ts/packages/agents/onboarding/src/uiCapture/stubOracle.ts (new file) ----
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

import type {
    DecisionInput,
    DecisionOracle,
    ExploreDecision,
} from "./exploreTypes.js";

/**
 * Deterministic test oracle. Picks the first non-destructive frontier item,
 * preferring `invoke` and `select` verbs. Stops when the frontier is empty
 * or when a maxDecisions cap is hit.
 *
 * Frontier items that expose no verbs are never picked, because there is
 * nothing to perform on them.
 *
 * Useful for proving the loop mechanics without LLM calls. Slice 6b replaces
 * this with a typechat-backed oracle.
 */
export class StubOracle implements DecisionOracle {
    private decisions = 0;

    constructor(
        private readonly opts: {
            maxDecisions?: number;
            preferVerbs?: string[];
        } = {},
    ) {}

    /**
     * Choose the next action from `input.frontier`.
     *
     * @returns An "act" decision for the chosen item, or a "stop" decision
     * when the decision cap is reached or no usable candidate remains.
     */
    async decide(input: DecisionInput): Promise<ExploreDecision> {
        const cap = this.opts.maxDecisions ?? 5;
        if (this.decisions >= cap) {
            return { kind: "stop", reason: `stub-cap (${cap})` };
        }
        const preferred = this.opts.preferVerbs ?? ["invoke", "select"];
        // Skip window chrome so the stub does not close or resize the app.
        const isWindowMgmt = (f: { name?: string; automationId?: string }) =>
            /(?:Close|Minimize|Maximize|Restore)\b/i.test(
                `${f.name ?? ""} ${f.automationId ?? ""}`,
            );
        // An item without verbs cannot be acted on; excluding it here keeps
        // the verbs[0] fallback below from reading an empty array.
        const candidates = input.frontier.filter(
            (f) =>
                !f.destructiveHint && !isWindowMgmt(f) && f.verbs.length > 0,
        );
        const pick =
            candidates.find((f) =>
                f.verbs.some((v) => preferred.includes(v.verb)),
            ) ?? candidates[0];
        if (!pick) {
            return { kind: "stop", reason: "no candidates in frontier" };
        }
        const verb =
            pick.verbs.find((v) => preferred.includes(v.verb))?.verb ??
            pick.verbs[0]!.verb;
        this.decisions++;
        return {
            kind: "act",
            frontierId: pick.id,
            verb,
            expectedDelta: `(stub) ${verb} on ${pick.controlType} '${pick.name ?? pick.automationId ?? ""}'`,
            rationale: `stub-oracle pick #${this.decisions}: ${verb} ${pick.id}`,
        };
    }
}

// ---- ts/packages/agents/onboarding/src/uiCapture/synthesisLlmSchema.ts (new file) ----
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.

// TypeChat schemas for the synthesis pass: neutral-state classification,
// chunk clustering, and synthesized-action generation. Loaded as text by
// TypeChat — keep self-contained, no runtime imports.

/* ---------- Neutral state classification ---------- */

/** Classify a batch of captured states as "neutral" (settled, awaiting next user action) or not. */
export type NeutralStatesClassification = {
    classifications: NeutralStateClassification[];
};

export type NeutralStateClassification = {
    /** State id from the input (e.g. "state-007"). */
    stateId: string;
    /** True if this state is a settled rest point — the user could start a new task from here.
     * False if this state is mid-flow (e.g., a modal dialog requires resolution, an animation in progress). */
    isNeutral: boolean;
    /** One sentence: why neutral or not. */
    reason: string;
    /** Optional human-friendly label like "alarmTab.empty" or "timerTab.running". Camel-and-dot case. */
    tabOrSection?: string;
};

/* ---------- Chunk clustering ---------- */

/** Group chunks by user-meaningful intent. */
export type ClusteringResult = {
    clusters: ChunkCluster[];
    /** Chunks that don't fit any cluster (e.g., partial paths the explorer abandoned). */
    orphans?: string[];
};

export type ChunkCluster = {
    /** Stable id for this cluster (e.g. "cl-001"). */
    clusterId: string;
    /** camelCase verb-noun, e.g. "createAlarm", "startTimer", "navigateToTab". */
    intentName: string;
    /** One sentence describing what the user accomplishes by performing this intent. */
    shortDescription: string;
    /** Ids of chunks (from the input) that belong to this cluster. */
    chunkIds: string[];
};

/* ---------- Synthesized action ---------- */

/** A user-meaningful action ready for downstream phases (phraseGen / schemaGen). */
export type SynthesizedAction = {
    /** camelCase verb-noun (matches the cluster's intentName by default). */
    actionName: string;
    /** Short user-facing description, suitable for help text. */
    description: string;
    /** Parameters extracted from the cluster's chunk variations (empty if all chunks were identical). */
    parameters: ParamSpec[];
    /** Sequenced steps to replay this action at runtime. */
    playback: PlaybackStep[];
    /** Required app state before invoking this action. */
    preconditions: { neutralState: string; description: string };
    /** What the app looks like after the action completes successfully. */
    postconditions: { description: string };
    /** True if the action irreversibly destroys user data (deletes, resets, clears). */
    destructive: boolean;
};

export type ParamSpec = {
    /** camelCase, e.g. "name", "minutes", "enabled". */
    name: string;
    /** "string" | "number" | "boolean" | "enum". */
    type: "string" | "number" | "boolean" | "enum";
    /** When type is "enum", the allowed values. */
    enumValues?: string[];
    /** One short sentence describing what this parameter controls. */
    description: string;
    /** Concrete values observed in the source chunks. */
    examples: Array<string | number | boolean>;
};

export type PlaybackStep = {
    /** Selector path of the control to act on. */
    selector: string;
    /** Verb to apply to that control. */
    verb:
        | "invoke"
        | "toggle"
        | "setValue"
        | "select"
        | "expand"
        | "scroll"
        | "focus"
        | "click";
    /**
     * If this step's value is parameterized, ${paramName} reference. Set
     * either valueRef OR valueLiteral, not both. Omit both for verbs with
     * no value (invoke/focus/click).
     */
    valueRef?: string;
    /** Constant value for verbs that need one. */
    valueLiteral?: string | number | boolean;
    /** Wait for UIA to settle after this step. Default true for invoke/select; false otherwise. */
    waitForIdle?: boolean;
    /** Optional: short description of what changed after this step (sanity check at replay time). */
    expectedDeltaSummary?: string;
};

/* ---------- Validation ---------- */

/** Result of reviewing the full synthesized action set for quality issues. */
export type ValidationResult = {
    /** Per-action review. */
    reviews: ActionReview[];
    /**
     * Action names that should be MERGED — they're really the same intent
     * with different parameters and should be one parameterized action.
     * Each entry lists the names to combine.
     */
    mergeRecommendations?: MergeRecommendation[];
    /** High-level notes about the overall set: gaps, naming conventions, etc. */
    overallNotes?: string;
};

export type ActionReview = {
    actionName: string;
    /** Quality verdict. */
    verdict: "ok" | "fragment" | "duplicate" | "broken" | "ambiguous";
    /** One sentence explanation of any concern. */
    note: string;
    /** Suggested fix in plain English (optional — only when verdict != "ok"). */
    suggestion?: string;
};

export type MergeRecommendation = {
    /** Names of the existing actions that should be merged into one. */
    actionNames: string[];
    /** Proposed combined name (camelCase verb-noun). */
    proposedName: string;
    /** Proposed parameter name that distinguishes the variants (the dimension along which they differ). */
    proposedParam: {
        name: string;
        type: "string" | "number" | "boolean" | "enum";
        enumValues?: string[];
    };
    /** One sentence: why these belong together. */
    rationale: string;
};

// ---- ts/packages/agents/onboarding/src/uiCapture/synthesizer.ts (new file) ----
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
+ +import { ChatModel } from "aiclient"; +import { existsSync, readFileSync, writeFileSync } from "node:fs"; +import path from "node:path"; +import { loadSchema } from "typeagent"; +import { createJsonTranslator, TypeChatJsonTranslator } from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { getSynthesisModel } from "../lib/llm.js"; +import type { CapturedState, CapturedTransition } from "./exploreTypes.js"; +import type { + ClusteringResult, + MergeRecommendation, + NeutralStatesClassification, + NeutralStateClassification, + SynthesizedAction, + ValidationResult, +} from "./synthesisLlmSchema.js"; +import type { TreeNode } from "./types.js"; + +export type SynthesisInput = { + runDir: string; + integrationName: string; + /** Override the default LLM model. Defaults to getSynthesisModel(). */ + model?: ChatModel; + /** + * If set, the synthesized actions are merged into a workspace-level + * discoveredActions.json. The merged file is the canonical source for + * downstream consumers (runtime agent, phraseGen, etc.). + */ + workspaceDir?: string; +}; + +export type SynthesisOutput = { + integrationName: string; + actions: SynthesizedAction[]; + neutralStates: NeutralStateClassification[]; + clusters: ClusteringResult; + chunkCount: number; + discoveredActionsPath: string; + reportPath: string; + /** Path to the workspace-level merged file, when workspaceDir was set. */ + mergedActionsPath?: string; + mergeStats?: { + priorActionCount: number; + addedActionCount: number; + updatedActionCount: number; + finalActionCount: number; + }; + /** Validation pass output, if it ran. 
*/ + validation?: ValidationResult; +}; + +export type DiscoveredActionsFile = { + version: 1; + integrationName: string; + discoveredAt: string; + source: string; + actions: SynthesizedAction[]; +}; + +type Chunk = { + id: string; + transitionIds: string[]; + startStateId: string; + endStateId: string; + isNeutralStart: boolean; + isNeutralEnd: boolean; +}; + +type LoadedGraph = { + states: CapturedState[]; + transitions: CapturedTransition[]; + runDir: string; +}; + +const SYSTEM_PROMPT_BASE = `You are post-processing a UI-Automation exploration trace into a list of user-meaningful actions for a Windows desktop app. Be precise and conservative — don't invent capabilities the trace doesn't show. Take time to think structurally about the data; over-fragmenting or duplicating actions is worse than under-discovering them.`; + +const NEUTRAL_RULES = `Strict rules for isNeutral: +- A modal dialog, popup, flyout, wizard step, picker overlay, or "edit" pane is NEVER neutral. Even if it has stable controls, the user cannot "start a new task" from it — they must commit or cancel first. +- A confirmation prompt is NEVER neutral. +- A loading or transient state is NEVER neutral. +- An app's tab landing area (e.g., "Alarm tab list view", "Timer tab list view") IS neutral, even if it has data; it's the rest point. +- A running operation that the user has no obligation to commit (e.g., "Stopwatch running") IS neutral — it's an idle state of the running tool. +- Use the actionable controls listed for the state to decide. If the controls include "Save", "OK", "Cancel", "Discard", or other commit/dismiss verbs, that's a strong signal of a non-neutral mid-flow state.`; + +const CLUSTERING_RULES = `Strict rules for clustering chunks into intents: + +1) AGGRESSIVELY MERGE multi-step task flows. If a chunk performs an "open dialog → fill fields → click Save/OK/Confirm" sequence, the WHOLE chunk represents ONE user-intent (e.g., "createAlarm"), not three separate intents. Do not split. 
+ +2) PARAMETERIZE BY VARIATION. If multiple chunks share the same selector pattern but differ in values (e.g., 5 chunks each click a different tab; 2 chunks each create an alarm with a different time), they are the SAME cluster. The variation becomes a parameter at synthesis time. + +3) RECOGNIZE TOGGLE BUTTONS. If the SAME selector is invoked across chunks but its effect differs by app state (e.g., a play/pause button, a start/stop button), emit TWO clusters — one per logical action — even though they share a selector. Don't lump alternating clicks of one button into a single 9-step recipe. + +4) DON'T EMIT FRAGMENTS. Do not produce a cluster that contains only "set the name field" or only "click Save in a modal". Those are sub-steps of a parent task. Find the parent task and roll them up. + +5) AIM FOR FEW CLUSTERS. A typical Windows app has 10-25 user-meaningful actions. If you're emitting 40+ clusters from <100 chunks, you're fragmenting. If you're emitting <5 clusters from rich exploration, you're over-merging. + +6) Use camelCase verb-noun names (createAlarm, startTimer, navigateToTab, recordLap, etc.). + +7) If a chunk's purpose is genuinely unclear or it's a partial path the explorer abandoned, list its id under \`orphans\` rather than forcing a cluster.`; + +const SYNTHESIS_RULES = `Strict rules for synthesizing one action from a cluster: + +A) USE THE MOST COMPLETE CHUNK AS THE BASIS. Some chunks in the cluster will be partial (the explorer bailed early, the user shortcut to defaults). Pick the CHUNK WITH THE MOST STEPS as the canonical playback shape. Do NOT take the intersection of chunks — that drops field-setting steps the user needs. + +B) PARAMETER EXTRACTION. For each step in the canonical playback, look at the corresponding step across all chunks. If values vary, declare a parameter and use valueRef "\${paramName}". If values are constant, use valueLiteral. 
Even with only ONE chunk that has a value, declare a parameter for setValue/select-with-item steps when the value is clearly user-supplied (a name, a number, a city). Don't hardcode user-input values. + +C) COMPLETE PLAYBACKS. The recipe must include EVERY step needed to perform the action from the precondition state — open dialog, fill fields, click Save. Don't omit the final commit step. Don't include unrelated steps from before/after the action. + +D) TOGGLE-AWARE. If the same selector appears multiple times in adjacent chunks (e.g., 3 lap presses, 5 alternating play/pause clicks), the action is the SINGLE click — not the repeated clicking. Emit ONE step per logical action. + +E) PARAMETER TYPES. number for numeric values; string for free text; boolean for toggle states; enum (with enumValues) for fixed sets of choices (like tab names). + +F) DESTRUCTIVE FLAG. Set destructive=true for delete/remove/reset/clear actions. Otherwise false. + +G) DESCRIPTIONS are short user-facing help text — what the action accomplishes, not how.`; + +export async function synthesize( + input: SynthesisInput, +): Promise { + const model = input.model ?? getSynthesisModel(); + const graph = loadGraph(input.runDir); + if (graph.states.length === 0) { + throw new Error(`No states found in ${input.runDir}`); + } + if (graph.transitions.length === 0) { + throw new Error(`No transitions found in ${input.runDir}`); + } + + // Step 1: classify neutral states (one LLM call covering all states). + const neutralResult = await classifyNeutralStates(model, graph); + const neutralByState = new Map(); + for (const c of neutralResult.classifications) { + neutralByState.set(c.stateId, c); + } + + // Step 2: chunk transitions deterministically using the neutrals. + const chunks = chunkTransitions(graph.transitions, neutralByState); + + // Step 3: cluster chunks by intent (one LLM call). 
+ let clusters: ClusteringResult = { clusters: [], orphans: [] }; + if (chunks.length > 0) { + clusters = await clusterChunks(model, chunks, graph, neutralByState); + } + + // Step 4: synthesize one action per cluster. + let actions: SynthesizedAction[] = []; + for (const cluster of clusters.clusters) { + const action = await synthesizeOneCluster( + model, + cluster, + chunks, + graph, + neutralByState, + ); + if (action) { + actions.push(action); + } + } + + // Step 4b: validation pass — review the full set, flag fragments/duplicates, + // emit merge recommendations. Then apply merges deterministically. + let validation: ValidationResult | undefined; + if (actions.length > 0) { + validation = await validateActions(model, actions); + if ( + validation.mergeRecommendations && + validation.mergeRecommendations.length > 0 + ) { + actions = applyMergeRecommendations( + actions, + validation.mergeRecommendations, + ); + } + } + + // Step 5: persist outputs. + const discoveredActionsPath = path.join( + input.runDir, + "discoveredActions.json", + ); + writeFileSync( + discoveredActionsPath, + JSON.stringify( + { + version: 1, + integrationName: input.integrationName, + discoveredAt: new Date().toISOString(), + source: "uiCapture", + actions, + }, + null, + 2, + ), + ); + const reportPath = path.join(input.runDir, "synthesisReport.md"); + writeFileSync( + reportPath, + renderReport({ + integrationName: input.integrationName, + graph, + chunks, + neutralByState, + clusters, + actions, + }), + ); + + // Step 6: optional merge into workspace-level discoveredActions.json. 
+ let mergedActionsPath: string | undefined; + let mergeStats: SynthesisOutput["mergeStats"]; + if (input.workspaceDir) { + const result = mergeIntoWorkspace({ + workspaceDir: input.workspaceDir, + integrationName: input.integrationName, + newActions: actions, + }); + mergedActionsPath = result.path; + mergeStats = result.stats; + } + + return { + integrationName: input.integrationName, + actions, + neutralStates: neutralResult.classifications, + clusters, + chunkCount: chunks.length, + discoveredActionsPath, + reportPath, + ...(mergedActionsPath !== undefined ? { mergedActionsPath } : {}), + ...(mergeStats !== undefined ? { mergeStats } : {}), + ...(validation !== undefined ? { validation } : {}), + }; +} + +/* ---------- Merge into workspace-level file ---------- */ + +export function mergeIntoWorkspace(opts: { + workspaceDir: string; + integrationName: string; + newActions: SynthesizedAction[]; +}): { path: string; stats: NonNullable } { + const filePath = path.join(opts.workspaceDir, "discoveredActions.json"); + const prior = loadDiscoveredActions(filePath); + const stats = { + priorActionCount: prior.length, + addedActionCount: 0, + updatedActionCount: 0, + finalActionCount: 0, + }; + const byName = new Map(); + for (const a of prior) byName.set(a.actionName, a); + + for (const fresh of opts.newActions) { + const existing = byName.get(fresh.actionName); + if (!existing) { + byName.set(fresh.actionName, fresh); + stats.addedActionCount++; + } else { + byName.set(fresh.actionName, mergeAction(existing, fresh)); + stats.updatedActionCount++; + } + } + + const merged: DiscoveredActionsFile = { + version: 1, + integrationName: opts.integrationName, + discoveredAt: new Date().toISOString(), + source: "uiCapture", + actions: [...byName.values()].sort((a, b) => + a.actionName.localeCompare(b.actionName), + ), + }; + stats.finalActionCount = merged.actions.length; + writeFileSync(filePath, JSON.stringify(merged, null, 2)); + return { path: filePath, stats }; +} + 
+function loadDiscoveredActions(filePath: string): SynthesizedAction[] { + if (!existsSync(filePath)) return []; + try { + const f = JSON.parse( + readFileSync(filePath, "utf8"), + ) as DiscoveredActionsFile; + return Array.isArray(f.actions) ? f.actions : []; + } catch { + return []; + } +} + +function mergeAction( + existing: SynthesizedAction, + incoming: SynthesizedAction, +): SynthesizedAction { + // Longer playback wins; on a tie the newer one is kept (it likely refines or generalizes the older one). + const playback = + incoming.playback.length >= existing.playback.length + ? incoming.playback + : existing.playback; + // Description: keep the longer one (the prior one on a tie). + const description = + incoming.description.length > existing.description.length + ? incoming.description + : existing.description; + // Destructive: union (true if either run flagged it). + const destructive = existing.destructive || incoming.destructive; + // Preconditions / postconditions: prefer the newer if present, else keep the prior. + const preconditions = incoming.preconditions ?? existing.preconditions; + const postconditions = incoming.postconditions ?? existing.postconditions; + // Parameters: merge by name, accumulate examples. + const params = mergeParameters(existing.parameters, incoming.parameters); + + return { + actionName: existing.actionName, + description, + parameters: params, + playback, + preconditions, + postconditions, + destructive, + }; +} + +function mergeParameters( + existing: SynthesizedAction["parameters"], + incoming: SynthesizedAction["parameters"], +): SynthesizedAction["parameters"] { + const byName = new Map(); + for (const p of existing) byName.set(p.name, p); + for (const p of incoming) { + const prev = byName.get(p.name); + if (!prev) { + byName.set(p.name, p); + continue; + } + // Same name: merge examples (dedupe), keep the longer description, prefer the newer type and enumValues. 
+ const examples = dedupeExamples([...prev.examples, ...p.examples]); + const description = + p.description.length > prev.description.length + ? p.description + : prev.description; + const enumValues = p.enumValues ?? prev.enumValues; + const merged: SynthesizedAction["parameters"][number] = { + name: prev.name, + type: p.type ?? prev.type, + description, + examples, + ...(enumValues !== undefined ? { enumValues } : {}), + }; + byName.set(prev.name, merged); + } + return [...byName.values()]; +} + +function dedupeExamples(arr: T[]): T[] { + const seen = new Set(); + const out: T[] = []; + for (const v of arr) { + const key = JSON.stringify(v); + if (!seen.has(key)) { + seen.add(key); + out.push(v); + } + } + return out; +} + +/* ---------- Step 1: neutral classification ---------- */ + +async function classifyNeutralStates( + model: ChatModel, + graph: LoadedGraph, +): Promise { + const translator = makeTranslator( + model, + "NeutralStatesClassification", + ); + const lines: string[] = []; + lines.push(SYSTEM_PROMPT_BASE); + lines.push(""); + lines.push("Task: classify each state below as neutral or not."); + lines.push(""); + lines.push(NEUTRAL_RULES); + lines.push(""); + lines.push( + "Assign each a short tabOrSection label like 'alarmTab.empty', 'alarmTab.editingDialog', 'timerTab.running' when applicable. Use 'modalX' if you can't tell which tab a non-neutral state belongs to.", + ); + lines.push(""); + for (const state of graph.states) { + const tree = loadStateTree(graph, state.id); + lines.push(summarizeState(state, tree)); + lines.push(""); + } + lines.push( + "Return a NeutralStatesClassification with one entry per stateId.", + ); + const result = await translator.translate(lines.join("\n")); + if (!result.success) { + process.stderr.write( + `[synth] neutral classification translation failed: ${result.message}\n`, + ); + // Fallback: assume all states are neutral. Better signal than silent failure. 
+ return { + classifications: graph.states.map((s) => ({ + stateId: s.id, + isNeutral: true, + reason: `(fallback after LLM failure: ${result.message})`, + })), + }; + } + return result.data; +} + +function summarizeState(state: CapturedState, tree: TreeNode): string { + const actionable: string[] = []; + function walk(n: TreeNode): void { + if (n.patterns.length > 0 && n.isEnabled && !n.isOffscreen) { + const label = n.name ?? n.automationId ?? n.className ?? ""; + actionable.push( + `${n.controlType}${label ? ` '${truncate(label, 40)}'` : ""} [${n.patterns.join(",")}]`, + ); + } + for (const c of n.children) walk(c); + } + walk(tree); + const head = `${state.id} window='${state.windowTitle}' label='${state.label ?? ""}'`; + const limited = actionable.slice(0, 30); + const overflow = + actionable.length > limited.length + ? `\n ... +${actionable.length - limited.length} more` + : ""; + return `${head}\n controls: ${limited.join("; ")}${overflow}`; +} + +/* ---------- Step 2: chunking ---------- */ + +function chunkTransitions( + transitions: CapturedTransition[], + neutralByState: Map, +): Chunk[] { + const chunks: Chunk[] = []; + let chunkId = 1; + let pending: CapturedTransition[] = []; + let pendingStart: string | null = null; + + const isNeutral = (id: string) => + neutralByState.get(id)?.isNeutral !== false; // unknown → assume neutral + + for (const t of transitions) { + if (pendingStart === null) { + pendingStart = t.fromStateId; + } + pending.push(t); + if (isNeutral(t.toStateId)) { + chunks.push({ + id: `C-${chunkId.toString().padStart(3, "0")}`, + transitionIds: pending.map((p) => p.id), + startStateId: pendingStart!, + endStateId: t.toStateId, + isNeutralStart: isNeutral(pendingStart!), + isNeutralEnd: true, + }); + chunkId++; + pending = []; + pendingStart = null; + } + } + // Trailing non-neutral path (incomplete chunk). 
+ if (pending.length > 0) { + const last = pending[pending.length - 1]!; + chunks.push({ + id: `C-${chunkId.toString().padStart(3, "0")}`, + transitionIds: pending.map((p) => p.id), + startStateId: pendingStart!, + endStateId: last.toStateId, + isNeutralStart: isNeutral(pendingStart!), + isNeutralEnd: false, + }); + } + return chunks; +} + +/* ---------- Step 3: clustering ---------- */ + +async function clusterChunks( + model: ChatModel, + chunks: Chunk[], + graph: LoadedGraph, + neutralByState: Map, +): Promise { + const translator = makeTranslator( + model, + "ClusteringResult", + ); + const lines: string[] = []; + lines.push(SYSTEM_PROMPT_BASE); + lines.push(""); + lines.push("Task: group these UI-action chunks by user-meaningful intent."); + lines.push(""); + lines.push(CLUSTERING_RULES); + lines.push(""); + lines.push(`Total chunks to cluster: ${chunks.length}`); + lines.push(""); + for (const ch of chunks) { + lines.push(renderChunkForLLM(ch, graph, neutralByState)); + } + lines.push(""); + lines.push("Return a ClusteringResult. Apply the rules above carefully."); + const result = await translator.translate(lines.join("\n")); + if (!result.success) { + process.stderr.write( + `[synth] clustering translation failed: ${result.message}\n`, + ); + return { clusters: [], orphans: chunks.map((c) => c.id) }; + } + return result.data; +} + +function renderChunkForLLM( + chunk: Chunk, + graph: LoadedGraph, + neutralByState: Map, +): string { + const startLabel = + neutralByState.get(chunk.startStateId)?.tabOrSection ?? ""; + const endLabel = neutralByState.get(chunk.endStateId)?.tabOrSection ?? ""; + const lines: string[] = []; + lines.push( + `Chunk ${chunk.id}: ${chunk.startStateId}${startLabel ? ` (${startLabel})` : ""} → ${chunk.endStateId}${endLabel ? ` (${endLabel})` : ""}`, + ); + for (const tid of chunk.transitionIds) { + const t = graph.transitions.find((x) => x.id === tid); + if (!t) continue; + const value = + t.trigger.value !== undefined + ? 
` value=${JSON.stringify(t.trigger.value)}` + : ""; + const tail = lastSegment(t.trigger.selector); + lines.push(` ${t.trigger.verb} ${tail}${value}`); + } + return lines.join("\n"); +} + +/* ---------- Step 4: synthesis ---------- */ + +async function synthesizeOneCluster( + model: ChatModel, + cluster: ClusteringResult["clusters"][number], + chunks: Chunk[], + graph: LoadedGraph, + neutralByState: Map, +): Promise { + const translator = makeTranslator( + model, + "SynthesizedAction", + ); + const memberChunks = chunks.filter((c) => cluster.chunkIds.includes(c.id)); + if (memberChunks.length === 0) return null; + + const lines: string[] = []; + lines.push(SYSTEM_PROMPT_BASE); + lines.push(""); + lines.push( + `Task: synthesize a single SynthesizedAction for the intent '${cluster.intentName}' (${cluster.shortDescription}).`, + ); + lines.push( + `This intent was observed across ${memberChunks.length} chunk(s). Each chunk is a sequence of (selector, verb, value) tuples that together accomplish the intent.`, + ); + lines.push(""); + lines.push(SYNTHESIS_RULES); + lines.push(""); + lines.push("Use full selector paths exactly as they appear in the chunks."); + lines.push(""); + for (const ch of memberChunks) { + const startLabel = + neutralByState.get(ch.startStateId)?.tabOrSection ?? ""; + const endLabel = neutralByState.get(ch.endStateId)?.tabOrSection ?? ""; + lines.push( + `Chunk ${ch.id}: ${ch.startStateId}${startLabel ? ` (${startLabel})` : ""} → ${ch.endStateId}${endLabel ? ` (${endLabel})` : ""}`, + ); + for (const tid of ch.transitionIds) { + const t = graph.transitions.find((x) => x.id === tid); + if (!t) continue; + const value = + t.trigger.value !== undefined + ? 
` value=${JSON.stringify(t.trigger.value)}` + : ""; + lines.push( + ` ${t.trigger.verb} selector="${t.trigger.selector}"${value}`, + ); + } + lines.push(""); + } + lines.push("Return a SynthesizedAction."); + const result = await translator.translate(lines.join("\n")); + if (!result.success) { + process.stderr.write( + `[synth] cluster '${cluster.intentName}' synthesis translation failed: ${result.message}\n`, + ); + return null; + } + return result.data; +} + +/* ---------- Step 4b: validation pass ---------- */ + +async function validateActions( + model: ChatModel, + actions: SynthesizedAction[], +): Promise { + const translator = makeTranslator( + model, + "ValidationResult", + ); + const lines: string[] = []; + lines.push(SYSTEM_PROMPT_BASE); + lines.push(""); + lines.push( + "Task: review the synthesized action set below and judge its quality. Look for:", + ); + lines.push( + "- FRAGMENTS: actions that are obviously a sub-step of another (e.g., 'setAlarmDetails' that just sets the name field, or 'confirmAlarm' that just clicks Save). 
These shouldn't exist as separate actions; they should be merged INTO their parent action.", + ); + lines.push( + "- DUPLICATES: multiple actions doing the same thing with different parameter values (e.g., 'navigateToTabAlarm', 'navigateToTabTimer', 'navigateToTabClock' should all be ONE 'navigateToTab' with a tab parameter).", + ); + lines.push( + "- BROKEN: actions whose playback is obviously incomplete (1 step that just opens a dialog and never closes it, or N invocations of the same toggle button merged into one recipe).", + ); + lines.push( + "- AMBIGUOUS: action names too generic to be useful, or descriptions that don't match the playback.", + ); + lines.push(""); + lines.push( + "For DUPLICATES specifically, emit mergeRecommendations: list the action names to merge, propose a single combined name, and propose the parameter (with type and possibly enumValues) that distinguishes them.", + ); + lines.push(""); + lines.push("--- Action set under review ---"); + lines.push(""); + for (const a of actions) { + lines.push(`### ${a.actionName}`); + lines.push(`description: ${a.description}`); + lines.push( + `parameters: ${a.parameters.map((p) => `${p.name}:${p.type}=${JSON.stringify(p.examples)}`).join(", ") || "(none)"}`, + ); + lines.push(`destructive: ${a.destructive}`); + lines.push(`playback (${a.playback.length} step(s)):`); + for (let i = 0; i < a.playback.length; i++) { + const s = a.playback[i]!; + const v = + s.valueRef !== undefined + ? ` ref=${s.valueRef}` + : s.valueLiteral !== undefined + ? ` lit=${JSON.stringify(s.valueLiteral)}` + : ""; + lines.push( + ` ${i + 1}. ${s.verb}${v} on ${lastSegment(s.selector)}`, + ); + } + lines.push(""); + } + lines.push( + "Return a ValidationResult. 
Be willing to flag many actions if the set is poorly synthesized.", + ); + const result = await translator.translate(lines.join("\n")); + if (!result.success) { + return { reviews: [] }; + } + return result.data; +} + +function applyMergeRecommendations( + actions: SynthesizedAction[], + recs: MergeRecommendation[], +): SynthesizedAction[] { + let working = [...actions]; + for (const rec of recs) { + const targets = rec.actionNames + .map((n) => working.find((a) => a.actionName === n)) + .filter((x): x is SynthesizedAction => x !== undefined); + if (targets.length < 2) continue; + + // Use the LONGEST playback as the canonical recipe (most complete observation). + const canonical = targets.reduce((best, cur) => + cur.playback.length > best.playback.length ? cur : best, + ); + + // Build the parameter that distinguishes the variants. + const distParam = { + name: rec.proposedParam.name, + type: rec.proposedParam.type, + description: `Distinguishes ${rec.actionNames.join(" / ")} variants.`, + examples: collectExamples(targets), + ...(rec.proposedParam.enumValues !== undefined + ? { enumValues: rec.proposedParam.enumValues } + : {}), + }; + + // The literal that varies is NOT stripped yet (we don't know which step it's at without reading + // the cluster), so no literal is turned into a valueRef here; the user/runtime has to + // fix that up. The merge itself is noted in the description below. + const playback = canonical.playback.map((s) => { + // Placeholder: every step is returned unchanged (no literal-to-valueRef swap is implemented). + return s; + }); + // Conservative: append note to description; leave playback literals alone. + // (Better: we'd reconstruct from the cluster's chunk variations. Future work.) + + // Merged action. 
+ const merged: SynthesizedAction = { + actionName: rec.proposedName, + description: + canonical.description + + ` [merged from: ${rec.actionNames.join(", ")}]`, + parameters: dedupeParams([...canonical.parameters, distParam]), + playback, + preconditions: canonical.preconditions, + postconditions: canonical.postconditions, + destructive: targets.some((t) => t.destructive), + }; + + // Replace targets with merged. + working = working.filter( + (a) => !rec.actionNames.includes(a.actionName), + ); + working.push(merged); + } + return working; +} + +function collectExamples( + actions: SynthesizedAction[], +): Array { + // Use the actionName variants as examples when no parameters distinguish them. + return actions.map((a) => { + const m = a.actionName.match(/[A-Z][a-z]+$/); + return m ? m[0].toLowerCase() : a.actionName; + }); +} + +function dedupeParams( + params: SynthesizedAction["parameters"], +): SynthesizedAction["parameters"] { + const seen = new Set(); + return params.filter((p) => { + if (seen.has(p.name)) return false; + seen.add(p.name); + return true; + }); +} + +/* ---------- Output report ---------- */ + +function renderReport(args: { + integrationName: string; + graph: LoadedGraph; + chunks: Chunk[]; + neutralByState: Map; + clusters: ClusteringResult; + actions: SynthesizedAction[]; +}): string { + const lines: string[] = []; + lines.push(`# Synthesis report: ${args.integrationName}`); + lines.push(""); + lines.push(`Run dir: \`${args.graph.runDir}\``); + lines.push( + `States: ${args.graph.states.length} · Transitions: ${args.graph.transitions.length} · Chunks: ${args.chunks.length}`, + ); + lines.push( + `Clusters: ${args.clusters.clusters.length} · Orphans: ${args.clusters.orphans?.length ?? 0} · Synthesized actions: ${args.actions.length}`, + ); + lines.push(""); + lines.push("## Neutral classifications"); + for (const c of [...args.neutralByState.values()]) { + const flag = c.isNeutral ? "✓" : "✗"; + const label = c.tabOrSection ? 
` [${c.tabOrSection}]` : ""; + lines.push(`- ${flag} ${c.stateId}${label}: ${c.reason}`); + } + lines.push(""); + lines.push("## Clusters"); + for (const cl of args.clusters.clusters) { + lines.push( + `- **${cl.intentName}** (${cl.clusterId}): ${cl.shortDescription} — chunks: ${cl.chunkIds.join(", ")}`, + ); + } + if (args.clusters.orphans && args.clusters.orphans.length > 0) { + lines.push(""); + lines.push(`Orphan chunks: ${args.clusters.orphans.join(", ")}`); + } + lines.push(""); + lines.push("## Synthesized actions"); + for (const a of args.actions) { + lines.push( + `### ${a.actionName}${a.destructive ? " (destructive)" : ""}`, + ); + lines.push(a.description); + lines.push(""); + lines.push("Parameters:"); + if (a.parameters.length === 0) { + lines.push(" (none)"); + } else { + for (const p of a.parameters) { + const eg = p.examples + .slice(0, 5) + .map((v) => JSON.stringify(v)) + .join(", "); + lines.push( + ` - \`${p.name}\` (${p.type}${p.enumValues ? `: ${p.enumValues.join("|")}` : ""}) — ${p.description}${eg ? `; examples: ${eg}` : ""}`, + ); + } + } + lines.push(""); + lines.push("Playback:"); + for (let i = 0; i < a.playback.length; i++) { + const step = a.playback[i]!; + const valuePart = + step.valueRef !== undefined + ? ` ${step.valueRef}` + : step.valueLiteral !== undefined + ? ` ${JSON.stringify(step.valueLiteral)}` + : ""; + lines.push( + ` ${i + 1}. 
${step.verb}${valuePart} on \`${step.selector}\``, + ); + } + lines.push(""); + lines.push(`Preconditions: ${a.preconditions.description}`); + lines.push(`Postconditions: ${a.postconditions.description}`); + lines.push(""); + } + return lines.join("\n"); +} + +/* ---------- helpers ---------- */ + +function makeTranslator( + model: ChatModel, + typeName: string, +): TypeChatJsonTranslator { + const schema = loadSchema(["synthesisLlmSchema.ts"], import.meta.url); + const validator = createTypeScriptJsonValidator(schema, typeName); + return createJsonTranslator(model, validator); +} + +function loadGraph(runDir: string): LoadedGraph { + const statesFile = path.join(runDir, "states.jsonl"); + const transitionsFile = path.join(runDir, "transitions.jsonl"); + if (!existsSync(statesFile) || !existsSync(transitionsFile)) { + throw new Error( + `Missing states.jsonl or transitions.jsonl in ${runDir}`, + ); + } + const states = readFileSync(statesFile, "utf8") + .split("\n") + .filter((l) => l.length > 0) + .map((l) => JSON.parse(l) as CapturedState); + const transitions = readFileSync(transitionsFile, "utf8") + .split("\n") + .filter((l) => l.length > 0) + .map((l) => JSON.parse(l) as CapturedTransition); + return { states, transitions, runDir }; +} + +function loadStateTree(graph: LoadedGraph, stateId: string): TreeNode { + const state = graph.states.find((s) => s.id === stateId); + if (!state) { + throw new Error(`No such state: ${stateId}`); + } + return JSON.parse( + readFileSync(path.join(graph.runDir, state.treeFile), "utf8"), + ) as TreeNode; +} + +function lastSegment(selector: string): string { + const segs = selector.split("/").filter((s) => s.length > 0); + return segs[segs.length - 1] ?? selector; +} + +function truncate(s: string, n: number): string { + return s.length > n ? 
s.slice(0, n - 1) + "…" : s; +} diff --git a/ts/packages/agents/onboarding/src/uiCapture/tabReconnaissance.ts b/ts/packages/agents/onboarding/src/uiCapture/tabReconnaissance.ts new file mode 100644 index 0000000000..0567bb35c3 --- /dev/null +++ b/ts/packages/agents/onboarding/src/uiCapture/tabReconnaissance.ts @@ -0,0 +1,290 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +import { ChatModel } from "aiclient"; +import { loadSchema } from "typeagent"; +import { + createJsonTranslator, + MultimodalPromptContent, + TypeChatJsonTranslator, +} from "typechat"; +import { createTypeScriptJsonValidator } from "typechat/ts"; + +import { getReconModel } from "../lib/llm.js"; +import type { HelperClient } from "./helperClient.js"; +import type { ExpectedAction, TabRecon } from "./reconLlmSchema.js"; +import type { TreeNode } from "./types.js"; + +export type TabRef = { + selector: string; + name: string; + automationId?: string; +}; + +export type AppReconnaissance = { + appHint: string; + tabs: TabRef[]; + surveys: Array<{ tab: TabRef; recon: TabRecon }>; + /** Flattened expected-actions list across all tabs, ready to feed into a crawl goal. */ + expectedActions: Array; +}; + +export type ReconnoiterOptions = { + client: HelperClient; + rootSelector: string; + appHint: string; + model?: ChatModel; + /** Optional: max tabs to survey. */ + maxTabs?: number; + /** Wait after navigating to a tab before capturing. Defaults to 1500ms (UWP NavView is slow). */ + perTabSettleMs?: number; +}; + +/** + * Walk the app's main window, find tab-like navigation (ListItems with + * SelectionItem pattern under a NavView/Group container), navigate to + * each, screenshot + dump the tree, and ask a vision-capable LLM to + * describe each tab's purpose and plausible user actions. 
/**
 * Survey the app's tab-like navigation: dump the tree under
 * `opts.rootSelector`, pick the navigation strip with `discoverTabs`,
 * select each tab in turn, capture a tree dump and a screenshot, and ask
 * the model (via `classifyTab`) to describe the tab.
 *
 * Tabs that cannot be selected, or whose classification yields no result,
 * are left out of `surveys` but still appear in `tabs`. Failures of the
 * per-tab tree dump or screenshot are not caught here and propagate to the
 * caller.
 *
 * @param opts Client, root selector, app description and optional tuning.
 *   `maxTabs` of `undefined` or `0` means "no limit"; `perTabSettleMs`
 *   defaults to 1500.
 * @returns The discovered tabs, the per-tab surveys, and the flattened
 *   expected-actions list (each action tagged with its tab's name).
 */
export async function reconnoiterApp(
    opts: ReconnoiterOptions,
): Promise<AppReconnaissance> {
    const model = opts.model ?? getReconModel();
    const settleMs = opts.perTabSettleMs ?? 1500;

    // 1. Discover tabs.
    const initialTree = await opts.client.treeDump({
        root: opts.rootSelector,
        maxDepth: 10,
    });
    let tabs = discoverTabs(initialTree);
    if (opts.maxTabs && tabs.length > opts.maxTabs) {
        tabs = tabs.slice(0, opts.maxTabs);
    }

    // 2. Survey each.
    const surveys: Array<{ tab: TabRef; recon: TabRecon }> = [];
    for (const tab of tabs) {
        try {
            await opts.client.doSelect({ selector: tab.selector });
        } catch (e) {
            // Some tabs may be unselectable (already-selected, gated, etc.).
            // Skip them, but say so instead of dropping them silently.
            process.stderr.write(
                `[recon] tab '${tab.name}' could not be selected, skipping: ${e}\n`,
            );
            continue;
        }

        // Fixed settle delay first, then wait for UI events to go quiet.
        await sleep(settleMs);
        try {
            await opts.client.eventsIdle({
                debounceMs: 600,
                maxWaitMs: 4000,
            });
        } catch {
            /* idle failures are non-fatal */
        }

        const tree = await opts.client.treeDump({
            root: opts.rootSelector,
            maxDepth: 8,
        });
        const shot = await opts.client.screenshot({
            root: opts.rootSelector,
        });
        const recon = await classifyTab(
            model,
            tab,
            tree,
            shot.pngBase64,
            opts.appHint,
        );
        if (recon) {
            surveys.push({ tab, recon });
        }
    }

    // 3. Flatten the expected actions list, tagging each with its tab name.
    const expectedActions: AppReconnaissance["expectedActions"] = [];
    for (const s of surveys) {
        for (const ea of s.recon.expectedActions) {
            expectedActions.push({ ...ea, tabHint: s.tab.name });
        }
    }
    return { appHint: opts.appHint, tabs, surveys, expectedActions };
}
/**
 * Heuristic tab discovery: find the largest group of sibling nodes that are
 * enabled `ListItem`s exposing the `SelectionItem` pattern, and treat that
 * group as the navigation strip.
 *
 * A group needs at least two qualifying siblings. On a tie in size, the
 * group met first in a pre-order walk wins. Items with no name (missing or
 * empty) are dropped from the result.
 *
 * @param root Root of the dumped UI tree.
 * @returns One `TabRef` per named item in the winning group, in tree order;
 *   `automationId` is set only when the node defines it. Empty when no group
 *   qualifies.
 */
function discoverTabs(root: TreeNode): TabRef[] {
    let bestItems: TreeNode[] = [];

    const visit = (node: TreeNode): void => {
        const candidates = node.children.filter(
            (child) =>
                child.controlType === "ListItem" &&
                child.patterns.includes("SelectionItem") &&
                child.isEnabled,
        );
        // Strict ">" keeps the earliest group when two are the same size.
        if (candidates.length >= 2 && candidates.length > bestItems.length) {
            bestItems = candidates;
        }
        for (const child of node.children) {
            visit(child);
        }
    };
    visit(root);

    const tabs: TabRef[] = [];
    for (const item of bestItems) {
        const name = item.name;
        if (!name) {
            continue; // unnamed items cannot be referred to in a goal
        }
        const ref: TabRef = { selector: item.selector, name };
        if (item.automationId !== undefined) {
            ref.automationId = item.automationId;
        }
        tabs.push(ref);
    }
    return tabs;
}
/**
 * Produce a compact, indented text listing of the controls a user could act
 * on, for inclusion in the recon prompt.
 *
 * A node is listed when it exposes at least one pattern, is enabled, is not
 * offscreen, and has a non-empty name or automation id. Each line is
 * `<indent><controlType> '<label>' [<patterns>]`, indented by one space per
 * tree depth, with the label cut to 50 characters. Only the first 60 lines
 * (in pre-order) are returned.
 *
 * @param root Root of the dumped UI tree.
 * @returns The listing joined with newlines; empty when nothing qualifies.
 */
function summarizeActionableControls(root: TreeNode): string {
    const maxLines = 60;
    const lines: string[] = [];

    function walk(n: TreeNode, depth: number): void {
        // Use "||" rather than "??": a control whose name is the empty
        // string must fall back to its automationId instead of being listed
        // with a blank label.
        const label = n.name || n.automationId;
        if (label && n.patterns.length > 0 && n.isEnabled && !n.isOffscreen) {
            lines.push(
                `${" ".repeat(depth)}${n.controlType} '${truncate(label, 50)}' [${n.patterns.join(",")}]`,
            );
        }
        for (const c of n.children) {
            walk(c, depth + 1);
        }
    }

    walk(root, 0);
    return lines.slice(0, maxLines).join("\n");
}
/**
 * Turn an AppReconnaissance into the text of an explore-loop goal: an
 * instruction line, a numbered list with one entry per expected action
 * (showing example parameter values and the tab it belongs to), and a
 * closing instruction. Destructive actions are listed but marked to skip.
 *
 * @param recon Reconnaissance result whose `expectedActions` are rendered.
 * @returns The goal text, lines joined with "\n".
 */
export function renderReconAsGoal(recon: AppReconnaissance): string {
    const header = `Drive ${recon.appHint} through these specific user actions, working through them in order. Each takes multiple UI steps (open dialog, fill fields, click commit). Skip and move on if a task gets stuck. Avoid destructive actions unless explicitly listed.`;
    const footer =
        "After each action, observe the result and move to the next item. If you've completed all of these, choose 'stop'.";

    // Numbering is 1-based and follows the order of expectedActions.
    const todo = recon.expectedActions.map((action, index) => {
        const args = action.parameters
            .map((p) => `${p.name}=${JSON.stringify(p.example)}`)
            .join(", ");
        const marker = action.destructive ? " [DESTRUCTIVE — skip]" : "";
        return `${index + 1}. ${action.intentName}(${args}) on the ${action.tabHint} tab — ${action.description}${marker}`;
    });

    return [header, "", ...todo, "", footer].join("\n");
}
/**
 * Smoke-test dynamic-control calibration against a running stopwatch.
 *
 * Steps: select the Stopwatch tab, start the stopwatch, run
 * `calibrateDynamicControls` (3 dumps, 3000ms apart), write the result to
 * `clock-stopwatch-dynamic.json` under `fixturesDir`, compare fingerprints
 * taken 4s apart with and without the calibrated rules, then try to stop
 * the stopwatch again.
 *
 * Fingerprint comparisons only log their outcome; they do not fail the run.
 *
 * @param client Helper client used for all UI automation calls.
 * @param rootSelector Selector of the app's main window.
 * @throws Error when the Stopwatch tab or the Start button cannot be found,
 *   or when calibration produces no rules.
 */
async function runStopwatchCalibration(
    client: HelperClient,
    rootSelector: string,
): Promise<void> {
    log("navigating to Stopwatch...");
    let tree = await client.treeDump({ root: rootSelector, maxDepth: 8 });
    const stopwatchTab = findFirst(
        tree,
        (n) => n.name === "Stopwatch" && n.patterns.includes("SelectionItem"),
    );
    if (!stopwatchTab) {
        throw new Error("Stopwatch tab not found in NavView");
    }
    await client.doSelect({ selector: stopwatchTab.selector });
    await client.eventsIdle({ debounceMs: 800, maxWaitMs: 3000 });

    log("starting stopwatch...");
    tree = await client.treeDump({ root: rootSelector, maxDepth: 8 });
    const startBtn = findFirst(
        tree,
        (n) =>
            (n.automationId === "PlayPauseButton" || n.name === "Start") &&
            n.patterns.includes("Invoke"),
    );
    if (!startBtn) {
        throw new Error("Stopwatch Start button not found");
    }
    await client.doInvoke({ selector: startBtn.selector });
    await sleep(500); // let stopwatch tick a bit before first dump

    log("calibrating (3 dumps over ~6s)...");
    const calibrated = await calibrateDynamicControls({
        client,
        rootSelector,
        integrationName: "windowsClock",
        dumpCount: 3,
        delayMs: 3000,
        maxDepth: 8,
    });
    log(
        `calibration: ${calibrated.rules.length} rule(s) in ${calibrated.calibration?.durationMs}ms`,
    );
    // Only the first few rules are echoed to keep the log readable.
    for (const r of calibrated.rules.slice(0, 8)) {
        log(
            ` rule ${r.id}: match=${JSON.stringify(r.match)} props=${JSON.stringify(r.dynamicProperties)} confidence=${r.confidence.toFixed(2)} semantic=${r.semantic ?? ""}`,
        );
    }

    if (calibrated.rules.length === 0) {
        throw new Error(
            "Expected at least one dynamic rule from calibration on a running stopwatch",
        );
    }

    // Save the calibrated file as a fixture for inspection.
    writeFileSync(
        path.join(fixturesDir, "clock-stopwatch-dynamic.json"),
        JSON.stringify(calibrated, null, 2),
    );
    log("saved clock-stopwatch-dynamic.json");

    // Verify rules-aware fingerprint is stable while stopwatch ticks.
    log("comparing fingerprints with and without rules over a 4s window...");
    const fpRulesA = await client.treeFingerprint({
        root: rootSelector,
        dynamicRules: calibrated.rules,
    });
    const fpNakedA = await client.treeFingerprint({ root: rootSelector });
    await sleep(4000);
    const fpRulesB = await client.treeFingerprint({
        root: rootSelector,
        dynamicRules: calibrated.rules,
    });
    const fpNakedB = await client.treeFingerprint({ root: rootSelector });

    log(` with rules: ${fpRulesA.hash} → ${fpRulesB.hash}`);
    log(` without rules: ${fpNakedA.hash} → ${fpNakedB.hash}`);

    if (fpRulesA.hash !== fpRulesB.hash) {
        log(
            " ⚠ rules-aware fingerprints differ — calibration likely missed a dynamic control",
        );
    } else {
        log(" ✓ rules-aware fingerprints match (dynamic content masked)");
    }
    if (fpNakedA.hash === fpNakedB.hash) {
        log(
            " ⚠ naked fingerprints match — stopwatch may not have advanced (UI might be paused)",
        );
    } else {
        log(" ✓ naked fingerprints differ (stopwatch advanced)");
    }

    // Stop stopwatch.
    log("stopping stopwatch...");
    tree = await client.treeDump({ root: rootSelector, maxDepth: 8 });
    const stopBtn = findFirst(
        tree,
        (n) =>
            (n.automationId === "PlayPauseButton" ||
                n.automationId === "PauseButton" ||
                n.name === "Pause" ||
                n.name === "Stop") &&
            n.patterns.includes("Invoke"),
    );
    if (stopBtn) {
        await client.doInvoke({ selector: stopBtn.selector });
        log(" stopwatch stopped");
    } else {
        // Do not claim success when nothing was invoked.
        log(" ⚠ no Pause/Stop button found — stopwatch left running");
    }
}
// Script entry point: run main() and, on any rejection, report the failure
// on stderr (message first, then the stack when the rejection is an Error
// that carries one) and exit with status 1.
main().catch((err) => {
    process.stderr.write(`FAILED: ${err}\n`);
    const stack = err instanceof Error ? err.stack : undefined;
    if (stack) {
        process.stderr.write(stack + "\n");
    }
    process.exit(1);
});
/**
 * Read a discovered-actions JSON file and return the action with the given
 * name.
 *
 * The file is expected to hold an object with an `actions` array whose
 * entries carry an `actionName`.
 *
 * @param file Path to the JSON file.
 * @param actionName Name to look up (exact match on `actionName`).
 * @returns The first action whose `actionName` equals `actionName`.
 * @throws Error when the file has no `actions` array, or when no action has
 *   that name (the message lists the names that are present).
 * @throws SyntaxError (from `JSON.parse`) when the file is not valid JSON.
 */
function findActionByName(file: string, actionName: string): SynthesizedAction {
    const json: unknown = JSON.parse(readFileSync(file, "utf8"));

    // Validate the shape up front so a malformed file produces a clear
    // message instead of "cannot read properties of undefined".
    const rawActions = (json as { actions?: unknown } | null)?.actions;
    if (!Array.isArray(rawActions)) {
        throw new Error(`No 'actions' array in ${file}`);
    }
    const actions = rawActions as SynthesizedAction[];

    const action = actions.find((a) => a.actionName === actionName);
    if (!action) {
        const known = actions.map((a) => a.actionName).join(", ");
        throw new Error(
            `No action '${actionName}' in ${file} (have: ${known})`,
        );
    }
    return action;
}
1 : 0; + for (const c of node.children) n += countMatches(c, pred); + return n; +} + +async function main(): Promise { + const runDir = findMostRecentDir(path.join(WORKSPACE, "runs")); + const actionsFile = path.join(runDir, "discoveredActions.json"); + if (!existsSync(actionsFile)) { + throw new Error(`No discoveredActions.json at ${actionsFile}`); + } + log(`run dir: ${runDir}`); + + const createAlarm = findActionByName(actionsFile, "createAlarm"); + log( + `loaded createAlarm: ${createAlarm.parameters.length} params, ${createAlarm.playback.length} steps`, + ); + + // Find a baseline snapshot to restore against. + const snapshotsDir = path.join(WORKSPACE, "snapshots"); + if (!existsSync(snapshotsDir)) { + throw new Error(`No snapshots dir at ${snapshotsDir}`); + } + const baselineSnapshotDir = findMostRecentDir(snapshotsDir); + log(`baseline snapshot: ${baselineSnapshotDir}`); + + const policy = await inferSnapshotPolicy({ + integrationName: "windowsClock", + aumid: CLOCK_AUMID, + }); + policy.detectionStatus = "user-confirmed"; + + const client = await HelperClient.start({ debug: false }); + try { + // Make sure Clock is closed before we restore (file locks). + for (const w of (await client.appList()).filter((x) => + x.title.includes("Clock"), + )) { + await client.appKill({ pid: w.pid }); + } + await sleep(1500); + + log("restoring baseline so we have a known starting point..."); + await client.snapshotRestore({ + snapshotDir: baselineSnapshotDir, + policy, + }); + + log("launching Clock..."); + const launch = await client.appLaunch({ aumid: CLOCK_AUMID }); + await client.eventsIdle({ debounceMs: 800, maxWaitMs: 5000 }); + log(`launched pid=${launch.pid}`); + + // Count Toggle controls (each alarm has one) BEFORE running the action, + // restricted to the alarm tab area. 
+ const treeBefore = await client.treeDump({ + root: launch.mainWindow, + maxDepth: 10, + }); + const togglesBefore = countMatches( + treeBefore, + (n) => n.automationId === "AlarmViewGrid", + ); + log(`alarm-toggle ListItems before: ${togglesBefore}`); + + const params = { + alarmName: "Crawled Demo Alarm", + hour: 8, + minute: 15, + }; + log(`executing createAlarm with ${JSON.stringify(params)}...`); + const result = await executePlayback(createAlarm, params, { + client, + defaultIdleDebounceMs: 700, + defaultIdleMaxWaitMs: 4000, + }); + + log( + `playback ${result.success ? "OK" : "FAILED"} (${result.steps.length} step(s))`, + ); + for (const s of result.steps) { + const v = + s.value !== undefined ? ` ${JSON.stringify(s.value)}` : ""; + const status = s.success ? "✓" : "✗"; + log( + ` ${status} step ${s.stepIndex + 1}: ${s.verb}${v} (${s.durationMs}ms)${ + s.errorMessage ? ` — ${s.errorMessage}` : "" + }`, + ); + } + + // Count alarm toggles after; expect +1 if action worked. + await sleep(800); + const treeAfter = await client.treeDump({ + root: launch.mainWindow, + maxDepth: 10, + }); + const togglesAfter = countMatches( + treeAfter, + (n) => n.automationId === "AlarmViewGrid", + ); + log(`alarm-toggle ListItems after: ${togglesAfter}`); + + // Look for an alarm whose name contains our test string. + // Alarms render as DataItem with name like "Edit alarm, ,