From 759f18e2e8d39aecbca240abc795c4f4aa4f3e6d Mon Sep 17 00:00:00 2001 From: Aaron Stannard Date: Sat, 16 May 2026 18:08:55 +0000 Subject: [PATCH] fix(shell): prevent hang when ShellTool.Kill fails with Win32Exception MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ShellTool's cancellation handler assumes process.Kill() always succeeds. When Kill() throws Win32Exception (OS refuses the kill), the code logged the error and immediately awaited ReadToEndAsync on both pipes. If the child process is still alive and holding stdout/stderr open, those reads never hit EOF — recreating the exact hung-tool-call failure mode this PR was meant to fix. Fix: close StandardOutput/StandardError in the Win32Exception path so pipe reads drain immediately instead of blocking forever. Also tighten the regression test to spawn a child-process tree (Unix: 'sleep 120 & wait') so it actually exercises the entireProcessTree kill path rather than a lone sleep. --- .../Tools/ShellToolTests.cs | 13 ++++++---- src/Netclaw.Actors/Tools/ShellTool.cs | 26 ++++++++++++++++--- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs index 64c07e0e8..7119e5f7f 100644 --- a/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs +++ b/src/Netclaw.Actors.Tests/Tools/ShellToolTests.cs @@ -73,17 +73,20 @@ public async Task Requested_timeout_overrides_default_timeout() } [Fact] - public async Task Caller_cancellation_returns_gracefully() + public async Task Caller_cancellation_kills_child_process_tree_and_returns_gracefully() { // Reproduces the session-pipeline path: ShellTool's own timeout is long, // and cancellation instead arrives via the *outer* ct (the pipeline's // per-tool deadline). ShellTool must catch that, kill the process, and // return a message rather than letting the cancellation escape as an - // exception. If the kill failed, draining the pipes would hang and this - // test would never complete — so a passing run also proves the process - // was terminated. + // exception. On Unix the command also spawns a background child that + // inherits stdout/stderr; if the tree kill regresses, that child keeps + // the pipe write-ends open and the test never completes. var tool = new ShellTool(new ToolConfig { ShellTimeoutSeconds = 100 }); - var args = ToolInput.Create("Command", "sleep 120"); + var command = OperatingSystem.IsWindows() + ? "ping 127.0.0.1 -n 120 > nul" + : "sleep 120 & wait"; + var args = ToolInput.Create("Command", command); var context = new ToolExecutionContext("test/thread", Path.GetTempPath()) { Audience = TrustAudience.Personal, diff --git a/src/Netclaw.Actors/Tools/ShellTool.cs b/src/Netclaw.Actors/Tools/ShellTool.cs index a99eea1d9..ed7b5f0cd 100644 --- a/src/Netclaw.Actors/Tools/ShellTool.cs +++ b/src/Netclaw.Actors/Tools/ShellTool.cs @@ -134,18 +134,36 @@ protected override async Task ExecuteAsync(Params args, ToolExecutionCon } catch (OperationCanceledException) { + var killClosedPipes = true; + try { process.Kill(entireProcessTree: true); } - catch (Exception ex) when (ex is InvalidOperationException or Win32Exception) + catch (InvalidOperationException ex) { - // Already exited (TOCTOU race), or the OS refused the kill. + // Already exited between cancellation detection and kill. Debug.WriteLine($"shell_execute: process kill skipped — {ex.Message}"); } + catch (Win32Exception ex) + { + // If the OS refuses the kill, the child may keep stdout/stderr + // open forever. Close our read ends so cancellation still + // returns promptly instead of hanging in ReadToEndAsync. + killClosedPipes = false; + Debug.WriteLine($"shell_execute: process kill skipped — {ex.Message}"); + process.StandardOutput.Dispose(); + process.StandardError.Dispose(); + } - // The kill closes the pipes, so the reads now drain to EOF. - await Task.WhenAll(stdoutTask, stderrTask); + try + { + await Task.WhenAll(stdoutTask, stderrTask); + } + catch (Exception ex) when (!killClosedPipes && ex is IOException or ObjectDisposedException) + { + Debug.WriteLine($"shell_execute: pipe drain aborted — {ex.Message}"); + } return timeoutCts.IsCancellationRequested ? $"Error: Command timed out after {effectiveTimeoutSeconds} seconds."