From 5f580044132e8d2ec65abef5f514a339c64141ba Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 12:51:23 -0700 Subject: [PATCH 01/27] Harden gRPC worker and client against silent disconnects Adds Hello deadline, channel recreation after N consecutive failures, jittered exponential backoff, distinct Unauthenticated logging, and DTS health-ping observability to the worker. Mirrors the channel-recreate pattern in the client via a CallInvoker wrapper. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 78 +++++ .../Grpc/ChannelRecreatingCallInvoker.cs | 270 ++++++++++++++++++ src/Client/Grpc/GrpcDurableTaskClient.cs | 26 +- .../Grpc/GrpcDurableTaskClientOptions.cs | 36 +++ .../Internal/InternalOptionsExtensions.cs | 33 +++ src/Client/Grpc/Logs.cs | 9 + .../DurableTaskSchedulerWorkerExtensions.cs | 80 ++++++ .../Grpc/GrpcDurableTaskWorker.Processor.cs | 114 ++++++-- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 120 +++++++- .../Grpc/GrpcDurableTaskWorkerOptions.cs | 46 +++ .../Internal/InternalOptionsExtensions.cs | 23 ++ src/Worker/Grpc/Logs.cs | 18 ++ src/Worker/Grpc/ProcessorExitReason.cs | 22 ++ src/Worker/Grpc/ReconnectBackoff.cs | 42 +++ ...pcDurableTaskWorkerOptionsInternalTests.cs | 62 ++++ .../Grpc.Tests/ReconnectBackoffTests.cs | 114 ++++++++ 16 files changed, 1072 insertions(+), 21 deletions(-) create mode 100644 src/Client/Grpc/ChannelRecreatingCallInvoker.cs create mode 100644 src/Client/Grpc/Internal/InternalOptionsExtensions.cs create mode 100644 src/Worker/Grpc/ProcessorExitReason.cs create mode 100644 src/Worker/Grpc/ReconnectBackoff.cs create mode 100644 test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs create mode 100644 test/Worker/Grpc.Tests/ReconnectBackoffTests.cs diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index d98ac1256..0596b427c 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -7,6 +7,7 @@ using Azure.Core; using Grpc.Net.Client; using Microsoft.DurableTask.Client.Grpc; +using Microsoft.DurableTask.Client.Grpc.Internal; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.DependencyInjection.Extensions; using Microsoft.Extensions.Options; @@ -157,6 +158,83 @@ public void Configure(string? name, GrpcDurableTaskClientOptions options) options.Channel = this.channels.GetOrAdd( cacheKey, _ => new Lazy(source.CreateChannel)).Value; + options.SetChannelRecreator((oldChannel, ct) => this.RecreateChannelAsync(cacheKey, source, oldChannel, ct)); + } + + /// + /// Atomically swaps the cached channel for the given key with a freshly created one and schedules + /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer + /// clients can drain. Returns the currently cached channel if a peer client has already recreated it. + /// + async Task RecreateChannelAsync( + string cacheKey, + DurableTaskSchedulerClientOptions source, + GrpcChannel oldChannel, + CancellationToken cancellation) + { + cancellation.ThrowIfCancellationRequested(); + + if (this.disposed == 1) + { + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + + if (!this.channels.TryGetValue(cacheKey, out Lazy? 
currentLazy))
+        {
+            Lazy<GrpcChannel> created = new(source.CreateChannel);
+            if (this.channels.TryAdd(cacheKey, created))
+            {
+                return created.Value;
+            }
+
+            this.channels.TryGetValue(cacheKey, out currentLazy);
+        }
+
+        if (currentLazy is null)
+        {
+            throw new InvalidOperationException("Failed to obtain a cached gRPC channel after recreation attempt.");
+        }
+
+        if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel))
+        {
+            // A peer client already swapped in a new channel; reuse it.
+            return currentLazy.Value;
+        }
+
+        Lazy<GrpcChannel> newLazy = new(source.CreateChannel);
+        if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy))
+        {
+            this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner);
+            return winner?.Value ?? newLazy.Value;
+        }
+
+        if (currentLazy.IsValueCreated)
+        {
+            _ = ScheduleDeferredDisposeAsync(currentLazy.Value);
+        }
+
+        await Task.Yield();
+        return newLazy.Value;
+    }
+
+    static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel)
+    {
+        try
+        {
+            await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false);
+            await DisposeChannelAsync(channel).ConfigureAwait(false);
+        }
+        catch (Exception ex) when (ex is not OutOfMemoryException
+            and not StackOverflowException
+            and not ThreadAbortException)
+        {
+            if (ex is not OperationCanceledException and not ObjectDisposedException)
+            {
+                Trace.TraceError(
+                    "Unexpected exception while deferred-disposing gRPC channel in DurableTaskSchedulerClientExtensions.ScheduleDeferredDisposeAsync: {0}",
+                    ex);
+            }
+        }
+    }
 
     /// <inheritdoc/>
diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs
new file mode 100644
index 000000000..d3d715bdb
--- /dev/null
+++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs
@@ -0,0 +1,270 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using System.Diagnostics;
+using Microsoft.Extensions.Logging;
+
+namespace Microsoft.DurableTask.Client.Grpc;
+
+/// <summary>
+/// A <see cref="CallInvoker"/> wrapper that observes RPC outcomes and triggers a fire-and-forget channel
+/// recreation after a configurable number of consecutive transport failures
+/// (<see cref="StatusCode.Unavailable"/>, or <see cref="StatusCode.DeadlineExceeded"/> on RPCs that are
+/// not long-poll waits). This guards against half-open HTTP/2 connections that can otherwise wedge
+/// an entire client process for the lifetime of the gRPC channel.
+/// </summary>
+/// <remarks>
+/// The wrapper holds an immutable <see cref="TransportState"/> (channel + invoker pair) and swaps
+/// the entire pair atomically on recreate to avoid torn state. Streaming RPCs are forwarded without
+/// outcome observation; only unary RPC outcomes count toward the failure threshold.
+/// The triggering RPC still surfaces its original failure to the caller; subsequent RPCs benefit
+/// from the recreated channel.
+/// </remarks>
+sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable
+{
+    /// <summary>
+    /// Methods on which a <see cref="StatusCode.DeadlineExceeded"/> response is expected behavior
+    /// (long-poll-style waits) and must NOT be counted toward the recreate threshold.
+    /// </summary>
+    static readonly HashSet<string> DeadlineExceededAllowedMethods = new(StringComparer.Ordinal)
+    {
+        "/TaskHubSidecarService/WaitForInstanceCompletion",
+        "/TaskHubSidecarService/WaitForInstanceStart",
+    };
+
+    readonly Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator;
+    readonly int failureThreshold;
+    readonly TimeSpan minRecreateInterval;
+    readonly bool ownsChannel;
+    readonly ILogger logger;
+
+    TransportState state;
+    int consecutiveFailures;
+    int recreateInFlight;
+    long lastRecreateTicks;
+
+    public ChannelRecreatingCallInvoker(
+        GrpcChannel initialChannel,
+        Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator,
+        int failureThreshold,
+        TimeSpan minRecreateInterval,
+        bool ownsChannel,
+        ILogger logger)
+    {
+        this.recreator = recreator;
+        this.failureThreshold = failureThreshold;
+        this.minRecreateInterval = minRecreateInterval;
+        this.ownsChannel = ownsChannel;
+        this.logger = logger;
+        this.state = new TransportState(initialChannel, initialChannel.CreateCallInvoker());
+
+        // Seed lastRecreateTicks so cooldown does not block the very first recreate attempt.
+        this.lastRecreateTicks = Stopwatch.GetTimestamp() - StopwatchTicksFor(minRecreateInterval);
+    }
+
+    public override TResponse BlockingUnaryCall<TRequest, TResponse>(
+        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
+    {
+        TransportState current = this.state;
+        try
+        {
+            TResponse response = current.Invoker.BlockingUnaryCall(method, host, options, request);
+            this.RecordSuccess();
+            return response;
+        }
+        catch (RpcException ex)
+        {
+            this.RecordFailure(ex.StatusCode, method.FullName);
+            throw;
+        }
+    }
+
+    public override AsyncUnaryCall<TResponse> AsyncUnaryCall<TRequest, TResponse>(
+        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
+    {
+        TransportState current = this.state;
+        AsyncUnaryCall<TResponse> call = current.Invoker.AsyncUnaryCall(method, host, options, request);
+        this.ObserveOutcome(call.ResponseAsync, method.FullName);
+        return call;
+    }
+
+    public override AsyncServerStreamingCall<TResponse> AsyncServerStreamingCall<TRequest, TResponse>(
+        Method<TRequest, TResponse> method, string? host, CallOptions options, TRequest request)
+    {
+        // Streaming calls are forwarded without outcome observation. The streaming methods used by the
+        // DurableTask client are bounded snapshots (e.g. StreamInstanceHistory) where errors surface as
+        // exceptions to user code, so global failure counting on these would create false positives.
+        return this.state.Invoker.AsyncServerStreamingCall(method, host, options, request);
+    }
+
+    public override AsyncClientStreamingCall<TRequest, TResponse> AsyncClientStreamingCall<TRequest, TResponse>(
+        Method<TRequest, TResponse> method, string? host, CallOptions options)
+    {
+        return this.state.Invoker.AsyncClientStreamingCall(method, host, options);
+    }
+
+    public override AsyncDuplexStreamingCall<TRequest, TResponse> AsyncDuplexStreamingCall<TRequest, TResponse>(
+        Method<TRequest, TResponse> method, string? host, CallOptions options)
+    {
+        return this.state.Invoker.AsyncDuplexStreamingCall(method, host, options);
+    }
+
+    public async ValueTask DisposeAsync()
+    {
+        if (!this.ownsChannel)
+        {
+            return;
+        }
+
+        TransportState current = this.state;
+        try
+        {
+#if NET6_0_OR_GREATER
+            await current.Channel.ShutdownAsync().ConfigureAwait(false);
+            current.Channel.Dispose();
+#else
+            await current.Channel.ShutdownAsync().ConfigureAwait(false);
+#endif
+        }
+        catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException)
+        {
+            // Best-effort disposal.
+        }
+    }
+
+    static long StopwatchTicksFor(TimeSpan ts) =>
+        (long)(ts.TotalSeconds * Stopwatch.Frequency);
+
+    void ObserveOutcome<TResponse>(Task<TResponse> responseAsync, string methodFullName)
+    {
+        // Use ContinueWith with TaskScheduler.Default so we don't capture sync context.
+ responseAsync.ContinueWith( + (t, state) => + { + var (self, name) = ((ChannelRecreatingCallInvoker, string))state!; + if (t.Status == TaskStatus.RanToCompletion) + { + self.RecordSuccess(); + } + else if (t.Exception?.InnerException is RpcException rpcEx) + { + self.RecordFailure(rpcEx.StatusCode, name); + } + }, + (this, methodFullName), + CancellationToken.None, + TaskContinuationOptions.ExecuteSynchronously, + TaskScheduler.Default); + } + + void RecordSuccess() + { + Volatile.Write(ref this.consecutiveFailures, 0); + } + + void RecordFailure(StatusCode status, string methodFullName) + { + bool counts = status switch + { + StatusCode.Unavailable => true, + StatusCode.DeadlineExceeded => !DeadlineExceededAllowedMethods.Contains(methodFullName), + _ => false, + }; + + if (!counts) + { + return; + } + + int count = Interlocked.Increment(ref this.consecutiveFailures); + if (this.failureThreshold <= 0 || count < this.failureThreshold) + { + return; + } + + this.MaybeTriggerRecreate(count); + } + + void MaybeTriggerRecreate(int observedCount) + { + long nowTicks = Stopwatch.GetTimestamp(); + long elapsedTicks = nowTicks - Volatile.Read(ref this.lastRecreateTicks); + long cooldownTicks = StopwatchTicksFor(this.minRecreateInterval); + if (elapsedTicks < cooldownTicks) + { + return; + } + + // Single-flight guard: only one recreate task in flight at a time. + if (Interlocked.CompareExchange(ref this.recreateInFlight, 1, 0) != 0) + { + return; + } + + // Re-check elapsed under the guard to avoid back-to-back recreates that won the CAS race. + elapsedTicks = Stopwatch.GetTimestamp() - Volatile.Read(ref this.lastRecreateTicks); + if (elapsedTicks < cooldownTicks) + { + Interlocked.Exchange(ref this.recreateInFlight, 0); + return; + } + + this.logger.RecreatingChannel(observedCount); + _ = Task.Run(() => this.RecreateAsync(observedCount)); + } + + async Task RecreateAsync(int observedCount) + { + try + { + TransportState current = this.state; + using CancellationTokenSource cts = new(TimeSpan.FromSeconds(30)); + GrpcChannel newChannel = await this.recreator(current.Channel, cts.Token).ConfigureAwait(false); + + if (!ReferenceEquals(newChannel, current.Channel)) + { + this.state = new TransportState(newChannel, newChannel.CreateCallInvoker()); + this.logger.ChannelRecreated(GetEndpointDescription(newChannel)); + } + + // Successful recreate (even if a peer beat us to it) → reset the failure counter. + Volatile.Write(ref this.consecutiveFailures, 0); + Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp()); + } + catch (Exception ex) when (ex is not OutOfMemoryException + and not StackOverflowException + and not ThreadAbortException) + { + this.logger.ChannelRecreateFailed(ex); + + // Update lastRecreateTicks even on failure so the cooldown applies to failed attempts too. + Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp()); + } + finally + { + Interlocked.Exchange(ref this.recreateInFlight, 0); + } + } + + static string GetEndpointDescription(GrpcChannel channel) + { +#if NET6_0_OR_GREATER + return channel.Target ?? "(unknown)"; +#else + return channel.Target ?? 
"(unknown)"; +#endif + } + + sealed class TransportState + { + public TransportState(GrpcChannel channel, CallInvoker invoker) + { + this.Channel = channel; + this.Invoker = invoker; + } + + public GrpcChannel Channel { get; } + + public CallInvoker Invoker { get; } + } +} diff --git a/src/Client/Grpc/GrpcDurableTaskClient.cs b/src/Client/Grpc/GrpcDurableTaskClient.cs index 46e4dd2ed..036bb26fc 100644 --- a/src/Client/Grpc/GrpcDurableTaskClient.cs +++ b/src/Client/Grpc/GrpcDurableTaskClient.cs @@ -52,7 +52,7 @@ public GrpcDurableTaskClient(string name, GrpcDurableTaskClientOptions options, { this.logger = Check.NotNull(logger); this.options = Check.NotNull(options); - this.asyncDisposable = GetCallInvoker(options, out CallInvoker callInvoker); + this.asyncDisposable = GetCallInvoker(options, logger, out CallInvoker callInvoker); this.sidecarClient = new TaskHubSidecarServiceClient(callInvoker); if (this.options.EnableEntitySupport) @@ -624,21 +624,43 @@ public override async Task> GetOrchestrationHistoryAsync( } } - static AsyncDisposable GetCallInvoker(GrpcDurableTaskClientOptions options, out CallInvoker callInvoker) + static AsyncDisposable GetCallInvoker(GrpcDurableTaskClientOptions options, ILogger logger, out CallInvoker callInvoker) { + Func>? recreator = options.Internal.ChannelRecreator; + int threshold = options.Internal.ChannelRecreateFailureThreshold; + TimeSpan cooldown = options.Internal.MinRecreateInterval; + bool recreateEnabled = recreator != null && threshold > 0; + if (options.Channel is GrpcChannel c) { + if (recreateEnabled) + { + ChannelRecreatingCallInvoker wrapper = new(c, recreator!, threshold, cooldown, ownsChannel: false, logger); + callInvoker = wrapper; + return default; + } + callInvoker = c.CreateCallInvoker(); return default; } if (options.CallInvoker is CallInvoker invoker) { + // Externally supplied invoker — we do not own the underlying channel and cannot recreate it. callInvoker = invoker; return default; } + // Self-owned address path: create the channel ourselves so we own its lifecycle. c = GetChannel(options.Address); + + if (recreateEnabled) + { + ChannelRecreatingCallInvoker wrapper = new(c, recreator!, threshold, cooldown, ownsChannel: true, logger); + callInvoker = wrapper; + return new AsyncDisposable(() => wrapper.DisposeAsync()); + } + callInvoker = c.CreateCallInvoker(); return new AsyncDisposable(() => new(c.ShutdownAsync())); } diff --git a/src/Client/Grpc/GrpcDurableTaskClientOptions.cs b/src/Client/Grpc/GrpcDurableTaskClientOptions.cs index b67b62f9a..126aad345 100644 --- a/src/Client/Grpc/GrpcDurableTaskClientOptions.cs +++ b/src/Client/Grpc/GrpcDurableTaskClientOptions.cs @@ -22,4 +22,40 @@ public sealed class GrpcDurableTaskClientOptions : DurableTaskClientOptions /// Gets or sets the gRPC call invoker to use. Will supersede when provided. /// public CallInvoker? CallInvoker { get; set; } + + /// + /// Gets the internal options. These are not exposed directly, but configurable via + /// . + /// + internal InternalOptions Internal { get; } = new(); + + /// + /// Internal options are not exposed directly, but configurable via . + /// + internal class InternalOptions + { + /// + /// Gets or sets the number of consecutive transport failures (Unavailable responses, or + /// DeadlineExceeded responses on RPCs other than long-poll waits) after which the underlying + /// gRPC channel will be recreated to clear stale DNS, sub-channel state, or routing-affinity + /// bindings. Setting to 0 or a negative value disables channel recreation. 
+        /// </summary>
+        public int ChannelRecreateFailureThreshold { get; set; } = 5;
+
+        /// <summary>
+        /// Gets or sets the minimum interval between consecutive channel recreate attempts. Acts as a
+        /// cooldown so a burst of failures during a real outage cannot thrash the channel cache.
+        /// Defaults to 30 seconds.
+        /// </summary>
+        public TimeSpan MinRecreateInterval { get; set; } = TimeSpan.FromSeconds(30);
+
+        /// <summary>
+        /// Gets or sets an optional callback invoked when the client requests a fresh gRPC channel after
+        /// repeated transport failures. The callback receives the previously-used channel and should
+        /// return either a freshly created channel or the currently cached channel if a peer has already
+        /// recreated it. Implementations are responsible for atomic swap and deferred disposal of the
+        /// old channel so in-flight RPCs from peer clients are not interrupted.
+        /// </summary>
+        public Func<GrpcChannel, CancellationToken, Task<GrpcChannel>>? ChannelRecreator { get; set; }
+    }
 }
diff --git a/src/Client/Grpc/Internal/InternalOptionsExtensions.cs b/src/Client/Grpc/Internal/InternalOptionsExtensions.cs
new file mode 100644
index 000000000..800848f34
--- /dev/null
+++ b/src/Client/Grpc/Internal/InternalOptionsExtensions.cs
@@ -0,0 +1,33 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace Microsoft.DurableTask.Client.Grpc.Internal;
+
+/// <summary>
+/// Provides access to configuring internal options for the gRPC client.
+/// </summary>
+public static class InternalOptionsExtensions
+{
+    /// <summary>
+    /// Sets a callback that the client invokes when the underlying gRPC channel needs to be recreated
+    /// after repeated transport failures (e.g., because the backend was replaced and the existing channel
+    /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the client last
+    /// observed and must return either a freshly created channel or the currently cached channel if a
+    /// peer client has already swapped it. Implementations are responsible for atomic swap and deferred
+    /// disposal of the old channel so in-flight RPCs from peer clients are not interrupted.
+    /// </summary>
+    /// <param name="options">The gRPC client options.</param>
+    /// <param name="recreator">The recreate callback.</param>
+    /// <remarks>
+    /// This is an internal API that supports the DurableTask infrastructure and not subject to
+    /// the same compatibility standards as public APIs. It may be changed or removed without notice in
+    /// any release. You should only use it directly in your code with extreme caution and knowing that
+    /// doing so can result in application failures when updating to a new DurableTask release.
+    /// </remarks>
+    public static void SetChannelRecreator(
+        this GrpcDurableTaskClientOptions options,
+        Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator)
+    {
+        options.Internal.ChannelRecreator = recreator ?? throw new ArgumentNullException(nameof(recreator));
+    }
+}
diff --git a/src/Client/Grpc/Logs.cs b/src/Client/Grpc/Logs.cs
index 8887f17d9..d57b015c5 100644
--- a/src/Client/Grpc/Logs.cs
+++ b/src/Client/Grpc/Logs.cs
@@ -39,6 +39,15 @@ static partial class Logs
     [LoggerMessage(EventId = 46, Level = LogLevel.Information, Message = "Purging instances with filter: {{ CreatedFrom = {createdFrom}, CreatedTo = {createdTo}, Statuses = {statuses} }}")]
     public static partial void PurgingInstances(this ILogger logger, DateTimeOffset? createdFrom, DateTimeOffset? createdTo, string? statuses);
 
+    [LoggerMessage(EventId = 80, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive transport failures.")]
+    public static partial void RecreatingChannel(this ILogger logger, int failureCount);
+
+    [LoggerMessage(EventId = 81, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")]
+    public static partial void ChannelRecreated(this ILogger logger, string endpoint);
+
+    [LoggerMessage(EventId = 82, Level = LogLevel.Warning, Message = "gRPC channel recreation failed.")]
+    public static partial void ChannelRecreateFailed(this ILogger logger, Exception exception);
+
     /// <summary>
     /// .
     /// </summary>
diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs
index 0164d1cc5..ecf3bf754 100644
--- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs
+++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs
@@ -156,9 +156,89 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options)
         options.Channel = this.channels.GetOrAdd(
             cacheKey,
             _ => new Lazy<GrpcChannel>(source.CreateChannel)).Value;
 
+        options.SetChannelRecreator((oldChannel, ct) => this.RecreateChannelAsync(cacheKey, source, oldChannel, ct));
         options.ConfigureForAzureManaged();
     }
 
+    /// <summary>
+    /// Atomically swaps the cached channel for the given key with a freshly created one and schedules
+    /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer workers
+    /// can drain. Returns the currently cached channel if a peer worker has already recreated it.
+    /// </summary>
+    async Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation)
+    {
+        cancellation.ThrowIfCancellationRequested();
+
+        if (this.disposed == 1)
+        {
+            throw new ObjectDisposedException(nameof(ConfigureGrpcChannel));
+        }
+
+        // CAS swap: only replace if the cached lazy still holds the channel the caller observed.
+        if (!this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? currentLazy))
+        {
+            // No entry — create one and return it.
+            Lazy<GrpcChannel> created = new(source.CreateChannel);
+            if (this.channels.TryAdd(cacheKey, created))
+            {
+                return created.Value;
+            }
+
+            // Another thread added one; fall through to read it.
+            this.channels.TryGetValue(cacheKey, out currentLazy);
+        }
+
+        if (currentLazy is null)
+        {
+            throw new InvalidOperationException("Failed to obtain a cached gRPC channel after recreation attempt.");
+        }
+
+        if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel))
+        {
+            // A peer worker already swapped in a new channel; reuse it.
+            return currentLazy.Value;
+        }
+
+        Lazy<GrpcChannel> newLazy = new(source.CreateChannel);
+        if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy))
+        {
+            // Lost the race; whoever won has the freshest entry.
+            this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner);
+            return winner?.Value ?? newLazy.Value;
+        }
+
+        // Successful swap. Schedule graceful disposal of the old channel after a grace period
+        // so peer workers' in-flight RPCs can drain.
+        if (currentLazy.IsValueCreated)
+        {
+            _ = ScheduleDeferredDisposeAsync(currentLazy.Value);
+        }
+
+        await Task.Yield();
+        return newLazy.Value;
+    }
+
+    static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel)
+    {
+        try
+        {
+            // Grace period to let in-flight RPCs from peer workers complete before draining the channel.
+            await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false);
+            await DisposeChannelAsync(channel).ConfigureAwait(false);
+        }
+        catch (Exception ex) when (ex is not OutOfMemoryException
+            and not StackOverflowException
+            and not ThreadAbortException)
+        {
+            if (ex is not OperationCanceledException and not ObjectDisposedException)
+            {
+                Trace.TraceError(
+                    "Unexpected exception while deferred-disposing gRPC channel in DurableTaskSchedulerWorkerExtensions.ScheduleDeferredDisposeAsync: {0}",
+                    ex);
+            }
+        }
+    }
+
     /// <inheritdoc/>
     public async ValueTask DisposeAsync()
     {
diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
index 58a6db040..c07157aef 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs
@@ -54,29 +54,61 @@ public Processor(GrpcDurableTaskWorker worker, TaskHubSidecarServiceClient clien
 
     ILogger Logger => this.worker.logger;
 
-    public async Task ExecuteAsync(CancellationToken cancellation)
+    public async Task<ProcessorExitReason> ExecuteAsync(CancellationToken cancellation)
     {
+        // Tracks consecutive failures against the same channel. Reset only after the stream
+        // has actually delivered a message (HelloAsync alone is not proof the channel is healthy).
+        int consecutiveChannelFailures = 0;
+
+        // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message.
+        int reconnectAttempt = 0;
+        Random backoffRandom = new();
+
         while (!cancellation.IsCancellationRequested)
         {
+            bool channelLikelyPoisoned = false;
             try
             {
                 AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation);
-                await this.ProcessWorkItemsAsync(stream, cancellation);
+                await this.ProcessWorkItemsAsync(
+                    stream,
+                    cancellation,
+                    onFirstMessage: () =>
+                    {
+                        consecutiveChannelFailures = 0;
+                        reconnectAttempt = 0;
+                    },
+                    onSilentDisconnect: () => channelLikelyPoisoned = true);
             }
             catch (RpcException) when (cancellation.IsCancellationRequested)
             {
                 // Worker is shutting down - let the method exit gracefully
-                break;
+                return ProcessorExitReason.Shutdown;
             }
             catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled)
             {
-                // Sidecar is shutting down - retry
+                // Sidecar is shutting down - retry. Don't count toward channel-poisoned threshold:
+                // Cancelled is ambiguous and shouldn't drive recreate storms.
                 this.Logger.SidecarDisconnected();
             }
+            catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded)
+            {
+                // HelloAsync hung — most plausibly a half-open HTTP/2 connection / stale routing.
+                int seconds = Math.Max(1, (int)this.internalOptions.HelloDeadline.TotalSeconds);
+                this.Logger.HelloTimeout(seconds);
+                channelLikelyPoisoned = true;
+            }
             catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable)
             {
-                // Sidecar is down - keep retrying
+                // Sidecar is down - keep retrying.
                 this.Logger.SidecarUnavailable();
+                channelLikelyPoisoned = true;
+            }
+            catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated)
+            {
+                // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel
+                // recreate: a fresh channel won't fix bad credentials.
+                this.Logger.AuthenticationFailed(ex);
             }
             catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound)
             {
@@ -91,7 +123,7 @@
             catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
             {
                 // Shutting down, lets exit gracefully.
-                break;
+                return ProcessorExitReason.Shutdown;
             }
             catch (Exception ex)
             {
@@ -99,19 +131,39 @@
                 this.Logger.UnexpectedError(ex, string.Empty);
             }
 
+            if (channelLikelyPoisoned)
+            {
+                consecutiveChannelFailures++;
+                int threshold = this.internalOptions.ChannelRecreateFailureThreshold;
+                if (threshold > 0 && consecutiveChannelFailures >= threshold)
+                {
+                    this.Logger.RecreatingChannel(consecutiveChannelFailures);
+                    return ProcessorExitReason.ChannelRecreateRequested;
+                }
+            }
+
             try
             {
-                // CONSIDER: Exponential backoff
-                await Task.Delay(TimeSpan.FromSeconds(5), cancellation);
+                TimeSpan delay = ReconnectBackoff.Compute(
+                    reconnectAttempt,
+                    this.internalOptions.ReconnectBackoffBase,
+                    this.internalOptions.ReconnectBackoffCap,
+                    backoffRandom);
+                this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds);
+                reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt
+                await Task.Delay(delay, cancellation);
             }
             catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
             {
                 // Worker is shutting down - let the method exit gracefully
-                break;
+                return ProcessorExitReason.Shutdown;
             }
         }
+
+        return ProcessorExitReason.Shutdown;
     }
 
     static string GetActionsListForLogging(IReadOnlyList<P.OrchestratorAction> actions)
     {
         if (actions.Count == 0)
@@ -242,7 +294,10 @@ async ValueTask BuildRuntimeStateAsync(
 
     async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation)
     {
-        await this.client!.HelloAsync(EmptyMessage, cancellationToken: cancellation);
+        TimeSpan helloDeadline = this.internalOptions.HelloDeadline;
+        DateTime? deadline = helloDeadline > TimeSpan.Zero ? DateTime.UtcNow.Add(helloDeadline) : null;
+
+        await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation);
         this.Logger.EstablishedWorkItemConnection();
 
         DurableTaskWorkerOptions workerOptions = this.worker.workerOptions;
@@ -263,20 +318,43 @@
             cancellationToken: cancellation);
     }
 
-    async Task ProcessWorkItemsAsync(AsyncServerStreamingCall<P.WorkItem> stream, CancellationToken cancellation)
+    async Task ProcessWorkItemsAsync(
+        AsyncServerStreamingCall<P.WorkItem> stream,
+        CancellationToken cancellation,
+        Action? onFirstMessage = null,
+        Action? onSilentDisconnect = null)
     {
         // Create a new token source for timing out and a final token source that keys off of them both.
-        // The timeout token is used to detect when we are no longer getting any messages, including health checks.
-        // If this is the case, it signifies the connection has been dropped silently and we need to reconnect.
+        // The timeout token is used to detect when we are no longer getting any messages, including health
+        // pings sent periodically by the server. If this is the case, it signifies the connection has been
+        // dropped silently and we need to reconnect.
+ TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; + bool silentDisconnectEnabled = silentDisconnectTimeout > TimeSpan.Zero; using var timeoutSource = new CancellationTokenSource(); - timeoutSource.CancelAfter(TimeSpan.FromSeconds(60)); + if (silentDisconnectEnabled) + { + timeoutSource.CancelAfter(silentDisconnectTimeout); + } + using var tokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellation, timeoutSource.Token); + bool firstMessageObserved = false; + while (!cancellation.IsCancellationRequested) { await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) { - timeoutSource.CancelAfter(TimeSpan.FromSeconds(60)); + if (silentDisconnectEnabled) + { + timeoutSource.CancelAfter(silentDisconnectTimeout); + } + + if (!firstMessageObserved) + { + firstMessageObserved = true; + onFirstMessage?.Invoke(); + } + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) { this.RunBackgroundTask( @@ -321,7 +399,10 @@ async Task ProcessWorkItemsAsync(AsyncServerStreamingCall stream, Ca } else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) { - // No-op + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset above is the actionable behavior. Logging at Trace allows operators + // to confirm liveness without flooding info-level telemetry. + this.Logger.ReceivedHealthPing(); } else { @@ -338,6 +419,7 @@ async Task ProcessWorkItemsAsync(AsyncServerStreamingCall stream, Ca { // Since the cancellation came from the timeout, log a warning. this.Logger.ConnectionTimeout(); + onSilentDisconnect?.Invoke(); } return; diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index 1d03d96ae..1905a5146 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -57,9 +57,123 @@ public GrpcDurableTaskWorker( /// protected override async Task ExecuteAsync(CancellationToken stoppingToken) { - await using AsyncDisposable disposable = this.GetCallInvoker(out CallInvoker callInvoker, out string address); - this.logger.StartingTaskHubWorker(address); - await new Processor(this, new(callInvoker), this.orchestrationFilter, this.ExceptionPropertiesProvider).ExecuteAsync(stoppingToken); + AsyncDisposable workerOwnedChannelDisposable = this.GetCallInvoker(out CallInvoker callInvoker, out string address); + try + { + this.logger.StartingTaskHubWorker(address); + + while (!stoppingToken.IsCancellationRequested) + { + Processor processor = new(this, new(callInvoker), this.orchestrationFilter, this.ExceptionPropertiesProvider); + ProcessorExitReason reason = await processor.ExecuteAsync(stoppingToken); + + if (reason == ProcessorExitReason.Shutdown || stoppingToken.IsCancellationRequested) + { + return; + } + + // ChannelRecreateRequested: try to obtain a fresh channel before re-entering the loop. + ChannelRecreateResult result = await this.TryRecreateChannelAsync(stoppingToken, workerOwnedChannelDisposable); + if (result.Recreated) + { + callInvoker = result.NewCallInvoker!; + address = result.NewAddress!; + AsyncDisposable previousDisposable = workerOwnedChannelDisposable; + workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; + this.logger.ChannelRecreated(address); + + // Dispose the prior worker-owned channel (if any) AFTER swapping in the new one. + // For caller-supplied channels this is a no-op (default AsyncDisposable). 
+                if (!ReferenceEquals(previousDisposable, workerOwnedChannelDisposable))
+                {
+                    await previousDisposable.DisposeAsync();
+                }
+            }
+
+            // If we couldn't recreate (e.g., caller-owned CallInvoker), fall through and retry on the
+            // existing transport. The Processor's outer backoff already waited before returning.
+        }
+    }
+    finally
+    {
+        await workerOwnedChannelDisposable.DisposeAsync();
+    }
+}
+
+async Task<ChannelRecreateResult> TryRecreateChannelAsync(
+    CancellationToken cancellation,
+    AsyncDisposable currentWorkerOwnedDisposable)
+{
+    // Path 1: caller (or extension method like ConfigureGrpcChannel) supplied a recreator.
+    Func<GrpcChannel, CancellationToken, Task<GrpcChannel>>? recreator = this.grpcOptions.Internal.ChannelRecreator;
+    if (recreator is not null && this.grpcOptions.Channel is GrpcChannel currentChannel)
+    {
+        try
+        {
+            GrpcChannel newChannel = await recreator(currentChannel, cancellation).ConfigureAwait(false);
+            if (!ReferenceEquals(newChannel, currentChannel))
+            {
+                // The recreator owns disposal of the old channel; we don't dispose here.
+                return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, currentWorkerOwnedDisposable);
+            }
+
+            // Recreator returned the same instance — nothing to swap.
+            return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
+        }
+        catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+        {
+            throw;
+        }
+        catch (Exception ex)
+        {
+            // Don't crash the worker if recreate fails; just keep using the existing transport.
+            this.logger.UnexpectedError(ex, string.Empty);
+            return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
+        }
+    }
+
+    // Path 2: worker-owned channel created from Address. We can rebuild it ourselves.
+    if (this.grpcOptions.Channel is null
+        && this.grpcOptions.CallInvoker is null)
+    {
+        try
+        {
+            GrpcChannel newChannel = GetChannel(this.grpcOptions.Address);
+            AsyncDisposable newDisposable = new(() => new(newChannel.ShutdownAsync()));
+            return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, newDisposable);
+        }
+        catch (Exception ex)
+        {
+            this.logger.UnexpectedError(ex, string.Empty);
+            return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
+        }
+    }
+
+    // Path 3: caller-owned CallInvoker or externally-supplied Channel without a recreator.
+    // No safe way to recreate; let the inner loop continue trying on the existing transport.
+    return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
+}
+
+readonly struct ChannelRecreateResult
+{
+    public ChannelRecreateResult(bool recreated, CallInvoker? newCallInvoker, string? newAddress, AsyncDisposable newWorkerOwnedDisposable)
+    {
+        this.Recreated = recreated;
+        this.NewCallInvoker = newCallInvoker;
+        this.NewAddress = newAddress;
+        this.NewWorkerOwnedDisposable = newWorkerOwnedDisposable;
+    }
+
+    public bool Recreated { get; }
+
+    public CallInvoker? NewCallInvoker { get; }
+
+    public string? NewAddress { get; }
+
+    public AsyncDisposable NewWorkerOwnedDisposable { get; }
+
+    public static ChannelRecreateResult NotRecreated(AsyncDisposable currentDisposable)
+        => new(false, null, null, currentDisposable);
+}
 
 #if NET6_0_OR_GREATER
diff --git a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
index 52372f65d..4b18d8528 100644
--- a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
+++ b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs
@@ -97,5 +97,51 @@ internal class InternalOptions
     /// unlock events into the history when an orchestration terminates while holding an entity lock.
     /// </summary>
     public bool InsertEntityUnlocksOnCompletion { get; set; }
+
+        /// <summary>
+        /// Gets or sets the maximum amount of time to wait for the initial Hello handshake against the
+        /// backend before treating the connect attempt as failed and retrying. A non-positive value disables
+        /// the deadline. Defaults to 30 seconds. This guards against half-open HTTP/2 connections that can
+        /// otherwise cause reconnect to hang indefinitely.
+        /// </summary>
+        public TimeSpan HelloDeadline { get; set; } = TimeSpan.FromSeconds(30);
+
+        /// <summary>
+        /// Gets or sets the maximum amount of time the worker will wait between messages on an established
+        /// work-item stream before treating the channel as silently disconnected and forcing a reconnect.
+        /// The backend sends periodic health-ping work items expressly to keep this window alive when no
+        /// real work is flowing, so this value should be larger than the server's ping cadence to avoid
+        /// false positives. Defaults to 120 seconds. A non-positive value disables silent-disconnect detection.
+        /// </summary>
+        public TimeSpan SilentDisconnectTimeout { get; set; } = TimeSpan.FromSeconds(120);
+
+        /// <summary>
+        /// Gets or sets the number of consecutive connect failures (Hello timeouts, Unavailable responses, or
+        /// silent stream disconnects) after which the underlying gRPC channel will be recreated to clear
+        /// stale DNS, sub-channel state, or routing-affinity bindings. Setting to 0 or a negative value
+        /// disables channel recreation. Defaults to 5.
+        /// </summary>
+        public int ChannelRecreateFailureThreshold { get; set; } = 5;
+
+        /// <summary>
+        /// Gets or sets the base delay used when computing reconnect backoff with full jitter:
+        /// the actual delay is uniformly random in [0, min(cap, base * 2^attempt)]. Defaults to 1 second.
+        /// </summary>
+        public TimeSpan ReconnectBackoffBase { get; set; } = TimeSpan.FromSeconds(1);
+
+        /// <summary>
+        /// Gets or sets the maximum delay used when computing reconnect backoff with full jitter.
+        /// Defaults to 30 seconds.
+        /// </summary>
+        public TimeSpan ReconnectBackoffCap { get; set; } = TimeSpan.FromSeconds(30);
+
+        /// <summary>
+        /// Gets or sets an optional callback invoked when the worker requests a fresh gRPC channel after
+        /// repeated connect failures. The callback receives the previously-used channel and should return
+        /// either a freshly created channel or the currently cached channel if a peer worker has already
+        /// recreated it. Implementations are responsible for atomic swap and deferred disposal of the old
+        /// channel so in-flight RPCs from peer workers are not interrupted.
+        /// </summary>
+        public Func<GrpcChannel, CancellationToken, Task<GrpcChannel>>? ChannelRecreator { get; set; }
+    }
 }
diff --git a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs
index 0739c15c2..43d5d90f4 100644
--- a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs
+++ b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs
@@ -27,4 +27,27 @@ public static void ConfigureForAzureManaged(this GrpcDurableTaskWorkerOptions op
         options.Internal.ConvertOrchestrationEntityEvents = true;
         options.Internal.InsertEntityUnlocksOnCompletion = true;
     }
+
+    /// <summary>
+    /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated
+    /// after repeated connect failures (e.g., because the backend was replaced and the existing channel
+    /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last
+    /// observed and must return either a freshly created channel or the currently cached channel if a
+    /// peer worker has already swapped it. Implementations are responsible for atomic swap and deferred
+    /// disposal of the old channel so in-flight RPCs from peer workers are not interrupted.
+    /// </summary>
+    /// <param name="options">The gRPC worker options.</param>
+    /// <param name="recreator">The recreate callback.</param>
+    /// <remarks>
+    /// This is an internal API that supports the DurableTask infrastructure and not subject to
+    /// the same compatibility standards as public APIs. It may be changed or removed without notice in
+    /// any release. You should only use it directly in your code with extreme caution and knowing that
+    /// doing so can result in application failures when updating to a new DurableTask release.
+    /// </remarks>
+    public static void SetChannelRecreator(
+        this GrpcDurableTaskWorkerOptions options,
+        Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator)
+    {
+        options.Internal.ChannelRecreator = recreator ?? throw new ArgumentNullException(nameof(recreator));
+    }
 }
diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs
index b7d1f957e..e63c3c60e 100644
--- a/src/Worker/Grpc/Logs.cs
+++ b/src/Worker/Grpc/Logs.cs
@@ -52,6 +52,24 @@ static partial class Logs
     [LoggerMessage(EventId = 56, Level = LogLevel.Warning, Message = "Channel to backend has stopped receiving traffic, will attempt to reconnect.")]
     public static partial void ConnectionTimeout(this ILogger logger);
 
+    [LoggerMessage(EventId = 70, Level = LogLevel.Warning, Message = "Hello handshake to backend timed out after {timeoutSeconds}s. Will retry.")]
+    public static partial void HelloTimeout(this ILogger logger, int timeoutSeconds);
+
+    [LoggerMessage(EventId = 71, Level = LogLevel.Warning, Message = "Authentication failed when connecting to backend. Will retry.")]
+    public static partial void AuthenticationFailed(this ILogger logger, Exception ex);
+
+    [LoggerMessage(EventId = 72, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive connect failures.")]
+    public static partial void RecreatingChannel(this ILogger logger, int failureCount);
+
+    [LoggerMessage(EventId = 73, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")]
+    public static partial void ChannelRecreated(this ILogger logger, string endpoint);
+
+    [LoggerMessage(EventId = 74, Level = LogLevel.Debug, Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")]
+    public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs);
+
+    [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")]
+    public static partial void ReceivedHealthPing(this ILogger logger);
+
     [LoggerMessage(EventId = 57, Level = LogLevel.Warning, Message = "Orchestration version did not meet worker versioning requirements. Error action = '{errorAction}'. Version error = '{versionError}'")]
     public static partial void OrchestrationVersionFailure(this ILogger logger, string errorAction, string versionError);
diff --git a/src/Worker/Grpc/ProcessorExitReason.cs b/src/Worker/Grpc/ProcessorExitReason.cs
new file mode 100644
index 000000000..144ae4304
--- /dev/null
+++ b/src/Worker/Grpc/ProcessorExitReason.cs
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace Microsoft.DurableTask.Worker.Grpc;
+
+/// <summary>
+/// Indicates why <see cref="GrpcDurableTaskWorker.Processor.ExecuteAsync"/> returned to its caller.
+/// </summary>
+enum ProcessorExitReason
+{
+    /// <summary>
+    /// The processor exited because cancellation was requested (graceful shutdown).
+    /// </summary>
+    Shutdown,
+
+    /// <summary>
+    /// The processor exited because the underlying gRPC channel appears poisoned and should be recreated
+    /// before the next reconnect attempt. The caller is expected to obtain a fresh channel and rebuild the
+    /// processor.
+    /// </summary>
+    ChannelRecreateRequested,
+}
diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/ReconnectBackoff.cs
new file mode 100644
index 000000000..af7e65aad
--- /dev/null
+++ b/src/Worker/Grpc/ReconnectBackoff.cs
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace Microsoft.DurableTask.Worker.Grpc;
+
+/// <summary>
+/// Helpers for computing reconnect backoff delays in the gRPC worker.
+/// </summary>
+static class ReconnectBackoff
+{
+    /// <summary>
+    /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in
+    /// [0, min(cap, base * 2^attempt)]. Returns <see cref="TimeSpan.Zero"/> when the base delay
+    /// is non-positive.
+    /// </summary>
+    /// <param name="attempt">The retry attempt index, starting at 0.</param>
+    /// <param name="baseDelay">The base delay used for the exponential growth.</param>
+    /// <param name="cap">The maximum delay before jitter is applied.</param>
+    /// <param name="random">The random source used for jitter.</param>
+    /// <returns>The computed jittered delay.</returns>
+    public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Random random)
+    {
+        if (baseDelay <= TimeSpan.Zero)
+        {
+            return TimeSpan.Zero;
+        }
+
+        if (attempt < 0)
+        {
+            attempt = 0;
+        }
+
+        // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values.
+        int safeAttempt = Math.Min(attempt, 30);
+
+        double capMs = Math.Max(baseDelay.TotalMilliseconds, cap.TotalMilliseconds);
+        double exp = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt);
+        double bound = Math.Min(capMs, exp);
+        double jittered = random.NextDouble() * bound;
+        return TimeSpan.FromMilliseconds(jittered);
+    }
+}
diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs
new file mode 100644
index 000000000..1f36245df
--- /dev/null
+++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs
@@ -0,0 +1,62 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+using Microsoft.DurableTask.Worker.Grpc.Internal;
+
+namespace Microsoft.DurableTask.Worker.Grpc.Tests;
+
+public class GrpcDurableTaskWorkerOptionsInternalTests
+{
+    [Fact]
+    public void InternalOptions_HasSafeDefaults()
+    {
+        // Arrange
+        GrpcDurableTaskWorkerOptions options = new();
+
+        // Act
+        GrpcDurableTaskWorkerOptions.InternalOptions internalOptions = options.Internal;
+
+        // Assert
+        internalOptions.HelloDeadline.Should().Be(TimeSpan.FromSeconds(30));
+        internalOptions.ChannelRecreateFailureThreshold.Should().Be(5);
+        internalOptions.ReconnectBackoffBase.Should().Be(TimeSpan.FromSeconds(1));
+        internalOptions.ReconnectBackoffCap.Should().Be(TimeSpan.FromSeconds(30));
+        internalOptions.ChannelRecreator.Should().BeNull();
+    }
+
+    [Fact]
+    public void SetChannelRecreator_NullCallback_Throws()
+    {
+        // Arrange
+        GrpcDurableTaskWorkerOptions options = new();
+
+        // Act
+        Action act = () => options.SetChannelRecreator(null!);
+
+        // Assert
+        act.Should().Throw<ArgumentNullException>();
+    }
+
+    [Fact]
+    public void SetChannelRecreator_StoresCallbackOnInternalOptions()
+    {
+        // Arrange
+        GrpcDurableTaskWorkerOptions options = new();
+        bool invoked = false;
+        Func<GrpcChannel, CancellationToken, Task<GrpcChannel>> recreator = (channel, ct) =>
+        {
+            invoked = true;
+            return Task.FromResult(channel);
+        };
+
+        // Act
+        options.SetChannelRecreator(recreator);
+
+        // Assert
+        options.Internal.ChannelRecreator.Should().BeSameAs(recreator);
+
+        // Sanity-check that invoking the stored delegate calls the original.
+        options.Internal.ChannelRecreator!.Invoke(null!, CancellationToken.None);
+        invoked.Should().BeTrue();
+    }
+}
diff --git a/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs b/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs
new file mode 100644
index 000000000..eb5cd1877
--- /dev/null
+++ b/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs
@@ -0,0 +1,114 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+namespace Microsoft.DurableTask.Worker.Grpc.Tests;
+
+public class ReconnectBackoffTests
+{
+    [Fact]
+    public void Compute_ZeroBase_ReturnsZero()
+    {
+        // Arrange
+        Random random = new(42);
+
+        // Act
+        TimeSpan delay = ReconnectBackoff.Compute(attempt: 5, baseDelay: TimeSpan.Zero, cap: TimeSpan.FromSeconds(30), random);
+
+        // Assert
+        delay.Should().Be(TimeSpan.Zero);
+    }
+
+    [Fact]
+    public void Compute_NegativeBase_ReturnsZero()
+    {
+        // Arrange
+        Random random = new(42);
+
+        // Act
+        TimeSpan delay = ReconnectBackoff.Compute(attempt: 0, baseDelay: TimeSpan.FromMilliseconds(-100), cap: TimeSpan.FromSeconds(30), random);
+
+        // Assert
+        delay.Should().Be(TimeSpan.Zero);
+    }
+
+    [Fact]
+    public void Compute_NeverExceedsCap()
+    {
+        // Arrange
+        TimeSpan cap = TimeSpan.FromSeconds(30);
+        TimeSpan baseDelay = TimeSpan.FromSeconds(1);
+        Random random = new(1);
+
+        // Act + Assert: try a wide range of attempts, including pathological values.
+        for (int attempt = 0; attempt < 50; attempt++)
+        {
+            TimeSpan delay = ReconnectBackoff.Compute(attempt, baseDelay, cap, random);
+            delay.Should().BeLessThanOrEqualTo(cap, $"attempt {attempt} produced {delay}");
+            delay.Should().BeGreaterThanOrEqualTo(TimeSpan.Zero);
+        }
+    }
+
+    [Fact]
+    public void Compute_GrowsExponentiallyUntilCap()
+    {
+        // Arrange: a Random that always returns a value just under 1.0 forces the upper bound of the jitter window.
+ DeterministicRandom random = new(value: 0.999999); + TimeSpan baseDelay = TimeSpan.FromSeconds(1); + TimeSpan cap = TimeSpan.FromSeconds(30); + + // Act + double d0 = ReconnectBackoff.Compute(0, baseDelay, cap, random).TotalMilliseconds; + double d1 = ReconnectBackoff.Compute(1, baseDelay, cap, random).TotalMilliseconds; + double d2 = ReconnectBackoff.Compute(2, baseDelay, cap, random).TotalMilliseconds; + double d3 = ReconnectBackoff.Compute(3, baseDelay, cap, random).TotalMilliseconds; + double d10 = ReconnectBackoff.Compute(10, baseDelay, cap, random).TotalMilliseconds; + + // Assert: roughly doubles each step until cap is reached. + d0.Should().BeApproximately(1000, 1); + d1.Should().BeApproximately(2000, 1); + d2.Should().BeApproximately(4000, 1); + d3.Should().BeApproximately(8000, 1); + d10.Should().BeApproximately(30000, 1, "should be clamped at the cap"); + } + + [Fact] + public void Compute_WithFullJitter_StaysWithinBounds() + { + // Arrange: with random=0 the result is 0; with random=1 the result is the bound. + TimeSpan baseDelay = TimeSpan.FromSeconds(1); + TimeSpan cap = TimeSpan.FromSeconds(30); + + // Act + Assert: random=0 → 0 + TimeSpan low = ReconnectBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.0)); + low.TotalMilliseconds.Should().BeApproximately(0, 0.5); + + // random ~1 → bound (= 8s for attempt=3, base=1s) + TimeSpan high = ReconnectBackoff.Compute(3, baseDelay, cap, new DeterministicRandom(0.999999)); + high.TotalMilliseconds.Should().BeApproximately(8000, 1); + } + + [Fact] + public void Compute_NegativeAttempt_TreatedAsZero() + { + // Arrange + DeterministicRandom random = new(0.999999); + + // Act + TimeSpan delay = ReconnectBackoff.Compute(attempt: -5, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(30), random); + + // Assert + delay.TotalMilliseconds.Should().BeApproximately(1000, 1); + } + + sealed class DeterministicRandom : Random + { + readonly double value; + + public DeterministicRandom(double value) + { + this.value = value; + } + + public override double NextDouble() => this.value; + } +} From 71bca4c5a54bca2ae57c1767295cb469f770973d Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 13:00:30 -0700 Subject: [PATCH 02/27] Address CodeQL feedback: drop ReferenceEquals on struct, narrow generic catches Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index 1905a5146..ce5b10557 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -82,12 +82,11 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; this.logger.ChannelRecreated(address); - // Dispose the prior worker-owned channel (if any) AFTER swapping in the new one. - // For caller-supplied channels this is a no-op (default AsyncDisposable). - if (!ReferenceEquals(previousDisposable, workerOwnedChannelDisposable)) - { - await previousDisposable.DisposeAsync(); - } + // Dispose the prior worker-owned channel (if any). For Path 1 (caller-supplied recreator) + // and Path 3 (caller-owned), the previous disposable is a default AsyncDisposable whose + // DisposeAsync is a no-op, so this is always safe. 
We do not use ReferenceEquals here
+                // because AsyncDisposable is a value type and reference comparison is meaningless.
+                await previousDisposable.DisposeAsync();
             }
 
             // If we couldn't recreate (e.g., caller-owned CallInvoker), fall through and retry on the
@@ -124,7 +123,7 @@ async Task<ChannelRecreateResult> TryRecreateChannelAsync(
             {
                 throw;
             }
-            catch (Exception ex)
+            catch (Exception ex) when (!IsFatal(ex))
             {
                 // Don't crash the worker if recreate fails; just keep using the existing transport.
                 this.logger.UnexpectedError(ex, string.Empty);
@@ -142,7 +141,11 @@ async Task<ChannelRecreateResult> TryRecreateChannelAsync(
                 AsyncDisposable newDisposable = new(() => new(newChannel.ShutdownAsync()));
                 return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, newDisposable);
             }
-            catch (Exception ex)
+            catch (OperationCanceledException) when (cancellation.IsCancellationRequested)
+            {
+                throw;
+            }
+            catch (Exception ex) when (!IsFatal(ex))
             {
                 this.logger.UnexpectedError(ex, string.Empty);
                 return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
@@ -152,6 +155,11 @@ async Task<ChannelRecreateResult> TryRecreateChannelAsync(
         // Path 3: caller-owned CallInvoker or externally-supplied Channel without a recreator.
         // No safe way to recreate; let the inner loop continue trying on the existing transport.
         return ChannelRecreateResult.NotRecreated(currentWorkerOwnedDisposable);
+
+        static bool IsFatal(Exception ex) => ex is OutOfMemoryException
+            or StackOverflowException
+            or AccessViolationException
+            or ThreadAbortException;
     }
 
     readonly struct ChannelRecreateResult

From 65336586023c78ab0ebd976661a7387fe7468ef1 Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Tue, 21 Apr 2026 13:05:02 -0700
Subject: [PATCH 03/27] Sort new log event additions by EventId in Worker and
 Client Logs.cs

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/Client/Grpc/Logs.cs | 18 +++++++++---------
 src/Worker/Grpc/Logs.cs | 36 ++++++++++++++++++------------------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/src/Client/Grpc/Logs.cs b/src/Client/Grpc/Logs.cs
index d57b015c5..7eec022a4 100644
--- a/src/Client/Grpc/Logs.cs
+++ b/src/Client/Grpc/Logs.cs
@@ -39,15 +39,6 @@ static partial class Logs
     [LoggerMessage(EventId = 46, Level = LogLevel.Information, Message = "Purging instances with filter: {{ CreatedFrom = {createdFrom}, CreatedTo = {createdTo}, Statuses = {statuses} }}")]
     public static partial void PurgingInstances(this ILogger logger, DateTimeOffset? createdFrom, DateTimeOffset? createdTo, string? statuses);
 
-    [LoggerMessage(EventId = 80, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive transport failures.")]
-    public static partial void RecreatingChannel(this ILogger logger, int failureCount);
-
-    [LoggerMessage(EventId = 81, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")]
-    public static partial void ChannelRecreated(this ILogger logger, string endpoint);
-
-    [LoggerMessage(EventId = 82, Level = LogLevel.Warning, Message = "gRPC channel recreation failed.")]
-    public static partial void ChannelRecreateFailed(this ILogger logger, Exception exception);
-
     /// <summary>
     /// .
     /// </summary>
@@ -58,5 +49,14 @@ public static void PurgingInstances(this ILogger logger, PurgeInstancesFilter fi
         string? statuses = filter?.Statuses is null ?
null : string.Join("|", filter.Statuses); PurgingInstances(logger, filter?.CreatedFrom, filter?.CreatedTo, statuses); } + + [LoggerMessage(EventId = 80, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive transport failures.")] + public static partial void RecreatingChannel(this ILogger logger, int failureCount); + + [LoggerMessage(EventId = 81, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")] + public static partial void ChannelRecreated(this ILogger logger, string endpoint); + + [LoggerMessage(EventId = 82, Level = LogLevel.Warning, Message = "gRPC channel recreation failed.")] + public static partial void ChannelRecreateFailed(this ILogger logger, Exception exception); } } diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs index e63c3c60e..bafa4a334 100644 --- a/src/Worker/Grpc/Logs.cs +++ b/src/Worker/Grpc/Logs.cs @@ -52,24 +52,6 @@ static partial class Logs [LoggerMessage(EventId = 56, Level = LogLevel.Warning, Message = "Channel to backend has stopped receiving traffic, will attempt to reconnect.")] public static partial void ConnectionTimeout(this ILogger logger); - [LoggerMessage(EventId = 70, Level = LogLevel.Warning, Message = "Hello handshake to backend timed out after {timeoutSeconds}s. Will retry.")] - public static partial void HelloTimeout(this ILogger logger, int timeoutSeconds); - - [LoggerMessage(EventId = 71, Level = LogLevel.Warning, Message = "Authentication failed when connecting to backend. Will retry.")] - public static partial void AuthenticationFailed(this ILogger logger, Exception ex); - - [LoggerMessage(EventId = 72, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive connect failures.")] - public static partial void RecreatingChannel(this ILogger logger, int failureCount); - - [LoggerMessage(EventId = 73, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")] - public static partial void ChannelRecreated(this ILogger logger, string endpoint); - - [LoggerMessage(EventId = 74, Level = LogLevel.Debug, Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")] - public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs); - - [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")] - public static partial void ReceivedHealthPing(this ILogger logger); - [LoggerMessage(EventId = 57, Level = LogLevel.Warning, Message = "Orchestration version did not meet worker versioning requirements. Error action = '{errorAction}'. Version error = '{versionError}'")] public static partial void OrchestrationVersionFailure(this ILogger logger, string errorAction, string versionError); @@ -97,5 +79,23 @@ static partial class Logs [LoggerMessage(EventId = 65, Level = LogLevel.Information, Message = "{instanceId}: Abandoned entity work item. Completion token = '{completionToken}'")] public static partial void AbandonedEntityWorkItem(this ILogger logger, string instanceId, string completionToken); + + [LoggerMessage(EventId = 70, Level = LogLevel.Warning, Message = "Hello handshake to backend timed out after {timeoutSeconds}s. Will retry.")] + public static partial void HelloTimeout(this ILogger logger, int timeoutSeconds); + + [LoggerMessage(EventId = 71, Level = LogLevel.Warning, Message = "Authentication failed when connecting to backend. 
Will retry.")] + public static partial void AuthenticationFailed(this ILogger logger, Exception ex); + + [LoggerMessage(EventId = 72, Level = LogLevel.Warning, Message = "Recreating gRPC channel to backend after {failureCount} consecutive connect failures.")] + public static partial void RecreatingChannel(this ILogger logger, int failureCount); + + [LoggerMessage(EventId = 73, Level = LogLevel.Information, Message = "gRPC channel to backend has been recreated. New target: {endpoint}.")] + public static partial void ChannelRecreated(this ILogger logger, string endpoint); + + [LoggerMessage(EventId = 74, Level = LogLevel.Debug, Message = "Reconnect attempt {attempt} will retry after {delayMs} ms.")] + public static partial void ReconnectBackoff(this ILogger logger, int attempt, int delayMs); + + [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")] + public static partial void ReceivedHealthPing(this ILogger logger); } } From 665ba0e09057f8ab9d0f1375845965a4c84ab483 Mon Sep 17 00:00:00 2001 From: Copilot <223556219+Copilot@users.noreply.github.com> Date: Tue, 21 Apr 2026 13:07:36 -0700 Subject: [PATCH 04/27] Address Copilot review: clamp backoff cap, cover SilentDisconnectTimeout default Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Worker/Grpc/ReconnectBackoff.cs | 2 +- ...pcDurableTaskWorkerOptionsInternalTests.cs | 1 + .../Grpc.Tests/ReconnectBackoffTests.cs | 31 +++++++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/ReconnectBackoff.cs index af7e65aad..31f3318af 100644 --- a/src/Worker/Grpc/ReconnectBackoff.cs +++ b/src/Worker/Grpc/ReconnectBackoff.cs @@ -33,7 +33,7 @@ public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Ra // Cap the exponent to avoid overflow in 2^attempt for pathological attempt values. int safeAttempt = Math.Min(attempt, 30); - double capMs = Math.Max(baseDelay.TotalMilliseconds, cap.TotalMilliseconds); + double capMs = Math.Max(0, cap.TotalMilliseconds); double exp = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); double bound = Math.Min(capMs, exp); double jittered = random.NextDouble() * bound; diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs index 1f36245df..5813bc683 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerOptionsInternalTests.cs @@ -21,6 +21,7 @@ public void InternalOptions_HasSafeDefaults() internalOptions.ChannelRecreateFailureThreshold.Should().Be(5); internalOptions.ReconnectBackoffBase.Should().Be(TimeSpan.FromSeconds(1)); internalOptions.ReconnectBackoffCap.Should().Be(TimeSpan.FromSeconds(30)); + internalOptions.SilentDisconnectTimeout.Should().Be(TimeSpan.FromSeconds(120)); internalOptions.ChannelRecreator.Should().BeNull(); } diff --git a/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs b/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs index eb5cd1877..024f179eb 100644 --- a/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs +++ b/test/Worker/Grpc.Tests/ReconnectBackoffTests.cs @@ -100,6 +100,37 @@ public void Compute_NegativeAttempt_TreatedAsZero() delay.TotalMilliseconds.Should().BeApproximately(1000, 1); } + [Fact] + public void Compute_CapSmallerThanBase_ClampsToCap() + { + // Arrange: cap is intentionally smaller than baseDelay; the cap must still be honored. 
+        DeterministicRandom random = new(0.999999);
+        TimeSpan baseDelay = TimeSpan.FromSeconds(5);
+        TimeSpan cap = TimeSpan.FromSeconds(1);
+
+        // Act
+        TimeSpan delay = ReconnectBackoff.Compute(attempt: 3, baseDelay, cap, random);
+
+        // Assert: with random ~ 1 the result is the bound, which must equal the cap.
+        delay.TotalMilliseconds.Should().BeApproximately(1000, 1);
+        delay.Should().BeLessThanOrEqualTo(cap);
+    }
+
+    [Fact]
+    public void Compute_NonPositiveCap_ReturnsZero()
+    {
+        // Arrange
+        DeterministicRandom random = new(0.999999);
+
+        // Act
+        TimeSpan zero = ReconnectBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.Zero, random);
+        TimeSpan negative = ReconnectBackoff.Compute(attempt: 3, baseDelay: TimeSpan.FromSeconds(1), cap: TimeSpan.FromSeconds(-1), random);
+
+        // Assert
+        zero.Should().Be(TimeSpan.Zero);
+        negative.Should().Be(TimeSpan.Zero);
+    }
+
     sealed class DeterministicRandom : Random
     {
         readonly double value;

From 4d1fbbeb1b794bab9a8917064137d9f3437a662d Mon Sep 17 00:00:00 2001
From: Copilot <223556219+Copilot@users.noreply.github.com>
Date: Tue, 21 Apr 2026 13:32:18 -0700
Subject: [PATCH 05/27] Reconnect after graceful stream-end (GOAWAY) and thread-safe state swap

Worker: ProcessWorkItemsAsync no longer wraps the await foreach in an outer
loop. IAsyncStreamReader<T> is single-use, so on a graceful server close
(HTTP/2 GOAWAY + OK trailers during a DTS rolling upgrade) the foreach exited
cleanly and the outer while re-entered an already-exhausted reader,
tight-spinning until the 120s silent-disconnect timer eventually fired and the
resulting OCE surfaced as a generic UnexpectedError.

New flow: after the foreach exits, explicitly distinguish shutdown, timeout,
and peer-initiated close. Peer-initiated close is logged as StreamEndedByPeer
(EventId 76, Information) and drives the channel-poisoned counter so repeated
drains escalate to recreate.

Client: ChannelRecreatingCallInvoker now reads/writes the state field with
Volatile.Read/Volatile.Write so call-site threads observe the post-recreate
(channel, invoker) pair without torn state. When the invoker owns the channel
(self-Address path), RecreateAsync now schedules deferred disposal of the old
channel so repeated recreates do not leak HTTP handlers / sockets on NET6+.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../Grpc/ChannelRecreatingCallInvoker.cs      |  62 ++++++-
 .../Grpc/GrpcDurableTaskWorker.Processor.cs   | 166 ++++++++++--------
 src/Worker/Grpc/Logs.cs                       |   3 +
 3 files changed, 146 insertions(+), 85 deletions(-)

diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs
index 8cd4656ee..ef42e6196 100644
--- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs
+++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs
@@ -38,6 +38,11 @@ sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable
     readonly bool ownsChannel;
     readonly ILogger logger;

+    // Accessed from call-site threads (reads) and the background recreate task (writes).
+    // Read/written with Volatile.Read / Volatile.Write to prevent torn reads and to publish
+    // the new reference so that callers on other threads observe it without additional
+    // synchronization. The TransportState itself is immutable so readers see a consistent
+    // (Channel, Invoker) pair once Volatile.Read returns.
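+    //
+    // A reader-side sketch of the invariant (illustrative, mirroring BlockingUnaryCall below):
+    //
+    //     TransportState current = Volatile.Read(ref this.state);
+    //     current.Invoker.BlockingUnaryCall(method, host, options, request);
+    //
+    // Channel and Invoker always come from the same snapshot; callers never read this.state
+    // twice for a single RPC.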
TransportState state; int consecutiveFailures; int recreateInFlight; @@ -65,7 +70,7 @@ public ChannelRecreatingCallInvoker( public override TResponse BlockingUnaryCall( Method method, string? host, CallOptions options, TRequest request) { - TransportState current = this.state; + TransportState current = Volatile.Read(ref this.state); try { TResponse response = current.Invoker.BlockingUnaryCall(method, host, options, request); @@ -82,7 +87,7 @@ public override TResponse BlockingUnaryCall( public override AsyncUnaryCall AsyncUnaryCall( Method method, string? host, CallOptions options, TRequest request) { - TransportState current = this.state; + TransportState current = Volatile.Read(ref this.state); AsyncUnaryCall call = current.Invoker.AsyncUnaryCall(method, host, options, request); this.ObserveOutcome(call.ResponseAsync, method.FullName); return call; @@ -94,19 +99,19 @@ public override AsyncServerStreamingCall AsyncServerStreamingCall AsyncClientStreamingCall( Method method, string? host, CallOptions options) { - return this.state.Invoker.AsyncClientStreamingCall(method, host, options); + return Volatile.Read(ref this.state).Invoker.AsyncClientStreamingCall(method, host, options); } public override AsyncDuplexStreamingCall AsyncDuplexStreamingCall( Method method, string? host, CallOptions options) { - return this.state.Invoker.AsyncDuplexStreamingCall(method, host, options); + return Volatile.Read(ref this.state).Invoker.AsyncDuplexStreamingCall(method, host, options); } public async ValueTask DisposeAsync() @@ -116,7 +121,7 @@ public async ValueTask DisposeAsync() return; } - TransportState current = this.state; + TransportState current = Volatile.Read(ref this.state); try { #if NET6_0_OR_GREATER @@ -217,14 +222,23 @@ async Task RecreateAsync(int observedCount) { try { - TransportState current = this.state; + TransportState current = Volatile.Read(ref this.state); using CancellationTokenSource cts = new(TimeSpan.FromSeconds(30)); GrpcChannel newChannel = await this.recreator(current.Channel, cts.Token).ConfigureAwait(false); if (!ReferenceEquals(newChannel, current.Channel)) { - this.state = new TransportState(newChannel, newChannel.CreateCallInvoker()); + Volatile.Write(ref this.state, new TransportState(newChannel, newChannel.CreateCallInvoker())); this.logger.ChannelRecreated(GetEndpointDescription(newChannel)); + + // When we own the channel, no external party is responsible for tearing down the old + // one. Defer disposal briefly so any in-flight RPCs issued against the previous + // CallInvoker before the swap can still complete (they already captured the old + // TransportState before Volatile.Write). + if (this.ownsChannel) + { + _ = ScheduleDeferredDisposeAsync(current.Channel); + } } // Successful recreate (even if a peer beat us to it) → reset the failure counter. @@ -246,6 +260,38 @@ and not StackOverflowException } } + static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) + { + try + { + // Grace period to let in-flight RPCs captured against the old invoker drain before we + // tear down the channel's HTTP handler / sockets. + await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false); + try + { + await channel.ShutdownAsync().ConfigureAwait(false); + } + catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException) + { + // Expected during process shutdown; nothing to do. 
+ } +#if NET6_0_OR_GREATER + channel.Dispose(); +#endif + } + catch (Exception ex) when (ex is not OutOfMemoryException + and not StackOverflowException + and not ThreadAbortException) + { + if (ex is not OperationCanceledException and not ObjectDisposedException) + { + Trace.TraceError( + "Unexpected exception while deferred-disposing gRPC channel in ChannelRecreatingCallInvoker.ScheduleDeferredDisposeAsync: {0}", + ex); + } + } + } + static string GetEndpointDescription(GrpcChannel channel) { #if NET6_0_OR_GREATER diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index c07157aef..aefffcddf 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -340,91 +340,103 @@ async Task ProcessWorkItemsAsync( bool firstMessageObserved = false; - while (!cancellation.IsCancellationRequested) + // NOTE: we deliberately do NOT wrap the await foreach in an outer loop. The underlying + // IAsyncStreamReader is single-use — once the server terminates the stream (e.g. via a + // graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext returns + // false forever and re-entering await foreach would tight-spin with no yield until the + // silent-disconnect timer eventually fires. See the end-of-stream handling below for + // the behavior we want in that case. + await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) { - await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) + if (silentDisconnectEnabled) { - if (silentDisconnectEnabled) - { - timeoutSource.CancelAfter(silentDisconnectTimeout); - } - - if (!firstMessageObserved) - { - firstMessageObserved = true; - onFirstMessage?.Invoke(); - } - - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset above is the actionable behavior. Logging at Trace allows operators - // to confirm liveness without flooding info-level telemetry. 
- this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } + timeoutSource.CancelAfter(silentDisconnectTimeout); } - if (tokenSource.IsCancellationRequested || tokenSource.Token.IsCancellationRequested) + if (!firstMessageObserved) { - // The token has cancelled, this means either: - // 1. The broader 'cancellation' was triggered, return here to start a graceful shutdown. - // 2. The timeoutSource was triggered, return here to trigger a reconnect to the backend. - if (!cancellation.IsCancellationRequested) - { - // Since the cancellation came from the timeout, log a warning. - this.Logger.ConnectionTimeout(); - onSilentDisconnect?.Invoke(); - } + firstMessageObserved = true; + onFirstMessage?.Invoke(); + } - return; + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) + { + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) + { + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); + } + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) + { + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset above is the actionable behavior. Logging at Trace allows operators + // to confirm liveness without flooding info-level telemetry. + this.Logger.ReceivedHealthPing(); + } + else + { + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); } } + + if (cancellation.IsCancellationRequested) + { + // Worker is shutting down - exit gracefully. + return; + } + + if (tokenSource.Token.IsCancellationRequested) + { + // Our silent-disconnect timer fired: the stream stopped producing messages (including + // health pings) for longer than the configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onSilentDisconnect?.Invoke(); + return; + } + + // The stream terminated cleanly (no exception, no cancellation). This is the canonical + // signal sent by the backend during a graceful drain (HTTP/2 GOAWAY with OK trailers + // when a DTS instance is being replaced). We explicitly log and count this toward the + // channel-poisoned threshold: although a single drain is benign, repeated drains on + // the same cached channel usually mean the channel is latched onto a dead/evacuated + // backend and needs to be recreated to pick up fresh DNS/routing. + this.Logger.StreamEndedByPeer(); + onSilentDisconnect?.Invoke(); } void RunBackgroundTask(P.WorkItem? 
workItem, Func handler, CancellationToken cancellation) diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs index bafa4a334..f2b6c31df 100644 --- a/src/Worker/Grpc/Logs.cs +++ b/src/Worker/Grpc/Logs.cs @@ -97,5 +97,8 @@ static partial class Logs [LoggerMessage(EventId = 75, Level = LogLevel.Trace, Message = "Received health ping from the backend.")] public static partial void ReceivedHealthPing(this ILogger logger); + + [LoggerMessage(EventId = 76, Level = LogLevel.Information, Message = "Work-item stream ended by the backend (graceful close). Will reconnect.")] + public static partial void StreamEndedByPeer(this ILogger logger); } } From e0fc2e84b7659ccf5f1298a80c8098be72dc8737 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Tue, 21 Apr 2026 15:10:42 -0700 Subject: [PATCH 06/27] Fix critical bugs surfaced by code review C1: Wrap the worker stream-reader await foreach in a try/catch so that grpc-dotnet's RpcException(Cancelled) raised when the silent-disconnect timer fires is routed to the silent-disconnect handler. Without this catch the exception unwound to the outer Cancelled handler that explicitly does not poison the channel, leaving the silent-disconnect -> recreate path effectively dead. C2: Switch all Lazy instances in the AzureManaged worker and client extensions to LazyThreadSafetyMode.PublicationOnly so a transient CreateChannel failure does not permanently poison the cache slot for the lifetime of the process. I1: Track the most recently observed channel inside the worker's ExecuteAsync and pass it to TryRecreateChannelAsync instead of the never-updated this.grpcOptions.Channel field. The stale field caused the recreator's 'peer already swapped' branch to be skipped on every cycle. I3+I4: Add a class-level disposalCts to ChannelRecreatingCallInvoker that is cancelled in DisposeAsync and linked into the recreator's CancellationToken. The recreate task now re-checks the disposed flag before publishing a freshly created channel, and shuts the new channel down if it loses the race; this prevents leaking a freshly created channel after disposal and lets DisposeAsync abort an in-flight recreate. I5: Only treat a graceful stream-end as a poison signal when no work-item messages were observed on the stream. A drained stream that successfully delivered work is healthy backend rolling-upgrade behavior; counting it would let long-lived processes accumulate spurious poison credits. I6: Reset the consecutive-failure and reconnect-attempt counters on Unauthenticated. A status reply is proof the underlying transport is healthy, so prior transport failures should not combine with a later transient blip to trip the channel recreate. 
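Illustrative shape of the C1 catch arm (condensed from the
GrpcDurableTaskWorker.Processor.cs hunk below; not the full code):

    catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled
        && timeoutSource.IsCancellationRequested
        && !cancellation.IsCancellationRequested)
    {
        // Timer-driven cancellation, not worker shutdown: fall through to the
        // silent-disconnect branch so the channel-recreate counter advances.
    }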
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 8 ++- .../Grpc/ChannelRecreatingCallInvoker.cs | 64 ++++++++++++++++++- .../DurableTaskSchedulerWorkerExtensions.cs | 11 ++-- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 50 ++++++++++++--- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 27 ++++++-- 5 files changed, 137 insertions(+), 23 deletions(-) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index 0596b427c..efd4cf61e 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -4,6 +4,7 @@ using System.Collections.Concurrent; using System.Diagnostics; using System.Linq; +using System.Threading; using Azure.Core; using Grpc.Net.Client; using Microsoft.DurableTask.Client.Grpc; @@ -157,7 +158,7 @@ public void Configure(string? name, GrpcDurableTaskClientOptions options) string cacheKey = $"{optionsName}\u001F{source.EndpointAddress}\u001F{source.TaskHubName}\u001F{source.ResourceId}\u001F{credentialType}\u001F{source.AllowInsecureCredentials}\u001F{retryOptionsKey}"; options.Channel = this.channels.GetOrAdd( cacheKey, - _ => new Lazy(source.CreateChannel)).Value; + _ => new Lazy(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly)).Value; options.SetChannelRecreator((oldChannel, ct) => this.RecreateChannelAsync(cacheKey, source, oldChannel, ct)); } @@ -181,7 +182,8 @@ async Task RecreateChannelAsync( if (!this.channels.TryGetValue(cacheKey, out Lazy? currentLazy)) { - Lazy created = new(source.CreateChannel); + // PublicationOnly avoids permanently caching a transient CreateChannel exception. + Lazy created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (this.channels.TryAdd(cacheKey, created)) { return created.Value; @@ -201,7 +203,7 @@ async Task RecreateChannelAsync( return currentLazy.Value; } - Lazy newLazy = new(source.CreateChannel); + Lazy newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { this.channels.TryGetValue(cacheKey, out Lazy? winner); diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 8cd4656ee..ef42e6196 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -38,6 +38,10 @@ sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable readonly bool ownsChannel; readonly ILogger logger; + // Cancelled in DisposeAsync so an in-flight RecreateAsync stops promptly and does not leak the + // freshly created channel back into our state after we've disposed. + readonly CancellationTokenSource disposalCts = new(); + // Accessed from call-site threads (reads) and the background recreate task (writes). 
// Read/written with Volatile.Read / Volatile.Write to prevent torn reads and to publish // the new reference so that callers on other threads observe it without additional @@ -47,6 +51,7 @@ sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable int consecutiveFailures; int recreateInFlight; long lastRecreateTicks; + int disposed; public ChannelRecreatingCallInvoker( GrpcChannel initialChannel, @@ -116,8 +121,26 @@ public override AsyncDuplexStreamingCall AsyncDuplexStreami public async ValueTask DisposeAsync() { + if (Interlocked.Exchange(ref this.disposed, 1) != 0) + { + return; + } + + // Signal any in-flight RecreateAsync to abort. We do this BEFORE shutting down the channel so + // the recreator's cancellation token is observed and the recreate task does not race to + // Volatile.Write a freshly created channel into our state after we've torn it down. + try + { + this.disposalCts.Cancel(); + } + catch (ObjectDisposedException) + { + // Already disposed by a racing caller; nothing more to do for cancellation. + } + if (!this.ownsChannel) { + this.disposalCts.Dispose(); return; } @@ -135,6 +158,10 @@ public async ValueTask DisposeAsync() { // Best-effort disposal. } + finally + { + this.disposalCts.Dispose(); + } } static long StopwatchTicksFor(TimeSpan ts) => @@ -222,12 +249,43 @@ async Task RecreateAsync(int observedCount) { try { + if (Volatile.Read(ref this.disposed) != 0) + { + return; + } + TransportState current = Volatile.Read(ref this.state); - using CancellationTokenSource cts = new(TimeSpan.FromSeconds(30)); + + // Link to the disposal CTS so DisposeAsync can promptly abort an in-flight recreate. + // The 30s timeout keeps a wedged recreator from holding the single-flight slot indefinitely. + using CancellationTokenSource cts = CancellationTokenSource.CreateLinkedTokenSource(this.disposalCts.Token); + cts.CancelAfter(TimeSpan.FromSeconds(30)); GrpcChannel newChannel = await this.recreator(current.Channel, cts.Token).ConfigureAwait(false); if (!ReferenceEquals(newChannel, current.Channel)) { + // Re-check disposal before publishing the new channel into state. Otherwise we could + // race with DisposeAsync and leak the new channel (its socket handlers + DNS resolver + // would never be torn down). + if (Volatile.Read(ref this.disposed) != 0) + { + if (this.ownsChannel) + { + try + { + await newChannel.ShutdownAsync().ConfigureAwait(false); + } + catch (Exception shutdownEx) when (shutdownEx is not OutOfMemoryException + and not StackOverflowException + and not ThreadAbortException) + { + // Best-effort cleanup. + } + } + + return; + } + Volatile.Write(ref this.state, new TransportState(newChannel, newChannel.CreateCallInvoker())); this.logger.ChannelRecreated(GetEndpointDescription(newChannel)); @@ -245,6 +303,10 @@ async Task RecreateAsync(int observedCount) Volatile.Write(ref this.consecutiveFailures, 0); Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp()); } + catch (OperationCanceledException) when (Volatile.Read(ref this.disposed) != 0) + { + // We were disposed mid-recreate; nothing to log. 
+ } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException and not ThreadAbortException) diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index ecf3bf754..ef6a678a6 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -4,6 +4,7 @@ using System.Collections.Concurrent; using System.Diagnostics; using System.Linq; +using System.Threading; using Azure.Core; using Grpc.Net.Client; using Microsoft.DurableTask.Worker.Grpc; @@ -155,7 +156,7 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) string cacheKey = $"{optionsName}\u001F{source.EndpointAddress}\u001F{source.TaskHubName}\u001F{source.ResourceId}\u001F{credentialType}\u001F{source.AllowInsecureCredentials}\u001F{source.WorkerId}"; options.Channel = this.channels.GetOrAdd( cacheKey, - _ => new Lazy(source.CreateChannel)).Value; + _ => new Lazy(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly)).Value; options.SetChannelRecreator((oldChannel, ct) => this.RecreateChannelAsync(cacheKey, source, oldChannel, ct)); options.ConfigureForAzureManaged(); } @@ -177,8 +178,10 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul // CAS swap: only replace if the cached lazy still holds the channel the caller observed. if (!this.channels.TryGetValue(cacheKey, out Lazy? currentLazy)) { - // No entry — create one and return it. - Lazy created = new(source.CreateChannel); + // No entry — create one and return it. PublicationOnly ensures a transient + // CreateChannel failure doesn't permanently poison the cache slot (the default + // ExecutionAndPublication mode caches exceptions for the lifetime of the Lazy). + Lazy created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (this.channels.TryAdd(cacheKey, created)) { return created.Value; @@ -199,7 +202,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul return currentLazy.Value; } - Lazy newLazy = new(source.CreateChannel); + Lazy newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { // Lost the race; whoever won has the freshest entry. diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index aefffcddf..ea2fc7cdc 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -107,8 +107,12 @@ await this.ProcessWorkItemsAsync( catch (RpcException ex) when (ex.StatusCode == StatusCode.Unauthenticated) { // Auth rejection — log distinctly so it's diagnosable. Do not count toward channel - // recreate: a fresh channel won't fix bad credentials. + // recreate: a fresh channel won't fix bad credentials. Reset the consecutive-failure + // counters: a status reply is proof the transport itself is healthy, so prior + // transport failures should not combine with later ones to trip the recreate. this.Logger.AuthenticationFailed(ex); + consecutiveChannelFailures = 0; + reconnectAttempt = 0; } catch (RpcException ex) when (ex.StatusCode == StatusCode.NotFound) { @@ -346,8 +350,10 @@ async Task ProcessWorkItemsAsync( // false forever and re-entering await foreach would tight-spin with no yield until the // silent-disconnect timer eventually fires. 
See the end-of-stream handling below for // the behavior we want in that case. - await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) + try { + await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) + { if (silentDisconnectEnabled) { timeoutSource.CancelAfter(silentDisconnectTimeout); @@ -412,6 +418,28 @@ async Task ProcessWorkItemsAsync( { this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); } + } + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down - fall through to the shutdown branch below. + } + catch (OperationCanceledException) when (timeoutSource.IsCancellationRequested) + { + // Silent-disconnect timer fired and the underlying gRPC channel surfaced cancellation + // as OperationCanceledException (this happens when GrpcChannelOptions.ThrowOperationCanceledOnCancellation + // is set to true). Fall through to the silent-disconnect branch below. + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled + && timeoutSource.IsCancellationRequested + && !cancellation.IsCancellationRequested) + { + // Silent-disconnect timer fired mid-MoveNext. By default + // (GrpcChannelOptions.ThrowOperationCanceledOnCancellation == false), grpc-dotnet + // surfaces the linked cancellation as RpcException(Cancelled) rather than OCE. Without + // this catch the exception would propagate to ExecuteAsync's generic Cancelled handler + // and the silent-disconnect → channel-recreate path would never fire. Fall through to + // the silent-disconnect branch below. } if (cancellation.IsCancellationRequested) @@ -420,7 +448,7 @@ async Task ProcessWorkItemsAsync( return; } - if (tokenSource.Token.IsCancellationRequested) + if (timeoutSource.IsCancellationRequested) { // Our silent-disconnect timer fired: the stream stopped producing messages (including // health pings) for longer than the configured window. Treat as a poisoned channel. @@ -431,12 +459,18 @@ async Task ProcessWorkItemsAsync( // The stream terminated cleanly (no exception, no cancellation). This is the canonical // signal sent by the backend during a graceful drain (HTTP/2 GOAWAY with OK trailers - // when a DTS instance is being replaced). We explicitly log and count this toward the - // channel-poisoned threshold: although a single drain is benign, repeated drains on - // the same cached channel usually mean the channel is latched onto a dead/evacuated - // backend and needs to be recreated to pick up fresh DNS/routing. + // when a DTS instance is being replaced). Log it explicitly so operators can see it. + // Only count it toward the channel-poisoned threshold when the stream produced no + // messages: a stream that successfully delivered work and was then closed by the server + // is healthy behavior (e.g. routine rolling upgrade), and counting those would let a + // long-lived process accumulate spurious "poison" credits across many healthy drains. + // An empty drain, on the other hand, is a strong signal the channel is latched onto a + // dead/evacuated backend and needs to be recreated to pick up fresh DNS/routing. this.Logger.StreamEndedByPeer(); - onSilentDisconnect?.Invoke(); + if (!firstMessageObserved) + { + onSilentDisconnect?.Invoke(); + } } void RunBackgroundTask(P.WorkItem? 
workItem, Func handler, CancellationToken cancellation) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index ce5b10557..babe9cb6c 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -58,6 +58,14 @@ public GrpcDurableTaskWorker( protected override async Task ExecuteAsync(CancellationToken stoppingToken) { AsyncDisposable workerOwnedChannelDisposable = this.GetCallInvoker(out CallInvoker callInvoker, out string address); + + // Track the most recently observed channel so the recreator can compare it against the + // currently-cached channel and skip the swap when a peer worker has already recreated it. + // We must NOT use this.grpcOptions.Channel here: that field is set once when options are + // configured and is never updated when the AzureManaged extension swaps the cached channel. + // Passing the stale field would cause the recreator's "peer already swapped" branch to be + // skipped, producing redundant ChannelRecreated logs and wasted recreate attempts. + GrpcChannel? latestObservedChannel = this.grpcOptions.Channel; try { this.logger.StartingTaskHubWorker(address); @@ -73,11 +81,12 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) } // ChannelRecreateRequested: try to obtain a fresh channel before re-entering the loop. - ChannelRecreateResult result = await this.TryRecreateChannelAsync(stoppingToken, workerOwnedChannelDisposable); + ChannelRecreateResult result = await this.TryRecreateChannelAsync(stoppingToken, workerOwnedChannelDisposable, latestObservedChannel); if (result.Recreated) { callInvoker = result.NewCallInvoker!; address = result.NewAddress!; + latestObservedChannel = result.NewChannel; AsyncDisposable previousDisposable = workerOwnedChannelDisposable; workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; this.logger.ChannelRecreated(address); @@ -101,11 +110,12 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) async Task TryRecreateChannelAsync( CancellationToken cancellation, - AsyncDisposable currentWorkerOwnedDisposable) + AsyncDisposable currentWorkerOwnedDisposable, + GrpcChannel? currentChannel) { // Path 1: caller (or extension method like ConfigureGrpcChannel) supplied a recreator. Func>? recreator = this.grpcOptions.Internal.ChannelRecreator; - if (recreator is not null && this.grpcOptions.Channel is GrpcChannel currentChannel) + if (recreator is not null && currentChannel is not null) { try { @@ -113,7 +123,7 @@ async Task TryRecreateChannelAsync( if (!ReferenceEquals(newChannel, currentChannel)) { // The recreator owns disposal of the old channel; we don't dispose here. - return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, currentWorkerOwnedDisposable); + return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, currentWorkerOwnedDisposable, newChannel); } // Recreator returned the same instance — nothing to swap. 
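+                // (When the recreator does swap, disposal of the old channel happens on the
+                // recreator's side; the AzureManaged extensions, for example, defer ShutdownAsync
+                // by a 30s grace period so in-flight RPCs from peer workers can drain.)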
@@ -139,7 +149,7 @@ async Task TryRecreateChannelAsync( { GrpcChannel newChannel = GetChannel(this.grpcOptions.Address); AsyncDisposable newDisposable = new(() => new(newChannel.ShutdownAsync())); - return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, newDisposable); + return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, newDisposable, newChannel); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) { @@ -164,12 +174,13 @@ or AccessViolationException readonly struct ChannelRecreateResult { - public ChannelRecreateResult(bool recreated, CallInvoker? newCallInvoker, string? newAddress, AsyncDisposable newWorkerOwnedDisposable) + public ChannelRecreateResult(bool recreated, CallInvoker? newCallInvoker, string? newAddress, AsyncDisposable newWorkerOwnedDisposable, GrpcChannel? newChannel) { this.Recreated = recreated; this.NewCallInvoker = newCallInvoker; this.NewAddress = newAddress; this.NewWorkerOwnedDisposable = newWorkerOwnedDisposable; + this.NewChannel = newChannel; } public bool Recreated { get; } @@ -180,8 +191,10 @@ public ChannelRecreateResult(bool recreated, CallInvoker? newCallInvoker, string public AsyncDisposable NewWorkerOwnedDisposable { get; } + public GrpcChannel? NewChannel { get; } + public static ChannelRecreateResult NotRecreated(AsyncDisposable currentDisposable) - => new(false, null, null, currentDisposable); + => new(false, null, null, currentDisposable, null); } #if NET6_0_OR_GREATER From 5dfc5b1041c86195968786c9255d179566a6c09b Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Tue, 21 Apr 2026 15:55:29 -0700 Subject: [PATCH 07/27] Address remaining audit nits - M2: Add explanatory comment in ChannelRecreatingCallInvoker.RecordFailure about why only Unavailable + non-long-poll DeadlineExceeded count toward the recreate threshold. - M3: Drop the unnecessary 'await Task.Yield()' at the end of both AzureManaged RecreateChannelAsync paths. Convert the methods to non-async returning Task.FromResult to avoid CS1998. - I2: Expose 'SetHelloDeadline' and 'SetSilentDisconnectTimeout' in 'Microsoft.DurableTask.Worker.Grpc.Internal.InternalOptionsExtensions' so the PR description's configurability claims are actually reachable from outside the assembly. Pattern matches existing 'SetChannelRecreator'. - M1 (deferred): A regression test for the silent-disconnect path requires faking 'AsyncServerStreamingCall' and 'IAsyncStreamReader'. Skipped to keep the change surgical; the fix is small and verified by inspection. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 11 +++---- .../Grpc/ChannelRecreatingCallInvoker.cs | 9 +++++ .../DurableTaskSchedulerWorkerExtensions.cs | 11 +++---- .../Internal/InternalOptionsExtensions.cs | 33 +++++++++++++++++++ 4 files changed, 52 insertions(+), 12 deletions(-) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index efd4cf61e..d34a92e3e 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -167,7 +167,7 @@ public void Configure(string? name, GrpcDurableTaskClientOptions options) /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer /// clients can drain. Returns the currently cached channel if a peer client has already recreated it. 
/// - async Task RecreateChannelAsync( + Task RecreateChannelAsync( string cacheKey, DurableTaskSchedulerClientOptions source, GrpcChannel oldChannel, @@ -186,7 +186,7 @@ async Task RecreateChannelAsync( Lazy created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (this.channels.TryAdd(cacheKey, created)) { - return created.Value; + return Task.FromResult(created.Value); } this.channels.TryGetValue(cacheKey, out currentLazy); @@ -200,14 +200,14 @@ async Task RecreateChannelAsync( if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) { // A peer client already swapped in a new channel; reuse it. - return currentLazy.Value; + return Task.FromResult(currentLazy.Value); } Lazy newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { this.channels.TryGetValue(cacheKey, out Lazy? winner); - return winner?.Value ?? newLazy.Value; + return Task.FromResult(winner?.Value ?? newLazy.Value); } if (currentLazy.IsValueCreated) @@ -215,8 +215,7 @@ async Task RecreateChannelAsync( _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - await Task.Yield(); - return newLazy.Value; + return Task.FromResult(newLazy.Value); } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index ef42e6196..0b3990685 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -196,6 +196,15 @@ void RecordSuccess() void RecordFailure(StatusCode status, string methodFullName) { + // Only count statuses that indicate an actual transport problem, not application-level errors: + // * Unavailable — half-open connection, peer reset, or dead routing target. + // * DeadlineExceeded — the call exceeded the *client-supplied* deadline. This is a + // transport hint EXCEPT for long-poll RPCs (e.g. WaitForInstance*) + // where a deadline timeout is expected behavior, so those are + // excluded explicitly. + // Other statuses (NotFound, InvalidArgument, FailedPrecondition, etc.) are application + // failures that a fresh channel won't fix and would otherwise produce false-positive + // recreates. bool counts = status switch { StatusCode.Unavailable => true, diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index ef6a678a6..2c4a0b5cf 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -166,7 +166,7 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer workers /// can drain. Returns the currently cached channel if a peer worker has already recreated it. 
/// - async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) + Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) { cancellation.ThrowIfCancellationRequested(); @@ -184,7 +184,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul Lazy created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (this.channels.TryAdd(cacheKey, created)) { - return created.Value; + return Task.FromResult(created.Value); } // Another thread added one; fall through to read it. @@ -199,7 +199,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) { // A peer worker already swapped in a new channel; reuse it. - return currentLazy.Value; + return Task.FromResult(currentLazy.Value); } Lazy newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); @@ -207,7 +207,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul { // Lost the race; whoever won has the freshest entry. this.channels.TryGetValue(cacheKey, out Lazy? winner); - return winner?.Value ?? newLazy.Value; + return Task.FromResult(winner?.Value ?? newLazy.Value); } // Successful swap. Schedule graceful disposal of the old channel after a grace period @@ -217,8 +217,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - await Task.Yield(); - return newLazy.Value; + return Task.FromResult(newLazy.Value); } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) diff --git a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs index 43d5d90f4..385fb93f6 100644 --- a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs +++ b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs @@ -50,4 +50,37 @@ public static void SetChannelRecreator( { options.Internal.ChannelRecreator = recreator ?? throw new ArgumentNullException(nameof(recreator)); } + + /// + /// Sets the deadline applied to the initial Hello RPC during worker connect. A wedged + /// handshake on a half-open HTTP/2 connection no longer hangs the reconnect loop indefinitely. + /// + /// The gRPC worker options. + /// The deadline; non-positive disables the deadline. + /// + /// This is an internal API that supports the DurableTask infrastructure and not subject to + /// the same compatibility standards as public APIs. It may be changed or removed without notice in + /// any release. + /// + public static void SetHelloDeadline(this GrpcDurableTaskWorkerOptions options, TimeSpan deadline) + { + options.Internal.HelloDeadline = deadline; + } + + /// + /// Sets the silent-disconnect timeout. If no message (including health pings) arrives on the + /// work-item stream within this window, the worker treats the stream as silently disconnected + /// and reconnects. + /// + /// The gRPC worker options. + /// The timeout; non-positive disables silent-disconnect detection. + /// + /// This is an internal API that supports the DurableTask infrastructure and not subject to + /// the same compatibility standards as public APIs. It may be changed or removed without notice in + /// any release. 
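+    /// A minimal usage sketch (values illustrative; semantics per this method's summary):
+    /// <code>
+    /// options.SetSilentDisconnectTimeout(TimeSpan.FromMinutes(2)); // reconnect after 2 min of silence
+    /// options.SetSilentDisconnectTimeout(TimeSpan.Zero);           // disable silent-disconnect detection
+    /// </code>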
+ /// + public static void SetSilentDisconnectTimeout(this GrpcDurableTaskWorkerOptions options, TimeSpan timeout) + { + options.Internal.SilentDisconnectTimeout = timeout; + } } From f71d5108bc567ec0140555104da8c97a68da71ef Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Tue, 21 Apr 2026 16:05:47 -0700 Subject: [PATCH 08/27] Add M1: regression test for silent-disconnect classification Extracts the work-item stream-consume + termination-classification logic from GrpcDurableTaskWorker.Processor.ProcessWorkItemsAsync into a new internal helper WorkItemStreamConsumer.ConsumeAsync. The helper owns the linked-token wiring, the await foreach, and the three-arm catch chain that distinguishes: - outer cancellation (Shutdown) - silent-disconnect timeout surfaced as OCE (SilentDisconnect) - silent-disconnect timeout surfaced as RpcException(Cancelled) (SilentDisconnect) from a graceful drain. ProcessWorkItemsAsync becomes a thin coordinator that dispatches per-item work via a new DispatchWorkItem method and switches on the helper's outcome to decide post-loop logging and channel-poisoned signaling. Adds 9 unit tests including the C1 regression test HangingStream_SurfacingRpcCancelled_ReturnsSilentDisconnect that fails on the pre-fix code: prior to the fix the helper would have propagated the RpcException(Cancelled) past the silent-disconnect classification, leaving the caller without the channelLikelyPoisoned signal and the channel-recreate counter would not advance. Behavior is preserved exactly. Outer-cancellation surfaced as RpcException(Cancelled) still propagates to ExecuteAsync's outer catch chain (documented in OuterCancellation_WithRpcCancelledFromStream_PropagatesException). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 211 +++++++---------- src/Worker/Grpc/WorkItemStreamConsumer.cs | 130 +++++++++++ .../Grpc.Tests/WorkItemStreamConsumerTests.cs | 221 ++++++++++++++++++ 3 files changed, 437 insertions(+), 125 deletions(-) create mode 100644 src/Worker/Grpc/WorkItemStreamConsumer.cs create mode 100644 test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index ea2fc7cdc..ef367e09f 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -328,148 +328,109 @@ async Task ProcessWorkItemsAsync( Action? onFirstMessage = null, Action? onSilentDisconnect = null) { - // Create a new token source for timing out and a final token source that keys off of them both. - // The timeout token is used to detect when we are no longer getting any messages, including health - // pings sent periodically by the server. If this is the case, it signifies the connection has been - // dropped silently and we need to reconnect. + // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — + // including health pings sent periodically by the server — arrive within the configured + // window. If that fires we treat the stream as silently disconnected and reconnect. 
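+        // Outcome handling at a glance (the switch below):
+        //   Shutdown         -> return quietly; the worker host is stopping.
+        //   SilentDisconnect -> ConnectionTimeout log + onSilentDisconnect (always poisons).
+        //   GracefulDrain    -> StreamEndedByPeer log + onSilentDisconnect only when the
+        //                       stream never produced a message.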
TimeSpan silentDisconnectTimeout = this.internalOptions.SilentDisconnectTimeout; - bool silentDisconnectEnabled = silentDisconnectTimeout > TimeSpan.Zero; - using var timeoutSource = new CancellationTokenSource(); - if (silentDisconnectEnabled) - { - timeoutSource.CancelAfter(silentDisconnectTimeout); - } - - using var tokenSource = CancellationTokenSource.CreateLinkedTokenSource(cancellation, timeoutSource.Token); - bool firstMessageObserved = false; - - // NOTE: we deliberately do NOT wrap the await foreach in an outer loop. The underlying - // IAsyncStreamReader is single-use — once the server terminates the stream (e.g. via a - // graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext returns - // false forever and re-entering await foreach would tight-spin with no yield until the - // silent-disconnect timer eventually fires. See the end-of-stream handling below for - // the behavior we want in that case. - try + // NOTE: the consumer deliberately does NOT wrap its await foreach in an outer loop. + // The underlying IAsyncStreamReader is single-use — once the server terminates the stream + // (e.g. via a graceful HTTP/2 GOAWAY with OK trailers during a rolling upgrade), MoveNext + // returns false forever and re-entering await foreach would tight-spin with no yield. + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + ct => stream.ResponseStream.ReadAllAsync(ct), + silentDisconnectTimeout, + workItem => this.DispatchWorkItem(workItem, cancellation), + onFirstMessage, + cancellation); + + switch (result.Outcome) { - await foreach (P.WorkItem workItem in stream.ResponseStream.ReadAllAsync(tokenSource.Token)) - { - if (silentDisconnectEnabled) - { - timeoutSource.CancelAfter(silentDisconnectTimeout); - } + case WorkItemStreamOutcome.Shutdown: + return; - if (!firstMessageObserved) - { - firstMessageObserved = true; - onFirstMessage?.Invoke(); - } + case WorkItemStreamOutcome.SilentDisconnect: + // Stream stopped producing messages (including health pings) for longer than the + // configured window. Treat as a poisoned channel. + this.Logger.ConnectionTimeout(); + onSilentDisconnect?.Invoke(); + return; - if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunOrchestratorAsync( - workItem.OrchestratorRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunActivityAsync( - workItem.ActivityRequest, - workItem.CompletionToken, - cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) - { - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) - { - workItem.EntityRequestV2.ToEntityBatchRequest( - out EntityBatchRequest batchRequest, - out List operationInfos); - - this.RunBackgroundTask( - workItem, - () => this.OnRunEntityBatchAsync( - batchRequest, - cancellation, - workItem.CompletionToken, - operationInfos), - cancellation); - } - else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) - { - // Health pings are heartbeat-only signals from the backend; the silent-disconnect - // timer reset above is the actionable behavior. 
Logging at Trace allows operators - // to confirm liveness without flooding info-level telemetry. - this.Logger.ReceivedHealthPing(); - } - else - { - this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); - } - } + case WorkItemStreamOutcome.GracefulDrain: + // Canonical signal sent by the backend during a graceful drain (HTTP/2 GOAWAY + + // OK trailers when a DTS instance is being replaced). Log it explicitly so + // operators can see it. Only count it toward the channel-poisoned threshold when + // the stream produced no messages: a stream that successfully delivered work and + // was then closed by the server is healthy behavior (e.g. routine rolling + // upgrade), and counting those would let a long-lived process accumulate spurious + // "poison" credits across many healthy drains. An empty drain, on the other hand, + // is a strong signal the channel is latched onto a dead/evacuated backend and + // needs to be recreated to pick up fresh DNS/routing. + this.Logger.StreamEndedByPeer(); + if (!result.FirstMessageObserved) + { + onSilentDisconnect?.Invoke(); + } + + return; } - catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + } + + void DispatchWorkItem(P.WorkItem workItem, CancellationToken cancellation) + { + if (workItem.RequestCase == P.WorkItem.RequestOneofCase.OrchestratorRequest) { - // Worker is shutting down - fall through to the shutdown branch below. + this.RunBackgroundTask( + workItem, + () => this.OnRunOrchestratorAsync( + workItem.OrchestratorRequest, + workItem.CompletionToken, + cancellation), + cancellation); } - catch (OperationCanceledException) when (timeoutSource.IsCancellationRequested) + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.ActivityRequest) { - // Silent-disconnect timer fired and the underlying gRPC channel surfaced cancellation - // as OperationCanceledException (this happens when GrpcChannelOptions.ThrowOperationCanceledOnCancellation - // is set to true). Fall through to the silent-disconnect branch below. + this.RunBackgroundTask( + workItem, + () => this.OnRunActivityAsync( + workItem.ActivityRequest, + workItem.CompletionToken, + cancellation), + cancellation); } - catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled - && timeoutSource.IsCancellationRequested - && !cancellation.IsCancellationRequested) + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequest) { - // Silent-disconnect timer fired mid-MoveNext. By default - // (GrpcChannelOptions.ThrowOperationCanceledOnCancellation == false), grpc-dotnet - // surfaces the linked cancellation as RpcException(Cancelled) rather than OCE. Without - // this catch the exception would propagate to ExecuteAsync's generic Cancelled handler - // and the silent-disconnect → channel-recreate path would never fire. Fall through to - // the silent-disconnect branch below. + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync(workItem.EntityRequest.ToEntityBatchRequest(), cancellation), + cancellation); } - - if (cancellation.IsCancellationRequested) + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.EntityRequestV2) { - // Worker is shutting down - exit gracefully. 
- return; + workItem.EntityRequestV2.ToEntityBatchRequest( + out EntityBatchRequest batchRequest, + out List operationInfos); + + this.RunBackgroundTask( + workItem, + () => this.OnRunEntityBatchAsync( + batchRequest, + cancellation, + workItem.CompletionToken, + operationInfos), + cancellation); } - - if (timeoutSource.IsCancellationRequested) + else if (workItem.RequestCase == P.WorkItem.RequestOneofCase.HealthPing) { - // Our silent-disconnect timer fired: the stream stopped producing messages (including - // health pings) for longer than the configured window. Treat as a poisoned channel. - this.Logger.ConnectionTimeout(); - onSilentDisconnect?.Invoke(); - return; + // Health pings are heartbeat-only signals from the backend; the silent-disconnect + // timer reset (handled inside WorkItemStreamConsumer) is the actionable behavior. + // Logging at Trace allows operators to confirm liveness without flooding info-level + // telemetry. + this.Logger.ReceivedHealthPing(); } - - // The stream terminated cleanly (no exception, no cancellation). This is the canonical - // signal sent by the backend during a graceful drain (HTTP/2 GOAWAY with OK trailers - // when a DTS instance is being replaced). Log it explicitly so operators can see it. - // Only count it toward the channel-poisoned threshold when the stream produced no - // messages: a stream that successfully delivered work and was then closed by the server - // is healthy behavior (e.g. routine rolling upgrade), and counting those would let a - // long-lived process accumulate spurious "poison" credits across many healthy drains. - // An empty drain, on the other hand, is a strong signal the channel is latched onto a - // dead/evacuated backend and needs to be recreated to pick up fresh DNS/routing. - this.Logger.StreamEndedByPeer(); - if (!firstMessageObserved) + else { - onSilentDisconnect?.Invoke(); + this.Logger.UnexpectedWorkItemType(workItem.RequestCase.ToString()); } } diff --git a/src/Worker/Grpc/WorkItemStreamConsumer.cs b/src/Worker/Grpc/WorkItemStreamConsumer.cs new file mode 100644 index 000000000..8652302fc --- /dev/null +++ b/src/Worker/Grpc/WorkItemStreamConsumer.cs @@ -0,0 +1,130 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Runtime.CompilerServices; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc; + +/// +/// Reason a invocation terminated. +/// +internal enum WorkItemStreamOutcome +{ + /// The outer cancellation token was signalled (worker shutdown). + Shutdown, + + /// The silent-disconnect timer fired (no item or health ping arrived in time). + SilentDisconnect, + + /// The stream completed without exception (e.g. server-initiated graceful close). + GracefulDrain, +} + +/// Result of consuming a work-item stream. +/// Why the loop terminated. +/// Whether at least one message was delivered before termination. +internal readonly record struct WorkItemStreamResult(WorkItemStreamOutcome Outcome, bool FirstMessageObserved); + +/// +/// Consumes a work-item stream and classifies its termination. Owns the silent-disconnect timeout +/// wiring and the catch chain that distinguishes a wedged stream (silent disconnect) from a normal +/// graceful drain or a worker shutdown. Per-item dispatch is delegated to the caller via the +/// onItem callback. +/// +/// +/// The onItem callback is synchronous because production dispatch is fire-and-forget. 
+/// +internal static class WorkItemStreamConsumer +{ + /// + /// Consume a work-item stream until shutdown, silent disconnect, or graceful drain. + /// + /// + /// Factory that opens the stream with the supplied linked-cancellation token. Production passes + /// ct => stream.ResponseStream.ReadAllAsync(ct); tests pass arbitrary fakes. + /// + /// + /// How long to wait between successive items (or health pings) before treating the stream as + /// silently disconnected. Non-positive values disable detection entirely. + /// + /// + /// Synchronous per-item dispatch. Invoked once per delivered work item, after the silent-disconnect + /// timer has been re-armed. + /// + /// + /// Optional callback invoked exactly once when the first message is observed. Used by callers to + /// reset retry counters that should only count consecutive transport failures. + /// + /// Outer worker cancellation token. + /// The classified outcome plus whether any message was observed. + public static async Task<WorkItemStreamResult> ConsumeAsync( + Func<CancellationToken, IAsyncEnumerable<P.WorkItem>> openStream, + TimeSpan silentDisconnectTimeout, + Action<P.WorkItem> onItem, + Action? onFirstMessage, + CancellationToken cancellation) + { + bool silentDisconnectEnabled = silentDisconnectTimeout > TimeSpan.Zero; + using CancellationTokenSource timeoutSource = new(); + if (silentDisconnectEnabled) + { + timeoutSource.CancelAfter(silentDisconnectTimeout); + } + + using CancellationTokenSource tokenSource = CancellationTokenSource.CreateLinkedTokenSource( + cancellation, timeoutSource.Token); + + bool firstMessageObserved = false; + + try + { + await foreach (P.WorkItem workItem in openStream(tokenSource.Token).ConfigureAwait(false)) + { + if (silentDisconnectEnabled) + { + timeoutSource.CancelAfter(silentDisconnectTimeout); + } + + if (!firstMessageObserved) + { + firstMessageObserved = true; + onFirstMessage?.Invoke(); + } + + onItem(workItem); + } + } + catch (OperationCanceledException) when (cancellation.IsCancellationRequested) + { + // Worker is shutting down. + } + catch (OperationCanceledException) when (timeoutSource.IsCancellationRequested) + { + // Silent-disconnect timer fired and grpc-dotnet surfaced cancellation as OCE + // (when GrpcChannelOptions.ThrowOperationCanceledOnCancellation == true). + } + catch (RpcException ex) when (ex.StatusCode == StatusCode.Cancelled + && timeoutSource.IsCancellationRequested + && !cancellation.IsCancellationRequested) + { + // Silent-disconnect timer fired mid-MoveNext. By default + // (GrpcChannelOptions.ThrowOperationCanceledOnCancellation == false), grpc-dotnet + // surfaces the linked cancellation as RpcException(Cancelled) rather than OCE. + // Without this catch the exception would propagate past the silent-disconnect + // branch and the recreate path would never fire. + } + + if (cancellation.IsCancellationRequested) + { + return new WorkItemStreamResult(WorkItemStreamOutcome.Shutdown, firstMessageObserved); + } + + if (timeoutSource.IsCancellationRequested) + { + return new WorkItemStreamResult(WorkItemStreamOutcome.SilentDisconnect, firstMessageObserved); + } + + return new WorkItemStreamResult(WorkItemStreamOutcome.GracefulDrain, firstMessageObserved); + } +} diff --git a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs new file mode 100644 index 000000000..c142a4245 --- /dev/null +++ b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs @@ -0,0 +1,221 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License.
+ +using System.Runtime.CompilerServices; +using System.Threading.Channels; +using Grpc.Core; +using Microsoft.DurableTask.Worker.Grpc; +using P = Microsoft.DurableTask.Protobuf; + +namespace Microsoft.DurableTask.Worker.Grpc.Tests; + +public class WorkItemStreamConsumerTests +{ + static readonly TimeSpan ShortTimeout = TimeSpan.FromMilliseconds(150); + + [Fact] + public async Task EmptyStream_ReturnsGracefulDrain() + { + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: _ => EmptyStream(), + silentDisconnectTimeout: TimeSpan.FromSeconds(5), + onItem: _ => throw new InvalidOperationException("onItem should not be invoked"), + onFirstMessage: () => throw new InvalidOperationException("onFirstMessage should not be invoked"), + cancellation: CancellationToken.None); + + result.Outcome.Should().Be(WorkItemStreamOutcome.GracefulDrain); + result.FirstMessageObserved.Should().BeFalse(); + } + + [Fact] + public async Task StreamWithItems_ReturnsGracefulDrain_AndFiresCallbacks() + { + P.WorkItem item1 = new() { HealthPing = new P.HealthPing() }; + P.WorkItem item2 = new() { HealthPing = new P.HealthPing() }; + List<P.WorkItem> received = new(); + int firstMessageCount = 0; + + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: _ => StreamOf(item1, item2), + silentDisconnectTimeout: TimeSpan.FromSeconds(5), + onItem: received.Add, + onFirstMessage: () => firstMessageCount++, + cancellation: CancellationToken.None); + + result.Outcome.Should().Be(WorkItemStreamOutcome.GracefulDrain); + result.FirstMessageObserved.Should().BeTrue(); + received.Should().BeEquivalentTo(new[] { item1, item2 }, o => o.WithStrictOrdering()); + firstMessageCount.Should().Be(1); + } + + [Fact] + public async Task HangingStream_SurfacingOce_ReturnsSilentDisconnect() + { + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => HangingStream(ct, throwAsRpc: false), + silentDisconnectTimeout: ShortTimeout, + onItem: _ => { }, + onFirstMessage: null, + cancellation: CancellationToken.None); + + result.Outcome.Should().Be(WorkItemStreamOutcome.SilentDisconnect); + result.FirstMessageObserved.Should().BeFalse(); + } + + /// + /// Regression test for the C1 silent-disconnect bug. grpc-dotnet by default surfaces a linked-token + /// cancellation as <see cref="RpcException"/> (StatusCode.Cancelled), not <see cref="OperationCanceledException"/>. + /// Pre-fix this exception propagated past the silent-disconnect branch and the channel-recreate + /// callback was never invoked. + /// + [Fact] + public async Task HangingStream_SurfacingRpcCancelled_ReturnsSilentDisconnect() + { + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => HangingStream(ct, throwAsRpc: true), + silentDisconnectTimeout: ShortTimeout, + onItem: _ => { }, + onFirstMessage: null, + cancellation: CancellationToken.None); + + result.Outcome.Should().Be(WorkItemStreamOutcome.SilentDisconnect); + result.FirstMessageObserved.Should().BeFalse(); + } + + [Fact] + public async Task OuterCancellation_WithOceFromStream_ReturnsShutdown() + { + // When the inner stream surfaces cancellation as OperationCanceledException, the helper + // classifies the termination and returns Shutdown.
+ using CancellationTokenSource outer = new(); + outer.CancelAfter(ShortTimeout); + + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => HangingStream(ct, throwAsRpc: false), + silentDisconnectTimeout: TimeSpan.FromSeconds(30), + onItem: _ => { }, + onFirstMessage: null, + cancellation: outer.Token); + + result.Outcome.Should().Be(WorkItemStreamOutcome.Shutdown); + result.FirstMessageObserved.Should().BeFalse(); + } + + [Fact] + public async Task OuterCancellation_WithRpcCancelledFromStream_PropagatesException() + { + // When the inner stream surfaces outer cancellation as RpcException(Cancelled), the helper + // does NOT classify it as Shutdown — the caller's outer catch chain (ExecuteAsync) handles + // RpcException(Cancelled)-during-shutdown. Adding it to the helper would conflict with the + // post-fix silent-disconnect catch, which scopes RpcException(Cancelled) handling to the case + // where the timeout source — not the outer cancellation — fired. + using CancellationTokenSource outer = new(); + outer.CancelAfter(ShortTimeout); + + Func<Task> act = () => WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => HangingStream(ct, throwAsRpc: true), + silentDisconnectTimeout: TimeSpan.FromSeconds(30), + onItem: _ => { }, + onFirstMessage: null, + cancellation: outer.Token); + + await act.Should().ThrowAsync<RpcException>().Where(e => e.StatusCode == StatusCode.Cancelled); + } + + [Fact] + public async Task PerItem_HeartbeatReset_KeepsTimerAlive() + { + // Feed items at sub-timeout intervals, then complete. Without a heartbeat reset on each + // delivered item the timer would fire mid-stream and the outcome would be SilentDisconnect. + Channel<P.WorkItem> channel = Channel.CreateUnbounded<P.WorkItem>(); + TimeSpan timeout = TimeSpan.FromMilliseconds(200); + + Task<WorkItemStreamResult> consumeTask = WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => channel.Reader.ReadAllAsync(ct), + silentDisconnectTimeout: timeout, + onItem: _ => { }, + onFirstMessage: null, + cancellation: CancellationToken.None); + + // Deliver an item just before each timer would fire to keep the stream "alive".
+ await Task.Delay(timeout / 2); + await channel.Writer.WriteAsync(new P.WorkItem { HealthPing = new P.HealthPing() }); + await Task.Delay(timeout / 2); + await channel.Writer.WriteAsync(new P.WorkItem { HealthPing = new P.HealthPing() }); + channel.Writer.Complete(); + + WorkItemStreamResult result = await consumeTask; + + result.Outcome.Should().Be(WorkItemStreamOutcome.GracefulDrain); + result.FirstMessageObserved.Should().BeTrue(); + } + + [Fact] + public async Task UnrelatedRpcException_Propagates() + { + Func<Task> act = () => WorkItemStreamConsumer.ConsumeAsync( + openStream: _ => ThrowingStream(new RpcException(new Status(StatusCode.Unavailable, "boom"))), + silentDisconnectTimeout: TimeSpan.FromSeconds(5), + onItem: _ => { }, + onFirstMessage: null, + cancellation: CancellationToken.None); + + await act.Should().ThrowAsync<RpcException>().Where(e => e.StatusCode == StatusCode.Unavailable); + } + + [Fact] + public async Task DisabledSilentDisconnect_OnlyShutdownEndsLoop() + { + using CancellationTokenSource outer = new(); + outer.CancelAfter(ShortTimeout); + + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: ct => HangingStream(ct, throwAsRpc: false), + silentDisconnectTimeout: TimeSpan.Zero, // disabled + onItem: _ => { }, + onFirstMessage: null, + cancellation: outer.Token); + + result.Outcome.Should().Be(WorkItemStreamOutcome.Shutdown); + } + +#pragma warning disable CS1998 // Async method lacks 'await' operators + static async IAsyncEnumerable<P.WorkItem> EmptyStream() + { + yield break; + } + + static async IAsyncEnumerable<P.WorkItem> StreamOf(params P.WorkItem[] items) + { + foreach (P.WorkItem item in items) + { + yield return item; + } + } + + static async IAsyncEnumerable<P.WorkItem> ThrowingStream(Exception ex) + { + throw ex; +#pragma warning disable CS0162 // Unreachable code detected + yield break; +#pragma warning restore CS0162 + } +#pragma warning restore CS1998 + + static async IAsyncEnumerable<P.WorkItem> HangingStream( + [EnumeratorCancellation] CancellationToken ct, + bool throwAsRpc) + { + try + { + await Task.Delay(Timeout.Infinite, ct); + } + catch (OperationCanceledException) when (throwAsRpc) + { + // Mimic grpc-dotnet's default surface shape for linked-token cancellation. + throw new RpcException(new Status(StatusCode.Cancelled, "stream cancelled")); + } + + yield break; + } +} From cec5f4aeeb345d978b13f6099c813feaf6763ee1 Mon Sep 17 00:00:00 2001 From: Bernd Verst <github@bernd.dev> Date: Tue, 21 Apr 2026 16:13:47 -0700 Subject: [PATCH 09/27] Materialize new channel before swapping cache entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the AzureManaged worker and client recreators wrote the new Lazy<GrpcChannel> into the cache via TryUpdate, scheduled deferred disposal of the old channel, and THEN materialized newLazy.Value. If CreateChannel throws inside .Value, the cache is left pointing at a permanently-failing Lazy and the still-healthy old channel has already been queued for shutdown — an unrecoverable state for that cache key. Now we call source.CreateChannel() first and only call TryUpdate after the new channel has been created successfully. If the swap loses the race we dispose the freshly-created channel so it does not leak.
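For illustration, a minimal standalone sketch of the ordering this commit establishes. The names here (cache, key, create) are hypothetical stand-ins, not the SDK's API; the real code also defers disposal rather than disposing inline:

    using System;
    using System.Collections.Concurrent;

    static class SwapSketch
    {
        public static IDisposable Recreate(
            ConcurrentDictionary<string, Lazy<IDisposable>> cache,
            string key,
            Lazy<IDisposable> observed,
            Func<IDisposable> create)
        {
            // 1. Materialize first: if create() throws here, the cache still holds
            //    the old, healthy entry and nothing has been queued for disposal.
            IDisposable fresh = create();

            // 2. CAS swap: publish only if no concurrent caller refreshed the slot.
            if (!cache.TryUpdate(key, new Lazy<IDisposable>(fresh), observed))
            {
                fresh.Dispose(); // lost the race; don't leak the channel we just built
                return cache.TryGetValue(key, out Lazy<IDisposable>? winner)
                    ? winner.Value
                    : throw new ObjectDisposedException(nameof(cache));
            }

            // 3. Only after a successful swap is the old channel released
            //    (deferred in the real code so in-flight RPCs can drain).
            if (observed.IsValueCreated)
            {
                observed.Value.Dispose();
            }

            return fresh;
        }
    }

Reversing steps 1 and 2 is exactly the failure mode described above: a faulting factory published into the cache after the healthy channel is already doomed.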
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 14 +++++++++++--- .../DurableTaskSchedulerWorkerExtensions.cs | 15 +++++++++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index d34a92e3e..273284828 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -203,11 +203,19 @@ Task<GrpcChannel> RecreateChannelAsync( return Task.FromResult(currentLazy.Value); } - Lazy<GrpcChannel> newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); + // Materialize the new channel BEFORE swapping the dictionary so a CreateChannel failure + // leaves the existing entry intact. If we swapped a not-yet-materialized Lazy and then + // CreateChannel threw, the dictionary would point to a permanently-failing Lazy and the + // old channel would have already been queued for disposal — an unrecoverable state. + GrpcChannel newChannel = source.CreateChannel(); + Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { + // Lost the race; whoever won has the freshest entry. Dispose the channel we just + // created so it doesn't leak. this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner); - return Task.FromResult(winner?.Value ?? newLazy.Value); + _ = ScheduleDeferredDisposeAsync(newChannel); + return Task.FromResult(winner?.Value ?? newChannel); } if (currentLazy.IsValueCreated) @@ -215,7 +223,7 @@ Task<GrpcChannel> RecreateChannelAsync( _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - return Task.FromResult(newLazy.Value); + return Task.FromResult(newChannel); } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index 2c4a0b5cf..1473eb4da 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -202,12 +202,19 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork return Task.FromResult(currentLazy.Value); } - Lazy<GrpcChannel> newLazy = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); + // Materialize the new channel BEFORE swapping the dictionary so a CreateChannel failure + // leaves the existing entry intact. If we swapped a not-yet-materialized Lazy and then + // CreateChannel threw, the dictionary would point to a permanently-failing Lazy and the + // old channel would have already been queued for disposal — an unrecoverable state. + GrpcChannel newChannel = source.CreateChannel(); + Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { - // Lost the race; whoever won has the freshest entry. + // Lost the race; whoever won has the freshest entry. Dispose the channel we just + // created so it doesn't leak. this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner); - return Task.FromResult(winner?.Value ?? newLazy.Value); + _ = ScheduleDeferredDisposeAsync(newChannel); + return Task.FromResult(winner?.Value ?? newChannel); } // Successful swap.
Schedule graceful disposal of the old channel after a grace period @@ -217,7 +224,7 @@ Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - return Task.FromResult(newLazy.Value); + return Task.FromResult(newChannel); } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) From ab763db861b3bbf10fa4a14acbec2326cf42679d Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Tue, 21 Apr 2026 16:29:21 -0700 Subject: [PATCH 10/27] Address Copilot review feedback on cec5f4a - Client/Grpc/GrpcDurableTaskClient.GetCallInvoker: when an externally-supplied Channel is used with channel recreation enabled, return an AsyncDisposable that disposes the ChannelRecreatingCallInvoker wrapper. Without this, the wrapper's CancellationTokenSource and any in-flight recreate task outlive the client. The wrapper's DisposeAsync is a no-op for the channel itself when ownsChannel is false, so the externally-owned channel is not affected. - Client/Grpc/ChannelRecreatingCallInvoker.RecordFailure: a non-counted RpcException status (NotFound, InvalidArgument, FailedPrecondition, etc.) now resets consecutiveFailures. Any gRPC status reply is proof the transport is healthy enough to deliver round-trips, so an app-level error sequence should not allow a prior transport-failure count to accumulate across it and combine with a later blip to trip a false-positive recreate. - Worker/Grpc/ReconnectBackoff: doc-comment now mentions that a non-positive cap also returns TimeSpan.Zero (matches the Math.Max clamp behavior). - Worker/Grpc/WorkItemStreamConsumer: removed unused using System.Runtime.CompilerServices;. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Client/Grpc/ChannelRecreatingCallInvoker.cs | 5 +++++ src/Client/Grpc/GrpcDurableTaskClient.cs | 7 ++++++- src/Worker/Grpc/ReconnectBackoff.cs | 4 ++-- src/Worker/Grpc/WorkItemStreamConsumer.cs | 1 - 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 0b3990685..4f54f8695 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -214,6 +214,11 @@ void RecordFailure(StatusCode status, string methodFullName) if (!counts) { + // Any gRPC status reply (even an application-level error) is proof that the transport + // is healthy enough to deliver round-trips, so reset the failure counter. This prevents + // unrelated app-level failures from silently accumulating between transport blips and + // tripping a false-positive recreate. + Volatile.Write(ref this.consecutiveFailures, 0); return; } diff --git a/src/Client/Grpc/GrpcDurableTaskClient.cs b/src/Client/Grpc/GrpcDurableTaskClient.cs index 036bb26fc..74562a3d3 100644 --- a/src/Client/Grpc/GrpcDurableTaskClient.cs +++ b/src/Client/Grpc/GrpcDurableTaskClient.cs @@ -637,7 +637,12 @@ static AsyncDisposable GetCallInvoker(GrpcDurableTaskClientOptions options, ILog { ChannelRecreatingCallInvoker wrapper = new(c, recreator!, threshold, cooldown, ownsChannel: false, logger); callInvoker = wrapper; - return default; + + // We do not own the externally-supplied channel, but we DO own the wrapper. Without + // disposing the wrapper its CancellationTokenSource and any in-flight recreate task + // would outlive the client. The wrapper's DisposeAsync is a no-op for the channel + // itself when ownsChannel == false. 
+ return new AsyncDisposable(() => wrapper.DisposeAsync()); } callInvoker = c.CreateCallInvoker(); diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/ReconnectBackoff.cs index 31f3318af..f0ff6e9f8 100644 --- a/src/Worker/Grpc/ReconnectBackoff.cs +++ b/src/Worker/Grpc/ReconnectBackoff.cs @@ -10,8 +10,8 @@ static class ReconnectBackoff { /// /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in - /// [0, min(cap, base * 2^attempt)]. Returns <see cref="TimeSpan.Zero"/> when the base delay - /// is non-positive. + /// [0, min(cap, base * 2^attempt)]. Returns <see cref="TimeSpan.Zero"/> when <paramref name="baseDelay"/> + /// or <paramref name="cap"/> is non-positive. /// /// The retry attempt index, starting at 0. /// The base delay used for the exponential growth. diff --git a/src/Worker/Grpc/WorkItemStreamConsumer.cs b/src/Worker/Grpc/WorkItemStreamConsumer.cs index 8652302fc..2187300e2 100644 --- a/src/Worker/Grpc/WorkItemStreamConsumer.cs +++ b/src/Worker/Grpc/WorkItemStreamConsumer.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System.Runtime.CompilerServices; using P = Microsoft.DurableTask.Protobuf; namespace Microsoft.DurableTask.Worker.Grpc; From 955ea6375d47f391169721b1ae8eb58d40952757 Mon Sep 17 00:00:00 2001 From: Bernd Verst <github@bernd.dev> Date: Tue, 21 Apr 2026 16:40:58 -0700 Subject: [PATCH 11/27] Address Copilot review feedback on ab763db - AzureManaged worker/client: don't return a freshly-created channel that has been scheduled for deferred disposal when the cache slot disappears (concurrent DisposeAsync); throw ObjectDisposedException instead. - WorkItemStreamConsumer: clamp silentDisconnectTimeout to int.MaxValue ms before passing to CancellationTokenSource.CancelAfter (avoids ArgumentOutOfRangeException for >24 day values). - Processor.ConnectAsync: clamp DateTime.UtcNow.Add(helloDeadline) at DateTime.MaxValue so a misconfigured HelloDeadline cannot crash the connect loop. - Processor: clamp delay.TotalMilliseconds to int.MaxValue before logging ReconnectBackoff to avoid integer overflow. - Processor: rename 'onSilentDisconnect' callback to 'onChannelLikelyPoisoned' to reflect that it also fires on graceful empty drains, not only on silent-disconnect timeouts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 14 +++++++++---- .../DurableTaskSchedulerWorkerExtensions.cs | 14 +++++++++---- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 21 +++++++++++++------ src/Worker/Grpc/WorkItemStreamConsumer.cs | 13 ++++++++++-- 4 files changed, 46 insertions(+), 16 deletions(-) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index 273284828..3422d2e3a 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -211,11 +211,17 @@ Task<GrpcChannel> RecreateChannelAsync( Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { - // Lost the race; whoever won has the freshest entry. Dispose the channel we just - // created so it doesn't leak. - this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner); + // Lost the race. Always queue the freshly-created channel for deferred disposal so + // it does not leak. Then return the winning entry — but if the cache slot has been + // removed entirely (e.g. concurrent DisposeAsync cleared the dictionary), do NOT + // hand back the doomed `newChannel`: it has already been scheduled for shutdown.
_ = ScheduleDeferredDisposeAsync(newChannel); - return Task.FromResult(winner?.Value ?? newChannel); + if (this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner) && winner is not null) + { + return Task.FromResult(winner.Value); + } + + throw new ObjectDisposedException(this.GetType().FullName); } if (currentLazy.IsValueCreated) diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index 1473eb4da..8c0adf5d8 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -210,11 +210,17 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { - // Lost the race; whoever won has the freshest entry. Dispose the channel we just - // created so it doesn't leak. - this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner); + // Lost the race. Always queue the freshly-created channel for deferred disposal so + // it does not leak. Then return the winning entry — but if the cache slot has been + // removed entirely (e.g. concurrent DisposeAsync cleared the dictionary), do NOT + // hand back the doomed `newChannel`: it has already been scheduled for shutdown. _ = ScheduleDeferredDisposeAsync(newChannel); - return Task.FromResult(winner?.Value ?? newChannel); + if (this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner) && winner is not null) + { + return Task.FromResult(winner.Value); + } + + throw new ObjectDisposedException(this.GetType().FullName); } // Successful swap. Schedule graceful disposal of the old channel after a grace period diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index ef367e09f..8af725f37 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -78,7 +78,7 @@ await this.ProcessWorkItemsAsync( consecutiveChannelFailures = 0; reconnectAttempt = 0; }, - onSilentDisconnect: () => channelLikelyPoisoned = true); + onChannelLikelyPoisoned: () => channelLikelyPoisoned = true); } catch (RpcException) when (cancellation.IsCancellationRequested) { @@ -153,7 +153,7 @@ await this.ProcessWorkItemsAsync( this.internalOptions.ReconnectBackoffBase, this.internalOptions.ReconnectBackoffCap, backoffRandom); - this.Logger.ReconnectBackoff(reconnectAttempt, (int)delay.TotalMilliseconds); + this.Logger.ReconnectBackoff(reconnectAttempt, (int)Math.Min(int.MaxValue, delay.TotalMilliseconds)); reconnectAttempt = Math.Min(reconnectAttempt + 1, 30); // cap to avoid overflow in 2^attempt await Task.Delay(delay, cancellation); } @@ -299,7 +299,16 @@ async ValueTask BuildRuntimeStateAsync( async Task<AsyncServerStreamingCall<P.WorkItem>> ConnectAsync(CancellationToken cancellation) { TimeSpan helloDeadline = this.internalOptions.HelloDeadline; - DateTime? deadline = helloDeadline > TimeSpan.Zero ? DateTime.UtcNow.Add(helloDeadline) : null; + DateTime? deadline = null; + + if (helloDeadline > TimeSpan.Zero) + { + // Clamp to DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot + // throw ArgumentOutOfRangeException out of DateTime.Add and crash the connect loop. + DateTime now = DateTime.UtcNow; + TimeSpan maxOffset = DateTime.MaxValue - now; + deadline = helloDeadline >= maxOffset ?
DateTime.MaxValue : now.Add(helloDeadline); + } await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); this.Logger.EstablishedWorkItemConnection(); @@ -326,7 +335,7 @@ async Task ProcessWorkItemsAsync( AsyncServerStreamingCall<P.WorkItem> stream, CancellationToken cancellation, Action? onFirstMessage = null, - Action? onSilentDisconnect = null) + Action? onChannelLikelyPoisoned = null) { // The timeout token (managed by WorkItemStreamConsumer) detects when no messages — // including health pings sent periodically by the server — arrive within the configured @@ -353,7 +362,7 @@ async Task ProcessWorkItemsAsync( // Stream stopped producing messages (including health pings) for longer than the // configured window. Treat as a poisoned channel. this.Logger.ConnectionTimeout(); - onSilentDisconnect?.Invoke(); + onChannelLikelyPoisoned?.Invoke(); return; case WorkItemStreamOutcome.GracefulDrain: @@ -369,7 +378,7 @@ async Task ProcessWorkItemsAsync( this.Logger.StreamEndedByPeer(); if (!result.FirstMessageObserved) { - onSilentDisconnect?.Invoke(); + onChannelLikelyPoisoned?.Invoke(); } return; diff --git a/src/Worker/Grpc/WorkItemStreamConsumer.cs b/src/Worker/Grpc/WorkItemStreamConsumer.cs index 2187300e2..977a2129f 100644 --- a/src/Worker/Grpc/WorkItemStreamConsumer.cs +++ b/src/Worker/Grpc/WorkItemStreamConsumer.cs @@ -65,10 +65,19 @@ public static async Task<WorkItemStreamResult> ConsumeAsync( CancellationToken cancellation) { bool silentDisconnectEnabled = silentDisconnectTimeout > TimeSpan.Zero; + + // CancellationTokenSource.CancelAfter rejects values larger than int.MaxValue ms (~24.8d). + // Defensively clamp so a misconfigured SilentDisconnectTimeout cannot crash the consume loop. + TimeSpan effectiveTimeout = silentDisconnectTimeout; + if (silentDisconnectEnabled && silentDisconnectTimeout.TotalMilliseconds > int.MaxValue) + { + effectiveTimeout = TimeSpan.FromMilliseconds(int.MaxValue); + } + using CancellationTokenSource timeoutSource = new(); if (silentDisconnectEnabled) { - timeoutSource.CancelAfter(silentDisconnectTimeout); + timeoutSource.CancelAfter(effectiveTimeout); } using CancellationTokenSource tokenSource = CancellationTokenSource.CreateLinkedTokenSource( @@ -82,7 +91,7 @@ public static async Task<WorkItemStreamResult> ConsumeAsync( { if (silentDisconnectEnabled) { - timeoutSource.CancelAfter(silentDisconnectTimeout); + timeoutSource.CancelAfter(effectiveTimeout); } if (!firstMessageObserved) From 7df10dc4a37d1cd7f55250dcb5a46919eba893d0 Mon Sep 17 00:00:00 2001 From: Bernd Verst <github@bernd.dev> Date: Wed, 22 Apr 2026 13:36:48 -0700 Subject: [PATCH 12/27] Address PR review follow-up on gRPC resilience Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerClientExtensions.cs | 33 ++++++- .../Grpc/ChannelRecreatingCallInvoker.cs | 92 +++++++++++-------- src/Client/Grpc/GrpcDurableTaskClient.cs | 21 ++++- .../DurableTaskSchedulerWorkerExtensions.cs | 34 +++++-- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 6 +- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 50 +++++++--- src/Worker/Grpc/ReconnectBackoff.cs | 11 ++- src/Worker/Grpc/WorkItemStreamConsumer.cs | 40 +++++--- .../Grpc.Tests/WorkItemStreamConsumerTests.cs | 14 +++ 9 files changed, 219 insertions(+), 82 deletions(-) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index 3422d2e3a..aa7a80a78 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++
b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -167,7 +167,7 @@ public void Configure(string? name, GrpcDurableTaskClientOptions options) /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer /// clients can drain. Returns the currently cached channel if a peer client has already recreated it. /// - Task<GrpcChannel> RecreateChannelAsync( + async Task<GrpcChannel> RecreateChannelAsync( string cacheKey, DurableTaskSchedulerClientOptions source, GrpcChannel oldChannel, @@ -175,18 +175,30 @@ Task<GrpcChannel> RecreateChannelAsync( { cancellation.ThrowIfCancellationRequested(); + // Recreate callbacks can outlive Configure(...) because clients keep the delegate on their + // options. Best-effort check for disposal before publishing anything back into the shared cache. if (this.disposed == 1) { throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); } + // Shared-cache recreation has four relevant states: + // 1. No entry exists anymore. Create one and use it. + // 2. The entry already materialized a different channel. A peer client already refreshed it. + // 3. The entry still represents what this client observed. Win TryUpdate and publish the new channel. + // 4. The entry changes between our read and TryUpdate. Lose the race, dispose ours, and reuse the winner. if (!this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? currentLazy)) { // PublicationOnly avoids permanently caching a transient CreateChannel exception. Lazy<GrpcChannel> created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); + if (this.disposed == 1) + { + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + if (this.channels.TryAdd(cacheKey, created)) { - return Task.FromResult(created.Value); + return created.Value; } this.channels.TryGetValue(cacheKey, out currentLazy); @@ -197,10 +209,12 @@ Task<GrpcChannel> RecreateChannelAsync( throw new InvalidOperationException("Failed to obtain a cached gRPC channel after recreation attempt."); } + // Only a materialized Lazy can be compared against oldChannel by reference. If the cache slot + // has not created its channel yet, let TryUpdate decide whether this recreate attempt still owns it. if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) { // A peer client already swapped in a new channel; reuse it. - return Task.FromResult(currentLazy.Value); + return currentLazy.Value; } // Materialize the new channel BEFORE swapping the dictionary so a CreateChannel failure @@ -208,6 +222,15 @@ Task<GrpcChannel> RecreateChannelAsync( // CreateChannel threw, the dictionary would point to a permanently-failing Lazy and the // old channel would have already been queued for disposal — an unrecoverable state. GrpcChannel newChannel = source.CreateChannel(); + if (this.disposed == 1) + { + await DisposeChannelAsync(newChannel).ConfigureAwait(false); + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + + // The cache always stores Lazy<GrpcChannel> so the steady-state Configure path and the + // recreate path use the same dictionary value shape. Recreate materializes first only to + // avoid publishing a lazy that could fault before we know channel creation succeeded. Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { @@ -218,7 +241,7 @@ Task<GrpcChannel> RecreateChannelAsync( _ = ScheduleDeferredDisposeAsync(newChannel); if (this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>?
winner) && winner is not null) { - return Task.FromResult(winner.Value); + return winner.Value; } throw new ObjectDisposedException(this.GetType().FullName); @@ -229,7 +252,7 @@ Task RecreateChannelAsync( _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - return Task.FromResult(newChannel); + return newChannel; } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 4f54f8695..0d7965030 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -50,7 +50,9 @@ sealed class ChannelRecreatingCallInvoker : CallInvoker, IAsyncDisposable TransportState state; int consecutiveFailures; int recreateInFlight; - long lastRecreateTicks; + // Stopwatch timestamps are monotonic, so backend-recreate cooldowns cannot be shortened or + // extended by wall-clock jumps. + long lastRecreateTimestamp; int disposed; public ChannelRecreatingCallInvoker( @@ -68,8 +70,10 @@ public ChannelRecreatingCallInvoker( this.logger = logger; this.state = new TransportState(initialChannel, initialChannel.CreateCallInvoker()); - // Seed lastRecreateTicks so cooldown does not block the very first recreate attempt. - this.lastRecreateTicks = Stopwatch.GetTimestamp() - StopwatchTicksFor(minRecreateInterval); + // Backdate the initial timestamp so the first recreate is never blocked by the cooldown. + // Leaving the field at 0 would make the first attempt depend on how long the current process + // has been running since machine startup. + this.lastRecreateTimestamp = CreateInitialRecreateTimestamp(minRecreateInterval); } public override TResponse BlockingUnaryCall( @@ -140,6 +144,7 @@ public async ValueTask DisposeAsync() if (!this.ownsChannel) { + // The wrapper still owns disposalCts and background recreate state, but the caller owns the channel. this.disposalCts.Dispose(); return; } @@ -147,16 +152,7 @@ public async ValueTask DisposeAsync() TransportState current = Volatile.Read(ref this.state); try { -#if NET6_0_OR_GREATER - await current.Channel.ShutdownAsync().ConfigureAwait(false); - current.Channel.Dispose(); -#else - await current.Channel.ShutdownAsync().ConfigureAwait(false); -#endif - } - catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException) - { - // Best-effort disposal. + await ShutdownAndDisposeOwnedChannelAsync(current.Channel).ConfigureAwait(false); } finally { @@ -164,9 +160,18 @@ public async ValueTask DisposeAsync() } } - static long StopwatchTicksFor(TimeSpan ts) => + static long CreateInitialRecreateTimestamp(TimeSpan minRecreateInterval) => + Stopwatch.GetTimestamp() - ToStopwatchTicks(minRecreateInterval); + + static long ToStopwatchTicks(TimeSpan ts) => (long)(ts.TotalSeconds * Stopwatch.Frequency); + static TimeSpan ElapsedSince(long previousTimestamp, long nowTimestamp) + { + long elapsedTicks = Math.Max(0, nowTimestamp - previousTimestamp); + return TimeSpan.FromSeconds((double)elapsedTicks / Stopwatch.Frequency); + } + void ObserveOutcome(Task responseAsync, string methodFullName) { // Use ContinueWith with TaskScheduler.Default so we don't capture sync context. 
@@ -233,10 +238,9 @@ void RecordFailure(StatusCode status, string methodFullName) void MaybeTriggerRecreate(int observedCount) { - long nowTicks = Stopwatch.GetTimestamp(); - long elapsedTicks = nowTicks - Volatile.Read(ref this.lastRecreateTicks); - long cooldownTicks = StopwatchTicksFor(this.minRecreateInterval); - if (elapsedTicks < cooldownTicks) + // This method runs on application call threads, so keep the hot path lock-free and only serialize + // the actual recreate work behind the single-flight gate below. + if (!this.HasReachedRecreateCooldown(Stopwatch.GetTimestamp())) { return; } @@ -248,8 +252,7 @@ void MaybeTriggerRecreate(int observedCount) } // Re-check elapsed under the guard to avoid back-to-back recreates that won the CAS race. - elapsedTicks = Stopwatch.GetTimestamp() - Volatile.Read(ref this.lastRecreateTicks); - if (elapsedTicks < cooldownTicks) + if (!this.HasReachedRecreateCooldown(Stopwatch.GetTimestamp())) { Interlocked.Exchange(ref this.recreateInFlight, 0); return; @@ -287,7 +290,7 @@ async Task RecreateAsync(int observedCount) { try { - await newChannel.ShutdownAsync().ConfigureAwait(false); + await ShutdownAndDisposeOwnedChannelAsync(newChannel).ConfigureAwait(false); } catch (Exception shutdownEx) when (shutdownEx is not OutOfMemoryException and not StackOverflowException @@ -312,10 +315,16 @@ and not StackOverflowException _ = ScheduleDeferredDisposeAsync(current.Channel); } } + else + { + // Returning the same channel means no swap was needed (for example, because a peer + // already refreshed a shared cache). Keep using the published state and reset the + // failure counter below. + } // Successful recreate (even if a peer beat us to it) → reset the failure counter. Volatile.Write(ref this.consecutiveFailures, 0); - Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp()); + Volatile.Write(ref this.lastRecreateTimestamp, Stopwatch.GetTimestamp()); } catch (OperationCanceledException) when (Volatile.Read(ref this.disposed) != 0) { @@ -327,8 +336,8 @@ and not StackOverflowException { this.logger.ChannelRecreateFailed(ex); - // Update lastRecreateTicks even on failure so the cooldown applies to failed attempts too. - Volatile.Write(ref this.lastRecreateTicks, Stopwatch.GetTimestamp()); + // Update the last-attempt timestamp even on failure so the cooldown applies to failed attempts too. + Volatile.Write(ref this.lastRecreateTimestamp, Stopwatch.GetTimestamp()); } finally { @@ -343,17 +352,7 @@ static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) // Grace period to let in-flight RPCs captured against the old invoker drain before we // tear down the channel's HTTP handler / sockets. await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false); - try - { - await channel.ShutdownAsync().ConfigureAwait(false); - } - catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException) - { - // Expected during process shutdown; nothing to do. - } -#if NET6_0_OR_GREATER - channel.Dispose(); -#endif + await ShutdownAndDisposeOwnedChannelAsync(channel).ConfigureAwait(false); } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException @@ -370,10 +369,27 @@ and not StackOverflowException static string GetEndpointDescription(GrpcChannel channel) { -#if NET6_0_OR_GREATER - return channel.Target ?? "(unknown)"; -#else return channel.Target ?? 
"(unknown)"; + } + + bool HasReachedRecreateCooldown(long nowTimestamp) + { + TimeSpan elapsed = ElapsedSince(Volatile.Read(ref this.lastRecreateTimestamp), nowTimestamp); + return elapsed >= this.minRecreateInterval; + } + + static async Task ShutdownAndDisposeOwnedChannelAsync(GrpcChannel channel) + { + try + { + await channel.ShutdownAsync().ConfigureAwait(false); + } + catch (Exception ex) when (ex is OperationCanceledException or ObjectDisposedException) + { + // Expected during shutdown races; nothing more to do. + } +#if NET6_0_OR_GREATER + channel.Dispose(); #endif } diff --git a/src/Client/Grpc/GrpcDurableTaskClient.cs b/src/Client/Grpc/GrpcDurableTaskClient.cs index 74562a3d3..23350d4cd 100644 --- a/src/Client/Grpc/GrpcDurableTaskClient.cs +++ b/src/Client/Grpc/GrpcDurableTaskClient.cs @@ -667,7 +667,26 @@ static AsyncDisposable GetCallInvoker(GrpcDurableTaskClientOptions options, ILog } callInvoker = c.CreateCallInvoker(); - return new AsyncDisposable(() => new(c.ShutdownAsync())); + return CreateOwnedChannelDisposable(c); + } + + static AsyncDisposable CreateOwnedChannelDisposable(GrpcChannel channel) + { + return new AsyncDisposable(() => ShutdownAndDisposeChannelAsync(channel)); + } + + static async ValueTask ShutdownAndDisposeChannelAsync(GrpcChannel channel) + { + try + { + await channel.ShutdownAsync().ConfigureAwait(false); + } + finally + { +#if NET6_0_OR_GREATER + channel.Dispose(); +#endif + } } #if NET6_0_OR_GREATER diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index 8c0adf5d8..f9bfa8e52 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -166,25 +166,36 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer workers /// can drain. Returns the currently cached channel if a peer worker has already recreated it. /// - Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) + async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) { cancellation.ThrowIfCancellationRequested(); + // Recreate callbacks can outlive Configure(...) because workers keep the delegate on their + // options. Best-effort check for disposal before publishing anything back into the shared cache. if (this.disposed == 1) { throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); } - // CAS swap: only replace if the cached lazy still holds the channel the caller observed. + // Shared-cache recreation has four relevant states: + // 1. No entry exists anymore. Create one and use it. + // 2. The entry already materialized a different channel. A peer worker already refreshed it. + // 3. The entry still represents what this worker observed. Win TryUpdate and publish the new channel. + // 4. The entry changes between our read and TryUpdate. Lose the race, dispose ours, and reuse the winner. if (!this.channels.TryGetValue(cacheKey, out Lazy? currentLazy)) { // No entry — create one and return it. PublicationOnly ensures a transient // CreateChannel failure doesn't permanently poison the cache slot (the default // ExecutionAndPublication mode caches exceptions for the lifetime of the Lazy). 
Lazy<GrpcChannel> created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); + if (this.disposed == 1) + { + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + if (this.channels.TryAdd(cacheKey, created)) { - return Task.FromResult(created.Value); + return created.Value; } // Another thread added one; fall through to read it. @@ -196,10 +207,12 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork throw new InvalidOperationException("Failed to obtain a cached gRPC channel after recreation attempt."); } + // Only a materialized Lazy can be compared against oldChannel by reference. If the cache slot + // has not created its channel yet, let TryUpdate decide whether this recreate attempt still owns it. if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) { // A peer worker already swapped in a new channel; reuse it. - return Task.FromResult(currentLazy.Value); + return currentLazy.Value; } // Materialize the new channel BEFORE swapping the dictionary so a CreateChannel failure @@ -207,6 +220,15 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork // CreateChannel threw, the dictionary would point to a permanently-failing Lazy and the // old channel would have already been queued for disposal — an unrecoverable state. GrpcChannel newChannel = source.CreateChannel(); + if (this.disposed == 1) + { + await DisposeChannelAsync(newChannel).ConfigureAwait(false); + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + + // The cache always stores Lazy<GrpcChannel> so the steady-state Configure path and the + // recreate path use the same dictionary value shape. Recreate materializes first only to + // avoid publishing a lazy that could fault before we know channel creation succeeded. Lazy<GrpcChannel> newLazy = new(newChannel); if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) { @@ -217,7 +239,7 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork _ = ScheduleDeferredDisposeAsync(newChannel); if (this.channels.TryGetValue(cacheKey, out Lazy<GrpcChannel>? winner) && winner is not null) { - return Task.FromResult(winner.Value); + return winner.Value; } throw new ObjectDisposedException(this.GetType().FullName); @@ -230,7 +252,7 @@ Task<GrpcChannel> RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWork _ = ScheduleDeferredDisposeAsync(currentLazy.Value); } - return Task.FromResult(newChannel); + return newChannel; } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 8af725f37..f6d848407 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -69,7 +69,7 @@ public async Task ExecuteAsync(CancellationToken cancellati bool channelLikelyPoisoned = false; try { - AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation); + using AsyncServerStreamingCall<P.WorkItem> stream = await this.ConnectAsync(cancellation); await this.ProcessWorkItemsAsync( stream, cancellation, @@ -93,7 +93,9 @@ await this.ProcessWorkItemsAsync( } catch (RpcException ex) when (ex.StatusCode == StatusCode.DeadlineExceeded) { - // HelloAsync hung — most plausibly a half-open HTTP/2 connection / stale routing. + // Only HelloAsync carries a deadline. Once the work-item stream is established, + // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines.
+ // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. int seconds = Math.Max(1, (int)this.internalOptions.HelloDeadline.TotalSeconds); this.Logger.HelloTimeout(seconds); channelLikelyPoisoned = true; } catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs index babe9cb6c..4e012763e 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -59,12 +59,10 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) { AsyncDisposable workerOwnedChannelDisposable = this.GetCallInvoker(out CallInvoker callInvoker, out string address); - // Track the most recently observed channel so the recreator can compare it against the - // currently-cached channel and skip the swap when a peer worker has already recreated it. - // We must NOT use this.grpcOptions.Channel here: that field is set once when options are - // configured and is never updated when the AzureManaged extension swaps the cached channel. - // Passing the stale field would cause the recreator's "peer already swapped" branch to be - // skipped, producing redundant ChannelRecreated logs and wasted recreate attempts. + // Seed the tracker from the configured channel once, then update latestObservedChannel after + // each successful recreate. Do not re-read this.grpcOptions.Channel inside the loop: the options + // object keeps its original Channel reference even when a shared backend-channel cache has already + // swapped to a newer instance. GrpcChannel? latestObservedChannel = this.grpcOptions.Channel; try { @@ -91,10 +89,9 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; this.logger.ChannelRecreated(address); - // Dispose the prior worker-owned channel (if any). For Path 1 (caller-supplied recreator) - // and Path 3 (caller-owned), the previous disposable is a default AsyncDisposable whose - // DisposeAsync is a no-op, so this is always safe. We do not use ReferenceEquals here - // because AsyncDisposable is a value type and reference comparison is meaningless. + // Dispose the prior worker-owned channel if we had one. Path 1 keeps ownership with the + // recreator, and Path 3 never recreates at all, so only Path 2 ever installs a non-default + // AsyncDisposable here. We do not use ReferenceEquals because AsyncDisposable is a value type. await previousDisposable.DisposeAsync(); } @@ -113,6 +110,11 @@ async Task TryRecreateChannelAsync( AsyncDisposable currentWorkerOwnedDisposable, GrpcChannel? currentChannel) { + // There are three ownership models here: + // 1. A caller-supplied recreator owns shared/cache-backed channels and decides how to swap them. + // 2. The worker owns an Address-created channel and can rebuild it directly. + // 3. The caller owns the Channel/CallInvoker, so the worker can only keep retrying on the same transport. + // Path 1: caller (or extension method like ConfigureGrpcChannel) supplied a recreator. Func<GrpcChannel, CancellationToken, Task<GrpcChannel>>? recreator = this.grpcOptions.Internal.ChannelRecreator; if (recreator is not null && currentChannel is not null) { @@ -122,7 +124,8 @@ async Task TryRecreateChannelAsync( GrpcChannel newChannel = await recreator(currentChannel, cancellation).ConfigureAwait(false); if (!ReferenceEquals(newChannel, currentChannel)) { - // The recreator owns disposal of the old channel; we don't dispose here. + // The recreator may return a shared cached channel.
It also owns disposal of the + // previous channel, so the outer worker keeps its existing AsyncDisposable. return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, currentWorkerOwnedDisposable, newChannel); } @@ -148,7 +151,9 @@ async Task TryRecreateChannelAsync( try { GrpcChannel newChannel = GetChannel(this.grpcOptions.Address); - AsyncDisposable newDisposable = new(() => new(newChannel.ShutdownAsync())); + // This new channel is worker-owned, so hand back a disposable that will shut it down + // (and dispose it on frameworks where GrpcChannel implements IDisposable). + AsyncDisposable newDisposable = CreateOwnedChannelDisposable(newChannel); return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, newDisposable, newChannel); } catch (OperationCanceledException) when (cancellation.IsCancellationRequested) @@ -221,6 +226,25 @@ static GrpcChannel GetChannel(string? address) } #endif + static AsyncDisposable CreateOwnedChannelDisposable(GrpcChannel channel) + { + return new AsyncDisposable(() => ShutdownAndDisposeChannelAsync(channel)); + } + + static async ValueTask ShutdownAndDisposeChannelAsync(GrpcChannel channel) + { + try + { + await channel.ShutdownAsync().ConfigureAwait(false); + } + finally + { +#if NET6_0_OR_GREATER + channel.Dispose(); +#endif + } + } + AsyncDisposable GetCallInvoker(out CallInvoker callInvoker, out string address) { if (this.grpcOptions.Channel is GrpcChannel c) @@ -240,7 +264,7 @@ AsyncDisposable GetCallInvoker(out CallInvoker callInvoker, out string address) c = GetChannel(this.grpcOptions.Address); callInvoker = c.CreateCallInvoker(); address = c.Target; - return new AsyncDisposable(() => new(c.ShutdownAsync())); + return CreateOwnedChannelDisposable(c); } static ILogger CreateLogger(ILoggerFactory loggerFactory, DurableTaskWorkerOptions options) diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/ReconnectBackoff.cs index f0ff6e9f8..cfa01f1b0 100644 --- a/src/Worker/Grpc/ReconnectBackoff.cs +++ b/src/Worker/Grpc/ReconnectBackoff.cs @@ -34,9 +34,12 @@ public static TimeSpan Compute(int attempt, TimeSpan baseDelay, TimeSpan cap, Ra int safeAttempt = Math.Min(attempt, 30); double capMs = Math.Max(0, cap.TotalMilliseconds); - double exp = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); - double bound = Math.Min(capMs, exp); - double jittered = random.NextDouble() * bound; - return TimeSpan.FromMilliseconds(jittered); + double exponentialMs = baseDelay.TotalMilliseconds * Math.Pow(2, safeAttempt); + double upperBoundMs = Math.Min(capMs, exponentialMs); + + // Full jitter intentionally allows any value in the retry window. The wide spread keeps many + // workers that saw the same outage from reconnecting in lockstep against the backend. + double jitteredMs = random.NextDouble() * upperBoundMs; + return TimeSpan.FromMilliseconds(jitteredMs); } } diff --git a/src/Worker/Grpc/WorkItemStreamConsumer.cs b/src/Worker/Grpc/WorkItemStreamConsumer.cs index 977a2129f..cce0b23af 100644 --- a/src/Worker/Grpc/WorkItemStreamConsumer.cs +++ b/src/Worker/Grpc/WorkItemStreamConsumer.cs @@ -36,6 +36,10 @@ internal enum WorkItemStreamOutcome /// internal static class WorkItemStreamConsumer { + // Stay just below the historical CancelAfter(TimeSpan) ceiling so extremely large configuration + // values are still treated as "effectively infinite" without depending on framework-specific edge cases. 
+ static readonly TimeSpan MaxSupportedCancelAfterTimeout = TimeSpan.FromMilliseconds(int.MaxValue - 1d); + /// /// Consume a work-item stream until shutdown, silent disconnect, or graceful drain. /// @@ -66,20 +70,21 @@ public static async Task ConsumeAsync( { bool silentDisconnectEnabled = silentDisconnectTimeout > TimeSpan.Zero; - // CancellationTokenSource.CancelAfter rejects values larger than int.MaxValue ms (~24.8d). - // Defensively clamp so a misconfigured SilentDisconnectTimeout cannot crash the consume loop. - TimeSpan effectiveTimeout = silentDisconnectTimeout; - if (silentDisconnectEnabled && silentDisconnectTimeout.TotalMilliseconds > int.MaxValue) - { - effectiveTimeout = TimeSpan.FromMilliseconds(int.MaxValue); - } + // Clamp enormous values once up-front so the timer-reset path can simply re-arm the same window. + TimeSpan effectiveTimeout = ClampCancelAfterTimeout(silentDisconnectTimeout); using CancellationTokenSource timeoutSource = new(); - if (silentDisconnectEnabled) + void ArmSilentDisconnectTimer() { - timeoutSource.CancelAfter(effectiveTimeout); + if (silentDisconnectEnabled) + { + timeoutSource.CancelAfter(effectiveTimeout); + } } + // Arm once before reading so the initial gap before the first message is also bounded. + ArmSilentDisconnectTimer(); + using CancellationTokenSource tokenSource = CancellationTokenSource.CreateLinkedTokenSource( cancellation, timeoutSource.Token); @@ -89,10 +94,7 @@ public static async Task ConsumeAsync( { await foreach (P.WorkItem workItem in openStream(tokenSource.Token).ConfigureAwait(false)) { - if (silentDisconnectEnabled) - { - timeoutSource.CancelAfter(effectiveTimeout); - } + ArmSilentDisconnectTimer(); if (!firstMessageObserved) { @@ -135,4 +137,16 @@ public static async Task ConsumeAsync( return new WorkItemStreamResult(WorkItemStreamOutcome.GracefulDrain, firstMessageObserved); } + + static TimeSpan ClampCancelAfterTimeout(TimeSpan timeout) + { + if (timeout <= TimeSpan.Zero) + { + return timeout; + } + + return timeout <= MaxSupportedCancelAfterTimeout + ? timeout + : MaxSupportedCancelAfterTimeout; + } } diff --git a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs index c142a4245..98370625f 100644 --- a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs +++ b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs @@ -48,6 +48,20 @@ public async Task StreamWithItems_ReturnsGracefulDrain_AndFiresCallbacks() firstMessageCount.Should().Be(1); } + [Fact] + public async Task VeryLargeSilentDisconnectTimeout_IsClamped_AndStreamCanStillComplete() + { + WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( + openStream: _ => EmptyStream(), + silentDisconnectTimeout: TimeSpan.FromDays(365), + onItem: _ => throw new InvalidOperationException("onItem should not be invoked"), + onFirstMessage: null, + cancellation: CancellationToken.None); + + result.Outcome.Should().Be(WorkItemStreamOutcome.GracefulDrain); + result.FirstMessageObserved.Should().BeFalse(); + } + [Fact] public async Task HangingStream_SurfacingOce_ReturnsSilentDisconnect() { From add1a390bedfb5fb74578971705e7d9893a2d63d Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Wed, 22 Apr 2026 16:22:28 -0700 Subject: [PATCH 13/27] Address PR feedback for gRPC recreation Tighten the review follow-ups, preserve the configured hello timeout in worker logs, and add client/worker recreation coverage. 
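For readers following the MaybeTriggerRecreate changes below, a minimal standalone sketch of the single-flight-plus-cooldown pattern those comments describe. The RecreateGate type here is a hypothetical illustration, not the SDK's implementation:

    using System;
    using System.Diagnostics;
    using System.Threading;
    using System.Threading.Tasks;

    sealed class RecreateGate
    {
        readonly TimeSpan cooldown;
        long lastAttempt; // Stopwatch timestamp: monotonic, unaffected by wall-clock jumps
        int inFlight;     // 0 = idle, 1 = a recreate is currently running

        public RecreateGate(TimeSpan cooldown)
        {
            this.cooldown = cooldown;

            // Backdate so the very first attempt is never blocked by the cooldown.
            this.lastAttempt = Stopwatch.GetTimestamp()
                - (long)(cooldown.TotalSeconds * Stopwatch.Frequency);
        }

        bool CooldownElapsed()
        {
            long elapsed = Stopwatch.GetTimestamp() - Volatile.Read(ref this.lastAttempt);
            return elapsed / (double)Stopwatch.Frequency >= this.cooldown.TotalSeconds;
        }

        public void TryTrigger(Func<Task> recreate)
        {
            // Fast path stays lock-free on the caller's RPC-failure thread.
            if (!this.CooldownElapsed())
            {
                return;
            }

            if (Interlocked.CompareExchange(ref this.inFlight, 1, 0) != 0)
            {
                return; // another caller already owns the single-flight slot
            }

            // Re-check after winning the slot: a recreate may have completed between the
            // fast-path check and the CAS, which would otherwise run back-to-back.
            if (!this.CooldownElapsed())
            {
                Interlocked.Exchange(ref this.inFlight, 0);
                return;
            }

            _ = Task.Run(async () =>
            {
                try
                {
                    await recreate();
                }
                finally
                {
                    // Stamp even on failure so the cooldown also throttles failed attempts.
                    Volatile.Write(ref this.lastAttempt, Stopwatch.GetTimestamp());
                    Interlocked.Exchange(ref this.inFlight, 0);
                }
            });
        }
    }

The double cooldown check mirrors the production code: the first keeps the hot path cheap, the second closes the window where a concurrent recreate finishes between the check and the CAS.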
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc/ChannelRecreatingCallInvoker.cs | 7 +- .../DurableTaskSchedulerWorkerExtensions.cs | 13 +- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 3 +- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 2 +- .../Grpc/GrpcDurableTaskWorkerOptions.cs | 4 +- .../Internal/InternalOptionsExtensions.cs | 6 +- src/Worker/Grpc/Logs.cs | 4 +- ...DurableTaskClientChannelRecreationTests.cs | 195 ++++++++++++++++ .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 209 ++++++++++++++++++ .../Grpc.Tests/WorkItemStreamConsumerTests.cs | 37 +++- 10 files changed, 451 insertions(+), 29 deletions(-) create mode 100644 test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs create mode 100644 test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 0d7965030..a060307bf 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -238,8 +238,6 @@ void RecordFailure(StatusCode status, string methodFullName) void MaybeTriggerRecreate(int observedCount) { - // This method runs on application call threads, so keep the hot path lock-free and only serialize - // the actual recreate work behind the single-flight gate below. if (!this.HasReachedRecreateCooldown(Stopwatch.GetTimestamp())) { return; @@ -251,7 +249,8 @@ void MaybeTriggerRecreate(int observedCount) return; } - // Re-check elapsed under the guard to avoid back-to-back recreates that won the CAS race. + // A previous recreate can finish after the fast-path cooldown check but before we acquire the + // single-flight slot. Re-check after taking the slot so we don't immediately recreate again. if (!this.HasReachedRecreateCooldown(Stopwatch.GetTimestamp())) { Interlocked.Exchange(ref this.recreateInFlight, 0); @@ -259,6 +258,8 @@ void MaybeTriggerRecreate(int observedCount) } this.logger.RecreatingChannel(observedCount); + + // Keep recreate work off the caller's RPC-failure path. _ = Task.Run(() => this.RecreateAsync(observedCount)); } diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index f9bfa8e52..37f0d58c2 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -163,8 +163,9 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) /// /// Atomically swaps the cached channel for the given key with a freshly created one and schedules - /// graceful disposal of the old channel after a grace period so any in-flight RPCs from peer workers - /// can drain. Returns the currently cached channel if a peer worker has already recreated it. + /// graceful disposal of the old channel after a grace period so any in-flight RPCs that already + /// captured the previous channel can drain. Returns the currently cached channel if another recreate + /// attempt has already refreshed the shared entry. /// async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) { @@ -179,7 +180,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul // Shared-cache recreation has four relevant states: // 1. No entry exists anymore. Create one and use it. - // 2. The entry already materialized a different channel. 
A peer worker already refreshed it. + // 2. The entry already materialized a different channel. Another recreate attempt already refreshed it. // 3. The entry still represents what this worker observed. Win TryUpdate and publish the new channel. // 4. The entry changes between our read and TryUpdate. Lose the race, dispose ours, and reuse the winner. if (!this.channels.TryGetValue(cacheKey, out Lazy? currentLazy)) @@ -211,7 +212,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul // has not created its channel yet, let TryUpdate decide whether this recreate attempt still owns it. if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) { - // A peer worker already swapped in a new channel; reuse it. + // Another recreate attempt already swapped in a new channel; reuse it. return currentLazy.Value; } @@ -246,7 +247,7 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul } // Successful swap. Schedule graceful disposal of the old channel after a grace period - // so peer workers' in-flight RPCs can drain. + // so any in-flight RPCs that already captured it can drain. if (currentLazy.IsValueCreated) { _ = ScheduleDeferredDisposeAsync(currentLazy.Value); @@ -259,7 +260,7 @@ static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) { try { - // Grace period to let in-flight RPCs from peer workers complete before draining the channel. + // Grace period to let in-flight RPCs using the previous channel complete before draining it. await Task.Delay(TimeSpan.FromSeconds(30)).ConfigureAwait(false); await DisposeChannelAsync(channel).ConfigureAwait(false); } diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index f6d848407..1ab11badc 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -96,8 +96,7 @@ await this.ProcessWorkItemsAsync( // Only HelloAsync carries a deadline. Once the work-item stream is established, // ProcessWorkItemsAsync relies on the silent-disconnect timer instead of per-read deadlines. // A DeadlineExceeded here therefore means the handshake hung on a stale or half-open channel. - int seconds = Math.Max(1, (int)this.internalOptions.HelloDeadline.TotalSeconds); - this.Logger.HelloTimeout(seconds); + this.Logger.HelloTimeout(this.internalOptions.HelloDeadline); channelLikelyPoisoned = true; } catch (RpcException ex) when (ex.StatusCode == StatusCode.Unavailable) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index 4e012763e..83e30ebd6 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -91,7 +91,7 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) // Dispose the prior worker-owned channel if we had one. Path 1 keeps ownership with the // recreator, and Path 3 never recreates at all, so only Path 2 ever installs a non-default - // AsyncDisposable here. We do not use ReferenceEquals because AsyncDisposable is a value type. + // AsyncDisposable here. 
await previousDisposable.DisposeAsync(); } diff --git a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs index 4b18d8528..85277abec 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs @@ -138,9 +138,9 @@ internal class InternalOptions /// /// Gets or sets an optional callback invoked when the worker requests a fresh gRPC channel after /// repeated connect failures. The callback receives the previously-used channel and should return - /// either a freshly created channel or the currently cached channel if a peer worker has already + /// either a freshly created channel or the currently cached channel if another caller has already /// recreated it. Implementations are responsible for atomic swap and deferred disposal of the old - /// channel so in-flight RPCs from peer workers are not interrupted. + /// channel so in-flight RPCs already using it are not interrupted. /// public Func>? ChannelRecreator { get; set; } } diff --git a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs index 385fb93f6..7a5e2e9c1 100644 --- a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs +++ b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs @@ -32,9 +32,9 @@ public static void ConfigureForAzureManaged(this GrpcDurableTaskWorkerOptions op /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated /// after repeated connect failures (e.g., because the backend was replaced and the existing channel /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last - /// observed and must return either a freshly created channel or the currently cached channel if a - /// peer worker has already swapped it. Implementations are responsible for atomic swap and deferred - /// disposal of the old channel so in-flight RPCs from peer workers are not interrupted. + /// observed and must return either a freshly created channel or the currently cached channel if another + /// caller has already swapped it. Implementations are responsible for atomic swap and deferred disposal + /// of the old channel so in-flight RPCs already using it are not interrupted. /// /// The gRPC worker options. /// The recreate callback. diff --git a/src/Worker/Grpc/Logs.cs b/src/Worker/Grpc/Logs.cs index f2b6c31df..ea585dcfa 100644 --- a/src/Worker/Grpc/Logs.cs +++ b/src/Worker/Grpc/Logs.cs @@ -80,8 +80,8 @@ static partial class Logs [LoggerMessage(EventId = 65, Level = LogLevel.Information, Message = "{instanceId}: Abandoned entity work item. Completion token = '{completionToken}'")] public static partial void AbandonedEntityWorkItem(this ILogger logger, string instanceId, string completionToken); - [LoggerMessage(EventId = 70, Level = LogLevel.Warning, Message = "Hello handshake to backend timed out after {timeoutSeconds}s. Will retry.")] - public static partial void HelloTimeout(this ILogger logger, int timeoutSeconds); + [LoggerMessage(EventId = 70, Level = LogLevel.Warning, Message = "Hello handshake to backend timed out after {timeout}. Will retry.")] + public static partial void HelloTimeout(this ILogger logger, TimeSpan timeout); [LoggerMessage(EventId = 71, Level = LogLevel.Warning, Message = "Authentication failed when connecting to backend. 
Will retry.")] public static partial void AuthenticationFailed(this ILogger logger, Exception ex); diff --git a/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs new file mode 100644 index 000000000..31bc6d59f --- /dev/null +++ b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs @@ -0,0 +1,195 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +using System.Reflection; +using System.Text; +using Grpc.Core; +using Microsoft.DurableTask; +using Microsoft.DurableTask.Client.Grpc.Internal; +using Microsoft.Extensions.Logging.Abstractions; + +namespace Microsoft.DurableTask.Client.Grpc.Tests; + +public class GrpcDurableTaskClientChannelRecreationTests +{ + static readonly Marshaller StringMarshaller = Marshallers.Create( + value => Encoding.UTF8.GetBytes(value), + bytes => Encoding.UTF8.GetString(bytes)); + static readonly Method TestMethod = new( + MethodType.Unary, + "TestService", + "TestMethod", + StringMarshaller, + StringMarshaller); + static readonly MethodInfo GetCallInvokerMethod = typeof(GrpcDurableTaskClient) + .GetMethod("GetCallInvoker", BindingFlags.Static | BindingFlags.NonPublic)!; + + [Fact] + public async Task GetCallInvoker_WithProvidedChannel_RecreatesTransportAfterUnaryFailure() + { + // Arrange + CallbackHttpMessageHandler initialHandler = new((request, cancellationToken) => + Task.FromResult(CreateFailureResponse(StatusCode.Unavailable, "initial transport failure"))); + TaskCompletionSource recreatedTransportUsed = new(TaskCreationOptions.RunContinuationsAsynchronously); + CallbackHttpMessageHandler recreatedHandler = new((request, cancellationToken) => + { + recreatedTransportUsed.TrySetResult(); + return Task.FromResult(CreateFailureResponse(StatusCode.Unavailable, "recreated transport failure")); + }); + + GrpcChannel channel = CreateChannel("http://initial.client.test", initialHandler); + GrpcChannel recreatedChannel = CreateChannel("http://recreated.client.test", recreatedHandler); + GrpcDurableTaskClientOptions options = new() + { + Channel = channel, + }; + options.Internal.ChannelRecreateFailureThreshold = 2; + options.Internal.MinRecreateInterval = TimeSpan.Zero; + + TaskCompletionSource recreateRequested = new(TaskCreationOptions.RunContinuationsAsynchronously); + int recreatorCalls = 0; + options.SetChannelRecreator((existingChannel, ct) => + { + recreatorCalls++; + recreateRequested.TrySetResult(existingChannel); + return Task.FromResult(recreatedChannel); + }); + + try + { + // Act + (AsyncDisposable disposable, CallInvoker callInvoker) = InvokeGetCallInvoker(options); + + try + { + callInvoker.Should().BeOfType(); + GetOwnsChannel(callInvoker).Should().BeFalse(); + + // Act + await AssertRpcFailureAsync(callInvoker); + await AssertRpcFailureAsync(callInvoker); + await recreateRequested.Task.WaitAsync(TimeSpan.FromSeconds(5)); + await AssertRpcFailureAsync(callInvoker); + await recreatedTransportUsed.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + // Assert + initialHandler.CallCount.Should().Be(2); + recreatedHandler.CallCount.Should().Be(1); + recreatorCalls.Should().Be(1); + } + finally + { + await disposable.DisposeAsync(); + } + } + finally + { + await DisposeChannelAsync(channel); + await DisposeChannelAsync(recreatedChannel); + } + } + + [Fact] + public async Task GetCallInvoker_WithAddressAndRecreator_UsesWrapperThatOwnsCreatedChannel() + { + // Arrange + GrpcDurableTaskClientOptions options = new() + { + Address 
= "http://owned.client.test", + }; + options.SetChannelRecreator((existingChannel, ct) => Task.FromResult(existingChannel)); + + // Act + (AsyncDisposable disposable, CallInvoker callInvoker) = InvokeGetCallInvoker(options); + + try + { + // Assert + callInvoker.Should().BeOfType(); + GetOwnsChannel(callInvoker).Should().BeTrue(); + } + finally + { + await disposable.DisposeAsync(); + } + } + + static (AsyncDisposable Disposable, CallInvoker CallInvoker) InvokeGetCallInvoker(GrpcDurableTaskClientOptions options) + { + object?[] args = { options, NullLogger.Instance, null }; + AsyncDisposable disposable = (AsyncDisposable)GetCallInvokerMethod.Invoke(null, args)!; + CallInvoker callInvoker = (CallInvoker)args[2]!; + return (disposable, callInvoker); + } + + static bool GetOwnsChannel(CallInvoker callInvoker) + { + return (bool)typeof(ChannelRecreatingCallInvoker) + .GetField("ownsChannel", BindingFlags.Instance | BindingFlags.NonPublic)! + .GetValue(callInvoker)!; + } + + static async Task AssertRpcFailureAsync(CallInvoker callInvoker) + { + Func act = async () => + { + using AsyncUnaryCall call = callInvoker.AsyncUnaryCall( + TestMethod, + host: null, + new CallOptions(deadline: DateTime.UtcNow.AddSeconds(1)), + request: "ping"); + + await call.ResponseAsync; + }; + + RpcException rpcException = (await act.Should().ThrowAsync()).Which; + rpcException.StatusCode.Should().Be(StatusCode.Unavailable); + } + + static GrpcChannel CreateChannel(string address, HttpMessageHandler handler) + { + return GrpcChannel.ForAddress(address, new GrpcChannelOptions + { + HttpHandler = handler, + }); + } + + static async ValueTask DisposeChannelAsync(GrpcChannel channel) + { + await channel.ShutdownAsync(); + channel.Dispose(); + } + + static HttpResponseMessage CreateFailureResponse(StatusCode statusCode, string detail) + { + HttpResponseMessage response = new(System.Net.HttpStatusCode.OK) + { + Version = new Version(2, 0), + Content = new ByteArrayContent([]), + }; + + response.Content.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("application/grpc"); + response.TrailingHeaders.Add("grpc-status", ((int)statusCode).ToString()); + response.TrailingHeaders.Add("grpc-message", detail); + return response; + } + + sealed class CallbackHttpMessageHandler : HttpMessageHandler + { + readonly Func> callback; + int callCount; + + public CallbackHttpMessageHandler(Func> callback) + { + this.callback = callback; + } + + public int CallCount => Volatile.Read(ref this.callCount); + + protected override Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + Interlocked.Increment(ref this.callCount); + return this.callback(request, cancellationToken); + } + } +} diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs new file mode 100644 index 000000000..9dbed77da --- /dev/null +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -0,0 +1,209 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. 
+ +using System.Reflection; +using Grpc.Core; +using Microsoft.DurableTask; +using Microsoft.DurableTask.Tests.Logging; +using Microsoft.DurableTask.Worker; +using Microsoft.DurableTask.Worker.Grpc.Internal; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Logging.Abstractions; + +namespace Microsoft.DurableTask.Worker.Grpc.Tests; + +public class GrpcDurableTaskWorkerTests +{ + const string Category = "Microsoft.DurableTask.Worker.Grpc"; + static readonly MethodInfo ExecuteAsyncMethod = typeof(GrpcDurableTaskWorker) + .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; + static readonly MethodInfo TryRecreateChannelAsyncMethod = typeof(GrpcDurableTaskWorker) + .GetMethod("TryRecreateChannelAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; + + [Fact] + public async Task ExecuteAsync_ConnectFailureThreshold_RecreatesConfiguredChannel() + { + // Arrange + CallbackHttpMessageHandler initialHandler = new((request, cancellationToken) => + Task.FromResult(CreateFailureResponse(StatusCode.Unavailable, "initial transport failure"))); + TaskCompletionSource recreatedTransportUsed = new(TaskCreationOptions.RunContinuationsAsynchronously); + CallbackHttpMessageHandler recreatedHandler = new(async (request, cancellationToken) => + { + recreatedTransportUsed.TrySetResult(); + await Task.Delay(Timeout.Infinite, cancellationToken); + return CreateFailureResponse(StatusCode.Cancelled, "recreated transport cancelled"); + }); + + GrpcChannel currentChannel = CreateChannel("http://initial.worker.test", initialHandler); + GrpcChannel recreatedChannel = CreateChannel("http://recreated.worker.test", recreatedHandler); + GrpcDurableTaskWorkerOptions grpcOptions = new() + { + Channel = currentChannel, + }; + grpcOptions.Internal.ChannelRecreateFailureThreshold = 2; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + using CancellationTokenSource stoppingToken = new(); + int recreatorCalls = 0; + grpcOptions.SetChannelRecreator((channel, ct) => + { + recreatorCalls++; + return Task.FromResult(recreatedChannel); + }); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + + try + { + // Act + Task executeTask = InvokeExecuteAsync(worker, stoppingToken.Token); + await recreatedTransportUsed.Task.WaitAsync(TimeSpan.FromSeconds(5)); + stoppingToken.Cancel(); + await executeTask; + + // Assert + recreatorCalls.Should().Be(1); + initialHandler.CallCount.Should().Be(2); + recreatedHandler.CallCount.Should().Be(1); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); + logs!.Should().Contain(log => + log.Message.Contains("gRPC channel to backend has been recreated") + && log.Message.Contains(recreatedChannel.Target)); + } + finally + { + await DisposeChannelAsync(currentChannel); + await DisposeChannelAsync(recreatedChannel); + } + } + + [Fact] + public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningSameChannel_DoesNotRecreate() + { + // Arrange + GrpcChannel currentChannel = GrpcChannel.ForAddress("http://localhost:5003"); + GrpcDurableTaskWorkerOptions grpcOptions = new() + { + Channel = currentChannel, + }; + + GrpcChannel? 
observedChannel = null; + grpcOptions.SetChannelRecreator((channel, ct) => + { + observedChannel = channel; + return Task.FromResult(channel); + }); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions); + + try + { + // Act + object result = await InvokeTryRecreateChannelAsync(worker, currentChannel); + + // Assert + observedChannel.Should().BeSameAs(currentChannel); + GetResultProperty(result, "Recreated").Should().BeFalse(); + GetResultProperty(result, "NewChannel").Should().BeNull(); + GetResultProperty(result, "NewAddress").Should().BeNull(); + } + finally + { + await DisposeChannelAsync(currentChannel); + } + } + + static GrpcDurableTaskWorker CreateWorker(GrpcDurableTaskWorkerOptions grpcOptions) + { + return CreateWorker(grpcOptions, new DurableTaskWorkerOptions(), NullLoggerFactory.Instance); + } + + static GrpcDurableTaskWorker CreateWorker( + GrpcDurableTaskWorkerOptions grpcOptions, + DurableTaskWorkerOptions workerOptions, + ILoggerFactory loggerFactory) + { + Mock factoryMock = new(MockBehavior.Strict); + + return new GrpcDurableTaskWorker( + name: "Test", + factory: factoryMock.Object, + grpcOptions: new OptionsMonitorStub(grpcOptions), + workerOptions: new OptionsMonitorStub(workerOptions), + services: Mock.Of(), + loggerFactory: loggerFactory, + orchestrationFilter: null, + exceptionPropertiesProvider: null); + } + + static Task InvokeExecuteAsync(GrpcDurableTaskWorker worker, CancellationToken cancellationToken) + { + return (Task)ExecuteAsyncMethod.Invoke(worker, new object?[] { cancellationToken })!; + } + + static async Task InvokeTryRecreateChannelAsync(GrpcDurableTaskWorker worker, GrpcChannel currentChannel) + { + object?[] args = { CancellationToken.None, default(AsyncDisposable), currentChannel }; + Task task = (Task)TryRecreateChannelAsyncMethod.Invoke(worker, args)!; + await task; + return task.GetType().GetProperty("Result")!.GetValue(task)!; + } + + static T GetResultProperty(object result, string propertyName) + { + return (T)result.GetType().GetProperty(propertyName)!.GetValue(result)!; + } + + static async ValueTask DisposeChannelAsync(GrpcChannel channel) + { + await channel.ShutdownAsync(); + channel.Dispose(); + } + + static GrpcChannel CreateChannel(string address, HttpMessageHandler handler) + { + return GrpcChannel.ForAddress(address, new GrpcChannelOptions + { + HttpHandler = handler, + }); + } + + static HttpResponseMessage CreateFailureResponse(StatusCode statusCode, string detail) + { + HttpResponseMessage response = new(System.Net.HttpStatusCode.OK) + { + Version = new Version(2, 0), + Content = new ByteArrayContent([]), + }; + + response.Content.Headers.ContentType = new System.Net.Http.Headers.MediaTypeHeaderValue("application/grpc"); + response.TrailingHeaders.Add("grpc-status", ((int)statusCode).ToString()); + response.TrailingHeaders.Add("grpc-message", detail); + return response; + } + + sealed class CallbackHttpMessageHandler : HttpMessageHandler + { + readonly Func> callback; + int callCount; + + public CallbackHttpMessageHandler(Func> callback) + { + this.callback = callback; + } + + public int CallCount => Volatile.Read(ref this.callCount); + + protected override Task SendAsync(HttpRequestMessage request, CancellationToken cancellationToken) + { + Interlocked.Increment(ref this.callCount); + return this.callback(request, cancellationToken); + } + } +} diff --git a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs index 98370625f..c2890fa39 100644 --- 
a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs +++ b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs @@ -177,19 +177,24 @@ public async Task UnrelatedRpcException_Propagates() await act.Should().ThrowAsync().Where(e => e.StatusCode == StatusCode.Unavailable); } - [Fact] - public async Task DisabledSilentDisconnect_OnlyShutdownEndsLoop() + [Theory] + [InlineData(0)] + [InlineData(-1)] + public async Task NonPositiveSilentDisconnectTimeout_OnlyShutdownEndsLoop(int timeoutMilliseconds) { + // Arrange using CancellationTokenSource outer = new(); outer.CancelAfter(ShortTimeout); + // Act WorkItemStreamResult result = await WorkItemStreamConsumer.ConsumeAsync( openStream: ct => HangingStream(ct, throwAsRpc: false), - silentDisconnectTimeout: TimeSpan.Zero, // disabled + silentDisconnectTimeout: TimeSpan.FromMilliseconds(timeoutMilliseconds), onItem: _ => { }, onFirstMessage: null, cancellation: outer.Token); + // Assert result.Outcome.Should().Be(WorkItemStreamOutcome.Shutdown); } @@ -207,13 +212,7 @@ public async Task DisabledSilentDisconnect_OnlyShutdownEndsLoop() } } - static async IAsyncEnumerable ThrowingStream(Exception ex) - { - throw ex; -#pragma warning disable CS0162 // Unreachable code detected - yield break; -#pragma warning restore CS0162 - } + static IAsyncEnumerable ThrowingStream(Exception ex) => new ThrowingAsyncEnumerable(ex); #pragma warning restore CS1998 static async IAsyncEnumerable HangingStream( @@ -232,4 +231,22 @@ public async Task DisabledSilentDisconnect_OnlyShutdownEndsLoop() yield break; } + + sealed class ThrowingAsyncEnumerable : IAsyncEnumerable, IAsyncEnumerator + { + readonly Exception exception; + + public ThrowingAsyncEnumerable(Exception exception) + { + this.exception = exception; + } + + public P.WorkItem Current => throw new InvalidOperationException("No current item is available for a throwing stream."); + + public IAsyncEnumerator GetAsyncEnumerator(CancellationToken cancellationToken = default) => this; + + public ValueTask DisposeAsync() => default; + + public ValueTask MoveNextAsync() => ValueTask.FromException(this.exception); + } } From 4bc4a0887469a11ca7302f9a1576f4dca48f1989 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Wed, 22 Apr 2026 16:45:02 -0700 Subject: [PATCH 14/27] Stabilize WorkItemStreamConsumer timing test Measure the keep-alive interval from the consumer's first processed item so the heartbeat-reset test no longer races the CI scheduler. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc.Tests/WorkItemStreamConsumerTests.cs | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs index c2890fa39..2464c0c7e 100644 --- a/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs +++ b/test/Worker/Grpc.Tests/WorkItemStreamConsumerTests.cs @@ -139,22 +139,33 @@ public async Task OuterCancellation_WithRpcCancelledFromStream_PropagatesExcepti [Fact] public async Task PerItem_HeartbeatReset_KeepsTimerAlive() { - // Feed one item, wait ~2x the timeout, then complete. Without a heartbeat reset on the - // delivered item the timer would fire mid-wait and the outcome would be SilentDisconnect. + // Feed one item, wait long enough that the original timer would have expired, then complete. + // Synchronize on the first item actually being processed so the second delay is measured from + // the consumer's timer reset instead of from the test thread's write timing. 
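+        // Timeline for clarity: the 500 ms timer is armed when the consume loop starts; the
+        // first ping at ~150 ms re-arms it, and the second ping lands ~400 ms after that reset
+        // (~550 ms in, past the original expiry), so only the re-armed window keeps it alive.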
Channel channel = Channel.CreateUnbounded(); - TimeSpan timeout = TimeSpan.FromMilliseconds(200); + TimeSpan timeout = TimeSpan.FromMilliseconds(500); + TaskCompletionSource firstItemProcessed = new(TaskCreationOptions.RunContinuationsAsynchronously); + int itemCount = 0; Task consumeTask = WorkItemStreamConsumer.ConsumeAsync( openStream: ct => channel.Reader.ReadAllAsync(ct), silentDisconnectTimeout: timeout, - onItem: _ => { }, + onItem: _ => + { + if (Interlocked.Increment(ref itemCount) == 1) + { + firstItemProcessed.TrySetResult(); + } + }, onFirstMessage: null, cancellation: CancellationToken.None); - // Deliver an item just before each timer would fire to keep the stream "alive". - await Task.Delay(timeout / 2); + await Task.Delay(TimeSpan.FromMilliseconds(150)); await channel.Writer.WriteAsync(new P.WorkItem { HealthPing = new P.HealthPing() }); - await Task.Delay(timeout / 2); + await firstItemProcessed.Task.WaitAsync(TimeSpan.FromSeconds(5)); + + // Without the per-item reset, the original timer would fire before this second item arrives. + await Task.Delay(TimeSpan.FromMilliseconds(400)); await channel.Writer.WriteAsync(new P.WorkItem { HealthPing = new P.HealthPing() }); channel.Writer.Complete(); From aba04b7fa246e9e2d3ae1750b46e719727744abb Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Wed, 22 Apr 2026 17:34:59 -0700 Subject: [PATCH 15/27] Clamp worker hello deadline to UTC max Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc/GrpcDurableTaskWorker.Processor.cs | 10 +- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 109 ++++++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index 1ab11badc..f4b6dafd7 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -304,11 +304,13 @@ async ValueTask BuildRuntimeStateAsync( if (helloDeadline > TimeSpan.Zero) { - // Clamp to DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot - // throw ArgumentOutOfRangeException out of DateTime.Add and crash the connect loop. + // Clamp to a UTC DateTime.MaxValue so a misconfigured (very large) HelloDeadline cannot + // throw ArgumentOutOfRangeException out of DateTime.Add and so the gRPC deadline remains + // unambiguous during internal normalization. DateTime now = DateTime.UtcNow; - TimeSpan maxOffset = DateTime.MaxValue - now; - deadline = helloDeadline >= maxOffset ? DateTime.MaxValue : now.Add(helloDeadline); + DateTime maxDeadlineUtc = DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc); + TimeSpan maxOffset = maxDeadlineUtc - now; + deadline = helloDeadline >= maxOffset ? maxDeadlineUtc : now.Add(helloDeadline); } await this.client!.HelloAsync(EmptyMessage, deadline: deadline, cancellationToken: cancellation); diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index 9dbed77da..6e20796ae 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. 
using System.Reflection; +using Google.Protobuf.WellKnownTypes; using Grpc.Core; using Microsoft.DurableTask; using Microsoft.DurableTask.Tests.Logging; @@ -9,6 +10,7 @@ using Microsoft.DurableTask.Worker.Grpc.Internal; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging.Abstractions; +using P = Microsoft.DurableTask.Protobuf; namespace Microsoft.DurableTask.Worker.Grpc.Tests; @@ -17,6 +19,9 @@ public class GrpcDurableTaskWorkerTests const string Category = "Microsoft.DurableTask.Worker.Grpc"; static readonly MethodInfo ExecuteAsyncMethod = typeof(GrpcDurableTaskWorker) .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; + static readonly MethodInfo ProcessorConnectAsyncMethod = typeof(GrpcDurableTaskWorker) + .GetNestedType("Processor", BindingFlags.NonPublic)! + .GetMethod("ConnectAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; static readonly MethodInfo TryRecreateChannelAsyncMethod = typeof(GrpcDurableTaskWorker) .GetMethod("TryRecreateChannelAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; @@ -119,6 +124,33 @@ public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningSameChanne } } + [Fact] + public async Task ConnectAsync_VeryLargeHelloDeadline_UsesUtcMaxValueDeadline() + { + // Arrange + GrpcDurableTaskWorkerOptions grpcOptions = new(); + grpcOptions.SetHelloDeadline(TimeSpan.MaxValue); + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions); + RecordingCallInvoker callInvoker = new(); + P.TaskHubSidecarService.TaskHubSidecarServiceClient client = new(callInvoker); + object processor = CreateProcessor(worker, client); + + // Act + AsyncServerStreamingCall stream = await InvokeProcessorConnectAsync(processor); + + try + { + // Assert + callInvoker.HelloDeadline.Should().HaveValue(); + callInvoker.HelloDeadline!.Value.Kind.Should().Be(DateTimeKind.Utc); + callInvoker.HelloDeadline.Value.Should().Be(DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc)); + } + finally + { + stream.Dispose(); + } + } + static GrpcDurableTaskWorker CreateWorker(GrpcDurableTaskWorkerOptions grpcOptions) { return CreateWorker(grpcOptions, new DurableTaskWorkerOptions(), NullLoggerFactory.Instance); @@ -147,6 +179,24 @@ static Task InvokeExecuteAsync(GrpcDurableTaskWorker worker, CancellationToken c return (Task)ExecuteAsyncMethod.Invoke(worker, new object?[] { cancellationToken })!; } + static object CreateProcessor(GrpcDurableTaskWorker worker, P.TaskHubSidecarService.TaskHubSidecarServiceClient client) + { + System.Type processorType = typeof(GrpcDurableTaskWorker).GetNestedType("Processor", BindingFlags.NonPublic)!; + return Activator.CreateInstance( + processorType, + BindingFlags.Public | BindingFlags.Instance, + binder: null, + args: new object?[] { worker, client, null, null }, + culture: null)!; + } + + static async Task> InvokeProcessorConnectAsync(object processor) + { + Task task = (Task)ProcessorConnectAsyncMethod.Invoke(processor, new object?[] { CancellationToken.None })!; + await task; + return (AsyncServerStreamingCall)task.GetType().GetProperty("Result")!.GetValue(task)!; + } + static async Task InvokeTryRecreateChannelAsync(GrpcDurableTaskWorker worker, GrpcChannel currentChannel) { object?[] args = { CancellationToken.None, default(AsyncDisposable), currentChannel }; @@ -206,4 +256,63 @@ protected override Task SendAsync(HttpRequestMessage reques return this.callback(request, cancellationToken); } } + + sealed class RecordingCallInvoker : CallInvoker + { + public DateTime? 
HelloDeadline { get; private set; } + + public override TResponse BlockingUnaryCall(Method method, string? host, CallOptions options, TRequest request) + { + throw new NotSupportedException(); + } + + public override AsyncUnaryCall AsyncUnaryCall(Method method, string? host, CallOptions options, TRequest request) + { + if (method.FullName == "/TaskHubSidecarService/Hello") + { + this.HelloDeadline = options.Deadline; + TResponse response = (TResponse)(object)new Empty(); + return new AsyncUnaryCall( + Task.FromResult(response), + Task.FromResult(new Metadata()), + () => new Status(StatusCode.OK, string.Empty), + () => new Metadata(), + () => { }); + } + + throw new NotSupportedException($"Unexpected unary method {method.FullName}."); + } + + public override AsyncServerStreamingCall AsyncServerStreamingCall(Method method, string? host, CallOptions options, TRequest request) + { + if (method.FullName == "/TaskHubSidecarService/GetWorkItems") + { + return new AsyncServerStreamingCall( + new EmptyAsyncStreamReader(), + Task.FromResult(new Metadata()), + () => new Status(StatusCode.OK, string.Empty), + () => new Metadata(), + () => { }); + } + + throw new NotSupportedException($"Unexpected server-streaming method {method.FullName}."); + } + + public override AsyncClientStreamingCall AsyncClientStreamingCall(Method method, string? host, CallOptions options) + { + throw new NotSupportedException(); + } + + public override AsyncDuplexStreamingCall AsyncDuplexStreamingCall(Method method, string? host, CallOptions options) + { + throw new NotSupportedException(); + } + } + + sealed class EmptyAsyncStreamReader : IAsyncStreamReader + { + public T Current => default!; + + public Task MoveNext(CancellationToken cancellationToken) => Task.FromResult(false); + } } From 5594c7b3bb0fff4ed13522d873f069a3ed3e1b8b Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Wed, 22 Apr 2026 18:22:43 -0700 Subject: [PATCH 16/27] Simplify worker recreate flow Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc/ChannelRecreatingCallInvoker.cs | 27 ++++- .../DurableTaskSchedulerWorkerExtensions.cs | 105 +++++++----------- .../Grpc/GrpcDurableTaskWorkerOptions.cs | 5 +- .../Internal/InternalOptionsExtensions.cs | 20 ++-- ...DurableTaskClientChannelRecreationTests.cs | 36 ++++++ ...rableTaskSchedulerWorkerExtensionsTests.cs | 43 +++++++ .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 18 +-- 7 files changed, 161 insertions(+), 93 deletions(-) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index a060307bf..fcf10023e 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -163,8 +163,31 @@ public async ValueTask DisposeAsync() static long CreateInitialRecreateTimestamp(TimeSpan minRecreateInterval) => Stopwatch.GetTimestamp() - ToStopwatchTicks(minRecreateInterval); - static long ToStopwatchTicks(TimeSpan ts) => - (long)(ts.TotalSeconds * Stopwatch.Frequency); + static long ToStopwatchTicks(TimeSpan ts) + { + if (ts <= TimeSpan.Zero) + { + return 0; + } + + long timeSpanTicks = ts.Ticks; + long wholeSeconds = timeSpanTicks / TimeSpan.TicksPerSecond; + long remainingTimeSpanTicks = timeSpanTicks % TimeSpan.TicksPerSecond; + if (wholeSeconds > long.MaxValue / Stopwatch.Frequency) + { + return long.MaxValue; + } + + long wholeSecondStopwatchTicks = wholeSeconds * Stopwatch.Frequency; + long partialSecondStopwatchTicks = + (long)(((decimal)remainingTimeSpanTicks * 
Stopwatch.Frequency) / TimeSpan.TicksPerSecond); + if (wholeSecondStopwatchTicks > long.MaxValue - partialSecondStopwatchTicks) + { + return long.MaxValue; + } + + return wholeSecondStopwatchTicks + partialSecondStopwatchTicks; + } static TimeSpan ElapsedSince(long previousTimestamp, long nowTimestamp) { diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index 37f0d58c2..f3c7fac15 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -2,6 +2,7 @@ // Licensed under the MIT License. using System.Collections.Concurrent; +using System.Collections.Generic; using System.Diagnostics; using System.Linq; using System.Threading; @@ -112,6 +113,7 @@ sealed class ConfigureGrpcChannel : IConfigureNamedOptions schedulerOptions; readonly ConcurrentDictionary> channels = new(); + readonly object syncRoot = new(); volatile int disposed; /// @@ -162,97 +164,63 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) } /// - /// Atomically swaps the cached channel for the given key with a freshly created one and schedules + /// Replaces the cached worker channel for the given key with a freshly created one and schedules /// graceful disposal of the old channel after a grace period so any in-flight RPCs that already - /// captured the previous channel can drain. Returns the currently cached channel if another recreate - /// attempt has already refreshed the shared entry. + /// captured the previous channel can drain. /// async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedulerWorkerOptions source, GrpcChannel oldChannel, CancellationToken cancellation) { cancellation.ThrowIfCancellationRequested(); - // Recreate callbacks can outlive Configure(...) because workers keep the delegate on their - // options. Best-effort check for disposal before publishing anything back into the shared cache. + // Worker cache keys include WorkerId, so a single worker normally owns each cached entry. + // Recreate callbacks can outlive Configure(...), however, so still guard against disposal + // racing with channel publication back into the shared cache owner. if (this.disposed == 1) { throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); } - // Shared-cache recreation has four relevant states: - // 1. No entry exists anymore. Create one and use it. - // 2. The entry already materialized a different channel. Another recreate attempt already refreshed it. - // 3. The entry still represents what this worker observed. Win TryUpdate and publish the new channel. - // 4. The entry changes between our read and TryUpdate. Lose the race, dispose ours, and reuse the winner. - if (!this.channels.TryGetValue(cacheKey, out Lazy? currentLazy)) + // Materialize the replacement channel BEFORE publishing it so a CreateChannel failure leaves + // the current cache entry intact. + GrpcChannel newChannel = source.CreateChannel(); + Lazy newLazy = new(newChannel); + bool disposeNewChannel = false; + GrpcChannel? cachedChannel = null; + lock (this.syncRoot) { - // No entry — create one and return it. PublicationOnly ensures a transient - // CreateChannel failure doesn't permanently poison the cache slot (the default - // ExecutionAndPublication mode caches exceptions for the lifetime of the Lazy). 
- Lazy created = new(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly); if (this.disposed == 1) { - throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + disposeNewChannel = true; } - - if (this.channels.TryAdd(cacheKey, created)) + else if (this.channels.TryGetValue(cacheKey, out Lazy? currentLazy) + && currentLazy.IsValueCreated + && !ReferenceEquals(currentLazy.Value, oldChannel)) { - return created.Value; + // The cache already moved past oldChannel (for example because this callback is stale). + // Keep the current winner and discard the newly-created replacement. + disposeNewChannel = true; + cachedChannel = currentLazy.Value; + } + else + { + this.channels[cacheKey] = newLazy; } - - // Another thread added one; fall through to read it. - this.channels.TryGetValue(cacheKey, out currentLazy); - } - - if (currentLazy is null) - { - throw new InvalidOperationException("Failed to obtain a cached gRPC channel after recreation attempt."); - } - - // Only a materialized Lazy can be compared against oldChannel by reference. If the cache slot - // has not created its channel yet, let TryUpdate decide whether this recreate attempt still owns it. - if (currentLazy.IsValueCreated && !ReferenceEquals(currentLazy.Value, oldChannel)) - { - // Another recreate attempt already swapped in a new channel; reuse it. - return currentLazy.Value; } - // Materialize the new channel BEFORE swapping the dictionary so a CreateChannel failure - // leaves the existing entry intact. If we swapped a not-yet-materialized Lazy and then - // CreateChannel threw, the dictionary would point to a permanently-failing Lazy and the - // old channel would have already been queued for disposal — an unrecoverable state. - GrpcChannel newChannel = source.CreateChannel(); - if (this.disposed == 1) + if (disposeNewChannel) { await DisposeChannelAsync(newChannel).ConfigureAwait(false); - throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); - } - - // The cache always stores Lazy so the steady-state Configure path and the - // recreate path use the same dictionary value shape. Recreate materializes first only to - // avoid publishing a lazy that could fault before we know channel creation succeeded. - Lazy newLazy = new(newChannel); - if (!this.channels.TryUpdate(cacheKey, newLazy, currentLazy)) - { - // Lost the race. Always queue the freshly-created channel for deferred disposal so - // it does not leak. Then return the winning entry — but if the cache slot has been - // removed entirely (e.g. concurrent DisposeAsync cleared the dictionary), do NOT - // hand back the doomed `newChannel`: it has already been scheduled for shutdown. - _ = ScheduleDeferredDisposeAsync(newChannel); - if (this.channels.TryGetValue(cacheKey, out Lazy? winner) && winner is not null) + if (cachedChannel is not null) { - return winner.Value; + return cachedChannel; } - throw new ObjectDisposedException(this.GetType().FullName); + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); } // Successful swap. Schedule graceful disposal of the old channel after a grace period // so any in-flight RPCs that already captured it can drain. 
- if (currentLazy.IsValueCreated) - { - _ = ScheduleDeferredDisposeAsync(currentLazy.Value); - } - + _ = ScheduleDeferredDisposeAsync(oldChannel); return newChannel; } @@ -285,7 +253,14 @@ public async ValueTask DisposeAsync() return; } - foreach (Lazy channel in this.channels.Values.Where(lazy => lazy.IsValueCreated)) + List> channelsToDispose; + lock (this.syncRoot) + { + channelsToDispose = this.channels.Values.Where(lazy => lazy.IsValueCreated).ToList(); + this.channels.Clear(); + } + + foreach (Lazy channel in channelsToDispose) { try { @@ -305,8 +280,6 @@ and not StackOverflowException } } } - - this.channels.Clear(); GC.SuppressFinalize(this); } diff --git a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs index 85277abec..464c50a8b 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorkerOptions.cs @@ -138,9 +138,8 @@ internal class InternalOptions /// /// Gets or sets an optional callback invoked when the worker requests a fresh gRPC channel after /// repeated connect failures. The callback receives the previously-used channel and should return - /// either a freshly created channel or the currently cached channel if another caller has already - /// recreated it. Implementations are responsible for atomic swap and deferred disposal of the old - /// channel so in-flight RPCs already using it are not interrupted. + /// its replacement. Implementations are responsible for publishing the replacement channel and + /// deferring disposal of the old channel so in-flight RPCs already using it are not interrupted. /// public Func>? ChannelRecreator { get; set; } } diff --git a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs index 7a5e2e9c1..c071123ca 100644 --- a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs +++ b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs @@ -28,16 +28,16 @@ public static void ConfigureForAzureManaged(this GrpcDurableTaskWorkerOptions op options.Internal.InsertEntityUnlocksOnCompletion = true; } - /// - /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated - /// after repeated connect failures (e.g., because the backend was replaced and the existing channel - /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last - /// observed and must return either a freshly created channel or the currently cached channel if another - /// caller has already swapped it. Implementations are responsible for atomic swap and deferred disposal - /// of the old channel so in-flight RPCs already using it are not interrupted. - /// - /// The gRPC worker options. - /// The recreate callback. + /// + /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated + /// after repeated connect failures (e.g., because the backend was replaced and the existing channel + /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last + /// observed and must return its replacement. Implementations are responsible for publishing the + /// replacement channel and deferring disposal of the old channel so in-flight RPCs already using it + /// are not interrupted. + /// + /// The gRPC worker options. + /// The recreate callback. /// /// This is an internal API that supports the DurableTask infrastructure and not subject to /// the same compatibility standards as public APIs. 
It may be changed or removed without notice in diff --git a/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs index 31bc6d59f..df3487459 100644 --- a/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs +++ b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.Diagnostics; using System.Reflection; using System.Text; using Grpc.Core; @@ -23,6 +24,8 @@ public class GrpcDurableTaskClientChannelRecreationTests StringMarshaller); static readonly MethodInfo GetCallInvokerMethod = typeof(GrpcDurableTaskClient) .GetMethod("GetCallInvoker", BindingFlags.Static | BindingFlags.NonPublic)!; + static readonly MethodInfo ToStopwatchTicksMethod = typeof(ChannelRecreatingCallInvoker) + .GetMethod("ToStopwatchTicks", BindingFlags.Static | BindingFlags.NonPublic)!; [Fact] public async Task GetCallInvoker_WithProvidedChannel_RecreatesTransportAfterUnaryFailure() @@ -114,6 +117,34 @@ public async Task GetCallInvoker_WithAddressAndRecreator_UsesWrapperThatOwnsCrea } } + [Theory] + [InlineData(0, 0)] + [InlineData(-1, 0)] + public void ToStopwatchTicks_NonPositiveInterval_ReturnsZero(long ticks, long expected) + { + // Arrange + TimeSpan interval = TimeSpan.FromTicks(ticks); + + // Act + long stopwatchTicks = InvokeToStopwatchTicks(interval); + + // Assert + stopwatchTicks.Should().Be(expected); + } + + [Fact] + public void ToStopwatchTicks_VeryLargeInterval_SaturatesAtLongMaxValue() + { + // Arrange + TimeSpan interval = TimeSpan.MaxValue; + + // Act + long stopwatchTicks = InvokeToStopwatchTicks(interval); + + // Assert + stopwatchTicks.Should().Be(long.MaxValue); + } + static (AsyncDisposable Disposable, CallInvoker CallInvoker) InvokeGetCallInvoker(GrpcDurableTaskClientOptions options) { object?[] args = { options, NullLogger.Instance, null }; @@ -129,6 +160,11 @@ static bool GetOwnsChannel(CallInvoker callInvoker) .GetValue(callInvoker)!; } + static long InvokeToStopwatchTicks(TimeSpan interval) + { + return (long)ToStopwatchTicksMethod.Invoke(null, new object?[] { interval })!; + } + static async Task AssertRpcFailureAsync(CallInvoker callInvoker) { Func act = async () => diff --git a/test/Worker/AzureManaged.Tests/DurableTaskSchedulerWorkerExtensionsTests.cs b/test/Worker/AzureManaged.Tests/DurableTaskSchedulerWorkerExtensionsTests.cs index 6e08347ee..0c1d72805 100644 --- a/test/Worker/AzureManaged.Tests/DurableTaskSchedulerWorkerExtensionsTests.cs +++ b/test/Worker/AzureManaged.Tests/DurableTaskSchedulerWorkerExtensionsTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+using System.Reflection; using Azure.Core; using Azure.Identity; using FluentAssertions; @@ -292,6 +293,37 @@ public async Task UseDurableTaskScheduler_ServiceProviderDispose_DisposesChannel newOptions.Channel.Should().NotBeSameAs(channel, "new provider should create a new channel"); } + [Fact] + public async Task UseDurableTaskScheduler_ChannelRecreator_ReplacesCachedChannel() + { + // Arrange + ServiceCollection services = new ServiceCollection(); + Mock mockBuilder = new Mock(); + mockBuilder.Setup(b => b.Services).Returns(services); + DefaultAzureCredential credential = new DefaultAzureCredential(); + mockBuilder.Object.UseDurableTaskScheduler(ValidEndpoint, ValidTaskHub, credential, options => + { + options.WorkerId = "worker-id-1"; + }); + + await using ServiceProvider provider = services.BuildServiceProvider(); + IOptionsFactory optionsFactory = provider.GetRequiredService>(); + GrpcDurableTaskWorkerOptions options = optionsFactory.Create(Options.DefaultName); + GrpcChannel originalChannel = options.Channel!; + + // Act + Func>? recreator = GetChannelRecreator(options); + recreator.Should().NotBeNull(); + GrpcChannel recreatedChannel = await recreator!(originalChannel, CancellationToken.None); + GrpcChannel staleRecreateResult = await recreator(originalChannel, CancellationToken.None); + GrpcDurableTaskWorkerOptions refreshedOptions = optionsFactory.Create(Options.DefaultName); + + // Assert + recreatedChannel.Should().NotBeSameAs(originalChannel); + staleRecreateResult.Should().BeSameAs(recreatedChannel, "a stale recreate callback should keep the already-cached replacement channel"); + refreshedOptions.Channel.Should().BeSameAs(recreatedChannel, "the worker recreator should replace the cached channel for the same worker entry"); + } + [Fact] public async Task UseDurableTaskScheduler_ConfigureAfterDispose_ThrowsObjectDisposedException() { @@ -448,5 +480,16 @@ public async Task UseDurableTaskScheduler_DifferentWorkerId_UsesSeparateChannels options2.Channel.Should().NotBeNull(); options1.Channel.Should().NotBeSameAs(options2.Channel, "different WorkerId should use different channels"); } + + static Func>? GetChannelRecreator(GrpcDurableTaskWorkerOptions options) + { + object internalOptions = typeof(GrpcDurableTaskWorkerOptions) + .GetProperty("Internal", BindingFlags.Instance | BindingFlags.NonPublic)! + .GetValue(options)!; + + return (Func>?)internalOptions.GetType() + .GetProperty("ChannelRecreator", BindingFlags.Instance | BindingFlags.Public)! 
+ .GetValue(internalOptions); + } } diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index 6e20796ae..075eacb80 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -136,19 +136,13 @@ public async Task ConnectAsync_VeryLargeHelloDeadline_UsesUtcMaxValueDeadline() object processor = CreateProcessor(worker, client); // Act - AsyncServerStreamingCall stream = await InvokeProcessorConnectAsync(processor); + using AsyncServerStreamingCall stream = await InvokeProcessorConnectAsync(processor); - try - { - // Assert - callInvoker.HelloDeadline.Should().HaveValue(); - callInvoker.HelloDeadline!.Value.Kind.Should().Be(DateTimeKind.Utc); - callInvoker.HelloDeadline.Value.Should().Be(DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc)); - } - finally - { - stream.Dispose(); - } + // Assert + callInvoker.HelloDeadline.Should().HaveValue(); + DateTime deadline = callInvoker.HelloDeadline!.Value; + deadline.Kind.Should().Be(DateTimeKind.Utc); + deadline.Should().Be(DateTime.SpecifyKind(DateTime.MaxValue, DateTimeKind.Utc)); } static GrpcDurableTaskWorker CreateWorker(GrpcDurableTaskWorkerOptions grpcOptions) From c0dbb3db910c500e293d71855c33d8e070fd0b0a Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Wed, 22 Apr 2026 20:05:57 -0700 Subject: [PATCH 17/27] Fix worker recreate ownership Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 13 ++--- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 52 ++++++++++++++++++- 2 files changed, 58 insertions(+), 7 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index 83e30ebd6..cf4c52f9b 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -89,9 +89,9 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; this.logger.ChannelRecreated(address); - // Dispose the prior worker-owned channel if we had one. Path 1 keeps ownership with the - // recreator, and Path 3 never recreates at all, so only Path 2 ever installs a non-default - // AsyncDisposable here. + // Dispose the prior worker-owned channel if we had one. Path 1 hands ownership of the + // replacement channel to the recreator, Path 2 installs a fresh worker-owned disposable, + // and Path 3 never recreates at all. await previousDisposable.DisposeAsync(); } @@ -124,9 +124,10 @@ async Task TryRecreateChannelAsync( GrpcChannel newChannel = await recreator(currentChannel, cancellation).ConfigureAwait(false); if (!ReferenceEquals(newChannel, currentChannel)) { - // The recreator may return a shared cached channel. It also owns disposal of the - // previous channel, so the outer worker keeps its existing AsyncDisposable. - return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, currentWorkerOwnedDisposable, newChannel); + // The recreator owns the replacement channel lifetime. Return a default disposable + // so the caller disposes the previous worker-owned channel exactly once without + // carrying that ownership forward to the recreated state. + return new ChannelRecreateResult(true, newChannel.CreateCallInvoker(), newChannel.Target, default, newChannel); } // Recreator returned the same instance — nothing to swap. 
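To make the new ownership contract concrete, here is a reduced model of the handoff. AsyncDisposable is simplified to a delegate-wrapping struct for the sketch (the shipped type differs), and the point is that a recreate result carrying a default disposable leaves exactly one disposal of the prior worker-owned channel:

```csharp
using System;
using System.Threading.Tasks;

// Reduced stand-in for the SDK's AsyncDisposable value type, for illustration only.
readonly struct AsyncDisposable
{
    readonly Func<ValueTask>? dispose;

    public AsyncDisposable(Func<ValueTask> dispose) => this.dispose = dispose;

    public ValueTask DisposeAsync() => this.dispose?.Invoke() ?? default;
}

static class OwnershipHandoffSketch
{
    static async Task Main()
    {
        int disposeCalls = 0;
        AsyncDisposable workerOwned = new(() =>
        {
            disposeCalls++;
            return ValueTask.CompletedTask;
        });

        // Recreate succeeded: the result carries a DEFAULT disposable because the
        // recreator now owns the replacement channel's lifetime.
        AsyncDisposable fromRecreateResult = default;

        AsyncDisposable previous = workerOwned;  // capture before installing new state
        workerOwned = fromRecreateResult;        // worker no longer owns a channel
        await previous.DisposeAsync();           // prior channel disposed here, once
        await workerOwned.DisposeAsync();        // no-op

        Console.WriteLine(disposeCalls);         // prints 1
    }
}
```

The DoesNotCarryForwardOldDisposable test added below asserts exactly this single-disposal behavior against the real types.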
diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index 075eacb80..ba0f1ba59 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -124,6 +124,48 @@ public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningSameChanne } } + [Fact] + public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningDifferentChannel_DoesNotCarryForwardOldDisposable() + { + // Arrange + GrpcChannel currentChannel = GrpcChannel.ForAddress("http://localhost:5004"); + GrpcChannel recreatedChannel = GrpcChannel.ForAddress("http://localhost:5005"); + GrpcDurableTaskWorkerOptions grpcOptions = new() + { + Channel = currentChannel, + }; + grpcOptions.SetChannelRecreator((channel, ct) => Task.FromResult(recreatedChannel)); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions); + int disposeCalls = 0; + AsyncDisposable currentWorkerOwnedDisposable = new(() => + { + Interlocked.Increment(ref disposeCalls); + return ValueTask.CompletedTask; + }); + + try + { + // Act + object result = await InvokeTryRecreateChannelAsync(worker, currentWorkerOwnedDisposable, currentChannel); + AsyncDisposable newDisposable = GetResultProperty(result, "NewWorkerOwnedDisposable"); + + // Simulate the outer worker handoff. + await currentWorkerOwnedDisposable.DisposeAsync(); + await newDisposable.DisposeAsync(); + + // Assert + GetResultProperty(result, "Recreated").Should().BeTrue(); + GetResultProperty(result, "NewChannel").Should().BeSameAs(recreatedChannel); + Volatile.Read(ref disposeCalls).Should().Be(1); + } + finally + { + await DisposeChannelAsync(currentChannel); + await DisposeChannelAsync(recreatedChannel); + } + } + [Fact] public async Task ConnectAsync_VeryLargeHelloDeadline_UsesUtcMaxValueDeadline() { @@ -193,7 +235,15 @@ static object CreateProcessor(GrpcDurableTaskWorker worker, P.TaskHubSidecarServ static async Task InvokeTryRecreateChannelAsync(GrpcDurableTaskWorker worker, GrpcChannel currentChannel) { - object?[] args = { CancellationToken.None, default(AsyncDisposable), currentChannel }; + return await InvokeTryRecreateChannelAsync(worker, default, currentChannel); + } + + static async Task InvokeTryRecreateChannelAsync( + GrpcDurableTaskWorker worker, + AsyncDisposable currentWorkerOwnedDisposable, + GrpcChannel currentChannel) + { + object?[] args = { CancellationToken.None, currentWorkerOwnedDisposable, currentChannel }; Task task = (Task)TryRecreateChannelAsyncMethod.Invoke(worker, args)!; await task; return task.GetType().GetProperty("Result")!.GetValue(task)!; From c08076e9381e03d7d40e9edd2f456d7f38094aab Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 10:07:49 -0700 Subject: [PATCH 18/27] Simplify worker channel cache Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../DurableTaskSchedulerWorkerExtensions.cs | 110 ++++++++++-------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index f3c7fac15..69f4fa6fe 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
-using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; using System.Linq; @@ -112,7 +111,7 @@ static void ConfigureSchedulerOptions( sealed class ConfigureGrpcChannel : IConfigureNamedOptions, IAsyncDisposable { readonly IOptionsMonitor schedulerOptions; - readonly ConcurrentDictionary> channels = new(); + readonly Dictionary channels = new(); readonly object syncRoot = new(); volatile int disposed; @@ -138,15 +137,6 @@ public ConfigureGrpcChannel(IOptionsMonitor s /// The options instance to configure. public void Configure(string? name, GrpcDurableTaskWorkerOptions options) { -#if NET7_0_OR_GREATER - ObjectDisposedException.ThrowIf(this.disposed == 1, this); -#else - if (this.disposed == 1) - { - throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); - } -#endif - string optionsName = name ?? Options.DefaultName; DurableTaskSchedulerWorkerOptions source = this.schedulerOptions.Get(optionsName); @@ -156,9 +146,42 @@ public void Configure(string? name, GrpcDurableTaskWorkerOptions options) // Use a delimiter character (\u001F) that will not appear in typical endpoint URIs. string credentialType = source.Credential?.GetType().FullName ?? "null"; string cacheKey = $"{optionsName}\u001F{source.EndpointAddress}\u001F{source.TaskHubName}\u001F{source.ResourceId}\u001F{credentialType}\u001F{source.AllowInsecureCredentials}\u001F{source.WorkerId}"; - options.Channel = this.channels.GetOrAdd( - cacheKey, - _ => new Lazy(source.CreateChannel, LazyThreadSafetyMode.PublicationOnly)).Value; + GrpcChannel? newChannel = null; + bool disposeNewChannel = false; + lock (this.syncRoot) + { +#if NET7_0_OR_GREATER + ObjectDisposedException.ThrowIf(this.disposed == 1, this); +#else + if (this.disposed == 1) + { + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } +#endif + + if (!this.channels.TryGetValue(cacheKey, out GrpcChannel? channel)) + { + newChannel = source.CreateChannel(); + if (this.disposed == 1) + { + disposeNewChannel = true; + } + else + { + channel = newChannel; + this.channels.Add(cacheKey, channel); + } + } + + options.Channel = channel; + } + + if (disposeNewChannel) + { + newChannel!.Dispose(); + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } + options.SetChannelRecreator((oldChannel, ct) => this.RecreateChannelAsync(cacheKey, source, oldChannel, ct)); options.ConfigureForAzureManaged(); } @@ -172,56 +195,51 @@ async Task RecreateChannelAsync(string cacheKey, DurableTaskSchedul { cancellation.ThrowIfCancellationRequested(); - // Worker cache keys include WorkerId, so a single worker normally owns each cached entry. - // Recreate callbacks can outlive Configure(...), however, so still guard against disposal - // racing with channel publication back into the shared cache owner. - if (this.disposed == 1) - { - throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); - } - - // Materialize the replacement channel BEFORE publishing it so a CreateChannel failure leaves - // the current cache entry intact. - GrpcChannel newChannel = source.CreateChannel(); - Lazy newLazy = new(newChannel); - bool disposeNewChannel = false; GrpcChannel? cachedChannel = null; + GrpcChannel? 
newChannel = null; + bool disposeNewChannel = false; lock (this.syncRoot) { +#if NET7_0_OR_GREATER + ObjectDisposedException.ThrowIf(this.disposed == 1, this); +#else if (this.disposed == 1) { - disposeNewChannel = true; + throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); + } +#endif + + // Worker cache keys include WorkerId, so a single worker normally owns each cached entry. + // Still guard against a stale recreate callback that is racing a more recent successful swap. + if (this.channels.TryGetValue(cacheKey, out cachedChannel) + && !ReferenceEquals(cachedChannel, oldChannel)) + { + return cachedChannel; } - else if (this.channels.TryGetValue(cacheKey, out Lazy? currentLazy) - && currentLazy.IsValueCreated - && !ReferenceEquals(currentLazy.Value, oldChannel)) + + // Materialize the replacement channel only after we've established that this callback still + // corresponds to the currently cached worker channel. + newChannel = source.CreateChannel(); + if (this.disposed == 1) { - // The cache already moved past oldChannel (for example because this callback is stale). - // Keep the current winner and discard the newly-created replacement. disposeNewChannel = true; - cachedChannel = currentLazy.Value; } else { - this.channels[cacheKey] = newLazy; + this.channels[cacheKey] = newChannel; } } if (disposeNewChannel) { - await DisposeChannelAsync(newChannel).ConfigureAwait(false); - if (cachedChannel is not null) - { - return cachedChannel; - } - + await DisposeChannelAsync(newChannel!).ConfigureAwait(false); throw new ObjectDisposedException(nameof(ConfigureGrpcChannel)); } // Successful swap. Schedule graceful disposal of the old channel after a grace period // so any in-flight RPCs that already captured it can drain. _ = ScheduleDeferredDisposeAsync(oldChannel); - return newChannel; + return newChannel!; } static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) @@ -253,18 +271,18 @@ public async ValueTask DisposeAsync() return; } - List> channelsToDispose; + List channelsToDispose; lock (this.syncRoot) { - channelsToDispose = this.channels.Values.Where(lazy => lazy.IsValueCreated).ToList(); + channelsToDispose = this.channels.Values.ToList(); this.channels.Clear(); } - foreach (Lazy channel in channelsToDispose) + foreach (GrpcChannel channel in channelsToDispose) { try { - await DisposeChannelAsync(channel.Value).ConfigureAwait(false); + await DisposeChannelAsync(channel).ConfigureAwait(false); } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException From fd9381dfab9785e4ba4ce995390fe0a9a676004d Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 10:44:21 -0700 Subject: [PATCH 19/27] Fix worker recreate disposal timing Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 66 +++++++++++++--- .../Internal/InternalOptionsExtensions.cs | 20 ++--- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 78 +++++++++++++++++++ 3 files changed, 143 insertions(+), 21 deletions(-) diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index cf4c52f9b..252719386 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+using System.Diagnostics; using Microsoft.DurableTask.Worker.Hosting; using Microsoft.Extensions.Logging; using Microsoft.Extensions.Options; @@ -12,6 +13,8 @@ namespace Microsoft.DurableTask.Worker.Grpc; /// sealed partial class GrpcDurableTaskWorker : DurableTaskWorker { + static TimeSpan deferredDisposeGracePeriod = TimeSpan.FromSeconds(30); + readonly GrpcDurableTaskWorkerOptions grpcOptions; readonly DurableTaskWorkerOptions workerOptions; readonly IServiceProvider services; @@ -82,17 +85,12 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) ChannelRecreateResult result = await this.TryRecreateChannelAsync(stoppingToken, workerOwnedChannelDisposable, latestObservedChannel); if (result.Recreated) { - callInvoker = result.NewCallInvoker!; - address = result.NewAddress!; - latestObservedChannel = result.NewChannel; - AsyncDisposable previousDisposable = workerOwnedChannelDisposable; - workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; - this.logger.ChannelRecreated(address); - - // Dispose the prior worker-owned channel if we had one. Path 1 hands ownership of the - // replacement channel to the recreator, Path 2 installs a fresh worker-owned disposable, - // and Path 3 never recreates at all. - await previousDisposable.DisposeAsync(); + this.ApplySuccessfulRecreate( + result, + ref callInvoker, + ref address, + ref latestObservedChannel, + ref workerOwnedChannelDisposable); } // If we couldn't recreate (e.g., caller-owned CallInvoker), fall through and retry on the @@ -246,6 +244,52 @@ static async ValueTask ShutdownAndDisposeChannelAsync(GrpcChannel channel) } } + void ApplySuccessfulRecreate( + ChannelRecreateResult result, + ref CallInvoker callInvoker, + ref string address, + ref GrpcChannel? latestObservedChannel, + ref AsyncDisposable workerOwnedChannelDisposable) + { + callInvoker = result.NewCallInvoker!; + address = result.NewAddress!; + latestObservedChannel = result.NewChannel; + AsyncDisposable previousDisposable = workerOwnedChannelDisposable; + workerOwnedChannelDisposable = result.NewWorkerOwnedDisposable; + this.logger.ChannelRecreated(address); + + // Defer disposal of the prior worker-owned channel so background completion/abandon RPCs + // from the previous processor instance can drain before the transport is torn down. + // Path 1 hands ownership of the replacement channel to the recreator, Path 2 installs a + // fresh worker-owned disposable, and Path 3 never recreates at all. 
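+        // For example, with the default 30-second grace period a recreate at time T swaps the
+        // call invoker immediately, while the old transport is torn down no earlier than T+30s,
+        // giving completion/abandon RPCs already in flight on the old channel time to finish.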
+ _ = ScheduleDeferredDisposeAsync(previousDisposable, deferredDisposeGracePeriod); + } + + static async Task ScheduleDeferredDisposeAsync(AsyncDisposable disposable, TimeSpan delay) + { + try + { + if (delay > TimeSpan.Zero) + { + await Task.Delay(delay).ConfigureAwait(false); + } + + await disposable.DisposeAsync().ConfigureAwait(false); + } + catch (Exception ex) when (ex is not OutOfMemoryException + and not StackOverflowException + and not AccessViolationException + and not ThreadAbortException) + { + if (ex is not OperationCanceledException and not ObjectDisposedException) + { + Trace.TraceError( + "Unexpected exception while deferred-disposing gRPC channel in GrpcDurableTaskWorker.ScheduleDeferredDisposeAsync: {0}", + ex); + } + } + } + AsyncDisposable GetCallInvoker(out CallInvoker callInvoker, out string address) { if (this.grpcOptions.Channel is GrpcChannel c) diff --git a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs index c071123ca..b26b36cc6 100644 --- a/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs +++ b/src/Worker/Grpc/Internal/InternalOptionsExtensions.cs @@ -28,16 +28,16 @@ public static void ConfigureForAzureManaged(this GrpcDurableTaskWorkerOptions op options.Internal.InsertEntityUnlocksOnCompletion = true; } - /// - /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated - /// after repeated connect failures (e.g., because the backend was replaced and the existing channel - /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last - /// observed and must return its replacement. Implementations are responsible for publishing the - /// replacement channel and deferring disposal of the old channel so in-flight RPCs already using it - /// are not interrupted. - /// - /// The gRPC worker options. - /// The recreate callback. + /// + /// Sets a callback that the worker invokes when the underlying gRPC channel needs to be recreated + /// after repeated connect failures (e.g., because the backend was replaced and the existing channel + /// is wedged on a half-open HTTP/2 connection). The callback receives the channel the worker last + /// observed and must return its replacement. Implementations are responsible for publishing the + /// replacement channel and deferring disposal of the old channel so in-flight RPCs already using it + /// are not interrupted. + /// + /// The gRPC worker options. + /// The recreate callback. /// /// This is an internal API that supports the DurableTask infrastructure and not subject to /// the same compatibility standards as public APIs. 
It may be changed or removed without notice in diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index ba0f1ba59..a0189d99e 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -19,6 +19,10 @@ public class GrpcDurableTaskWorkerTests const string Category = "Microsoft.DurableTask.Worker.Grpc"; static readonly MethodInfo ExecuteAsyncMethod = typeof(GrpcDurableTaskWorker) .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; + static readonly MethodInfo ApplySuccessfulRecreateMethod = typeof(GrpcDurableTaskWorker) + .GetMethod("ApplySuccessfulRecreate", BindingFlags.Instance | BindingFlags.NonPublic)!; + static readonly FieldInfo DeferredDisposeGracePeriodField = typeof(GrpcDurableTaskWorker) + .GetField("deferredDisposeGracePeriod", BindingFlags.Static | BindingFlags.NonPublic)!; static readonly MethodInfo ProcessorConnectAsyncMethod = typeof(GrpcDurableTaskWorker) .GetNestedType("Processor", BindingFlags.NonPublic)! .GetMethod("ConnectAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; @@ -166,6 +170,64 @@ public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningDifferentC } } + [Fact] + public async Task ApplySuccessfulRecreate_DefersDisposalOfPreviousWorkerOwnedChannel() + { + // Arrange + GrpcChannel currentChannel = GrpcChannel.ForAddress("http://localhost:5004"); + GrpcChannel recreatedChannel = GrpcChannel.ForAddress("http://localhost:5005"); + GrpcDurableTaskWorkerOptions grpcOptions = new() + { + Channel = currentChannel, + }; + grpcOptions.SetChannelRecreator((channel, ct) => Task.FromResult(recreatedChannel)); + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions); + + int disposeCalls = 0; + TaskCompletionSource disposalObserved = new(TaskCreationOptions.RunContinuationsAsynchronously); + AsyncDisposable disposable = new(() => + { + Interlocked.Increment(ref disposeCalls); + disposalObserved.TrySetResult(); + return ValueTask.CompletedTask; + }); + + TimeSpan originalGracePeriod = (TimeSpan)DeferredDisposeGracePeriodField.GetValue(null)!; + CallInvoker callInvoker = currentChannel.CreateCallInvoker(); + string address = currentChannel.Target; + GrpcChannel? 
latestObservedChannel = currentChannel; + AsyncDisposable workerOwnedChannelDisposable = disposable; + + try + { + DeferredDisposeGracePeriodField.SetValue(null, TimeSpan.FromMilliseconds(100)); + object result = await InvokeTryRecreateChannelAsync(worker, disposable, currentChannel); + + // Act + InvokeApplySuccessfulRecreate( + worker, + result, + ref callInvoker, + ref address, + ref latestObservedChannel, + ref workerOwnedChannelDisposable); + + // Assert + disposalObserved.Task.IsCompleted.Should().BeFalse(); + Volatile.Read(ref disposeCalls).Should().Be(0); + address.Should().Be(recreatedChannel.Target); + latestObservedChannel.Should().BeSameAs(recreatedChannel); + await disposalObserved.Task.WaitAsync(TimeSpan.FromSeconds(5)); + Volatile.Read(ref disposeCalls).Should().Be(1); + } + finally + { + DeferredDisposeGracePeriodField.SetValue(null, originalGracePeriod); + await DisposeChannelAsync(currentChannel); + await DisposeChannelAsync(recreatedChannel); + } + } + [Fact] public async Task ConnectAsync_VeryLargeHelloDeadline_UsesUtcMaxValueDeadline() { @@ -233,6 +295,22 @@ static object CreateProcessor(GrpcDurableTaskWorker worker, P.TaskHubSidecarServ return (AsyncServerStreamingCall)task.GetType().GetProperty("Result")!.GetValue(task)!; } + static void InvokeApplySuccessfulRecreate( + GrpcDurableTaskWorker worker, + object result, + ref CallInvoker callInvoker, + ref string address, + ref GrpcChannel? latestObservedChannel, + ref AsyncDisposable workerOwnedChannelDisposable) + { + object?[] args = { result, callInvoker, address, latestObservedChannel, workerOwnedChannelDisposable }; + ApplySuccessfulRecreateMethod.Invoke(worker, args); + callInvoker = (CallInvoker)args[1]!; + address = (string)args[2]!; + latestObservedChannel = (GrpcChannel?)args[3]; + workerOwnedChannelDisposable = (AsyncDisposable)args[4]!; + } + static async Task InvokeTryRecreateChannelAsync(GrpcDurableTaskWorker worker, GrpcChannel currentChannel) { return await InvokeTryRecreateChannelAsync(worker, default, currentChannel); From dc243e180d8d7d9c73d17898c0d80f9249fb9daf Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 12:54:46 -0700 Subject: [PATCH 20/27] Fix continue-as-new event carryover Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Dispatcher/TaskOrchestrationDispatcher.cs | 16 +++++++ .../Shims/TaskOrchestrationContextWrapper.cs | 48 ++++++++++++++----- src/Worker/Grpc/GrpcDurableTaskWorker.cs | 8 ++-- .../TaskOrchestrationContextWrapperTests.cs | 36 +++++++++++++- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 13 ++--- 5 files changed, 97 insertions(+), 24 deletions(-) diff --git a/src/InProcessTestHost/Sidecar/Dispatcher/TaskOrchestrationDispatcher.cs b/src/InProcessTestHost/Sidecar/Dispatcher/TaskOrchestrationDispatcher.cs index cc2812a83..f6bafb450 100644 --- a/src/InProcessTestHost/Sidecar/Dispatcher/TaskOrchestrationDispatcher.cs +++ b/src/InProcessTestHost/Sidecar/Dispatcher/TaskOrchestrationDispatcher.cs @@ -149,6 +149,22 @@ protected override async Task ExecuteWorkItemAsync(TaskOrchestrationWorkItem wor out bool continueAsNew); if (continueAsNew) { + // Self-targeted external events emitted during ContinueAsNew (for example by + // preserveUnprocessedEvents) must be carried into the new runtime state before + // we clear the per-iteration message lists. Otherwise they are dropped here and + // the next generation starts without the buffered events it is expecting. 
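+                    // Concretely: when the orchestrator calls ContinueAsNew with
+                    // preserveUnprocessedEvents enabled while an external event is still buffered,
+                    // the wrapper re-sends that event to this same instance. The loop below picks
+                    // up those self-targeted EventRaised messages and folds them into the new
+                    // execution's runtime state before the message lists are cleared.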
+ foreach (TaskMessage message in orchestratorMessages) + { + if (message.Event is EventRaisedEvent eventRaised + && string.Equals( + message.OrchestrationInstance.InstanceId, + workItem.InstanceId, + StringComparison.Ordinal)) + { + workItem.OrchestrationRuntimeState.AddEvent(eventRaised); + } + } + // The previous execution is being replaced by a new one. Clear any // accumulated messages from the old execution so they are not // re-enqueued when the work item completes. Without this, stale diff --git a/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs b/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs index bfbf1e47f..e91ba4261 100644 --- a/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs +++ b/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs @@ -30,6 +30,7 @@ sealed partial class TaskOrchestrationContextWrapper : TaskOrchestrationContext int newGuidCounter; object? customStatus; + bool preserveUnprocessedEventsOnContinueAsNew; TaskOrchestrationEntityContext? entityFeature; /// @@ -349,24 +350,31 @@ public override void ContinueAsNew(ContinueAsNewOptions options) { Check.NotNull(options); - if (!string.IsNullOrWhiteSpace(options.NewVersion)) + this.preserveUnprocessedEventsOnContinueAsNew = options.PreserveUnprocessedEvents; + + try { - this.innerContext.ContinueAsNew(options.NewVersion, options.NewInput); + if (!string.IsNullOrWhiteSpace(options.NewVersion)) + { + this.innerContext.ContinueAsNew(options.NewVersion, options.NewInput); + } + else + { + this.innerContext.ContinueAsNew(options.NewInput); + } } - else + catch { - this.innerContext.ContinueAsNew(options.NewInput); + this.preserveUnprocessedEventsOnContinueAsNew = false; + throw; } if (options.PreserveUnprocessedEvents) { // Send all the buffered external events to ourself. - OrchestrationInstance instance = new() { InstanceId = this.InstanceId }; foreach ((string eventName, string eventPayload) in this.externalEventBuffer.TakeAll()) { -#pragma warning disable CS0618 // Type or member is obsolete -- 'internal' usage. - this.innerContext.SendEvent(instance, eventName, new RawInput(eventPayload)); -#pragma warning restore CS0618 // Type or member is obsolete + this.ForwardBufferedExternalEvent(eventName, eventPayload); } } } @@ -477,12 +485,30 @@ internal void CompleteExternalEvent(string eventName, string rawEventPayload) } else { - // The orchestrator isn't waiting for this event (yet?). Save it in case - // the orchestrator wants it later. - this.externalEventBuffer.Add(eventName, rawEventPayload); + if (this.preserveUnprocessedEventsOnContinueAsNew) + { + // ContinueAsNew has already been scheduled with event preservation enabled. + // Forward late-arriving events directly to the next execution instead of buffering + // them on the current wrapper instance, which is about to be discarded. + this.ForwardBufferedExternalEvent(eventName, rawEventPayload); + } + else + { + // The orchestrator isn't waiting for this event (yet?). Save it in case + // the orchestrator wants it later. + this.externalEventBuffer.Add(eventName, rawEventPayload); + } } } + void ForwardBufferedExternalEvent(string eventName, string eventPayload) + { + OrchestrationInstance instance = new() { InstanceId = this.InstanceId }; +#pragma warning disable CS0618 // Type or member is obsolete -- 'internal' usage. + this.innerContext.SendEvent(instance, eventName, new RawInput(eventPayload)); +#pragma warning restore CS0618 // Type or member is obsolete + } + /// /// Gets the serialized custom status. 
/// diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.cs index 252719386..a200cedbf 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.cs @@ -13,7 +13,7 @@ namespace Microsoft.DurableTask.Worker.Grpc; /// sealed partial class GrpcDurableTaskWorker : DurableTaskWorker { - static TimeSpan deferredDisposeGracePeriod = TimeSpan.FromSeconds(30); + static readonly TimeSpan DeferredDisposeGracePeriod = TimeSpan.FromSeconds(30); readonly GrpcDurableTaskWorkerOptions grpcOptions; readonly DurableTaskWorkerOptions workerOptions; @@ -90,7 +90,8 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) ref callInvoker, ref address, ref latestObservedChannel, - ref workerOwnedChannelDisposable); + ref workerOwnedChannelDisposable, + DeferredDisposeGracePeriod); } // If we couldn't recreate (e.g., caller-owned CallInvoker), fall through and retry on the @@ -249,7 +250,8 @@ void ApplySuccessfulRecreate( ref CallInvoker callInvoker, ref string address, ref GrpcChannel? latestObservedChannel, - ref AsyncDisposable workerOwnedChannelDisposable) + ref AsyncDisposable workerOwnedChannelDisposable, + TimeSpan deferredDisposeGracePeriod) { callInvoker = result.NewCallInvoker!; address = result.NewAddress!; diff --git a/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs b/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs index 06df43170..fa894e9bb 100644 --- a/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs +++ b/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.Reflection; using DurableTask.Core; using Microsoft.Extensions.Logging.Abstractions; @@ -8,6 +9,9 @@ namespace Microsoft.DurableTask.Worker.Shims; public class TaskOrchestrationContextWrapperTests { + static readonly MethodInfo CompleteExternalEventMethod = typeof(TaskOrchestrationContextWrapper) + .GetMethod("CompleteExternalEvent", BindingFlags.Instance | BindingFlags.NonPublic)!; + [Fact] public void Ctor_NullParent_Populates() { @@ -103,6 +107,30 @@ public void ContinueAsNew_WithOptionsNoVersion_CallsInnerContextWithoutVersion() innerContext.LastContinueAsNewVersion.Should().BeNull(); } + [Fact] + public void ContinueAsNew_WithPreserveUnprocessedEvents_ForwardsLateArrivingEventsToNextExecution() + { + // Arrange + TrackingOrchestrationContext innerContext = new(); + OrchestrationInvocationContext invocationContext = new("Test", new(), NullLoggerFactory.Instance, null); + TaskOrchestrationContextWrapper wrapper = new(innerContext, invocationContext, "input"); + + // Act + wrapper.ContinueAsNew("new-input", preserveUnprocessedEvents: true); + InvokeCompleteExternalEvent(wrapper, "Event", "\"payload\""); + + // Assert + innerContext.SentEvents.Should().ContainSingle(); + innerContext.SentEvents[0].InstanceId.Should().Be(wrapper.InstanceId); + innerContext.SentEvents[0].EventName.Should().Be("Event"); + innerContext.LastContinueAsNewInput.Should().Be("new-input"); + } + + static void InvokeCompleteExternalEvent(TaskOrchestrationContextWrapper wrapper, string eventName, string rawEventPayload) + { + CompleteExternalEventMethod.Invoke(wrapper, [eventName, rawEventPayload]); + } + sealed class TrackingOrchestrationContext : OrchestrationContext { public TrackingOrchestrationContext() @@ -118,6 +146,8 @@ public TrackingOrchestrationContext() public string? 
LastContinueAsNewVersion { get; private set; } + public List<(string InstanceId, string EventName, object EventData)> SentEvents { get; } = []; + public override void ContinueAsNew(object input) { this.LastContinueAsNewInput = input; @@ -149,7 +179,9 @@ public override Task ScheduleTask(string name, string version, => throw new NotImplementedException(); public override void SendEvent(OrchestrationInstance orchestrationInstance, string eventName, object eventData) - => throw new NotImplementedException(); + { + this.SentEvents.Add((orchestrationInstance.InstanceId, eventName, eventData)); + } } class TestOrchestrationContext : OrchestrationContext @@ -210,4 +242,4 @@ public override void SendEvent(OrchestrationInstance orchestrationInstance, stri throw new NotImplementedException(); } } -} \ No newline at end of file +} diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index a0189d99e..e63905887 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -21,8 +21,6 @@ public class GrpcDurableTaskWorkerTests .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; static readonly MethodInfo ApplySuccessfulRecreateMethod = typeof(GrpcDurableTaskWorker) .GetMethod("ApplySuccessfulRecreate", BindingFlags.Instance | BindingFlags.NonPublic)!; - static readonly FieldInfo DeferredDisposeGracePeriodField = typeof(GrpcDurableTaskWorker) - .GetField("deferredDisposeGracePeriod", BindingFlags.Static | BindingFlags.NonPublic)!; static readonly MethodInfo ProcessorConnectAsyncMethod = typeof(GrpcDurableTaskWorker) .GetNestedType("Processor", BindingFlags.NonPublic)! .GetMethod("ConnectAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; @@ -192,7 +190,6 @@ public async Task ApplySuccessfulRecreate_DefersDisposalOfPreviousWorkerOwnedCha return ValueTask.CompletedTask; }); - TimeSpan originalGracePeriod = (TimeSpan)DeferredDisposeGracePeriodField.GetValue(null)!; CallInvoker callInvoker = currentChannel.CreateCallInvoker(); string address = currentChannel.Target; GrpcChannel? latestObservedChannel = currentChannel; @@ -200,7 +197,6 @@ public async Task ApplySuccessfulRecreate_DefersDisposalOfPreviousWorkerOwnedCha try { - DeferredDisposeGracePeriodField.SetValue(null, TimeSpan.FromMilliseconds(100)); object result = await InvokeTryRecreateChannelAsync(worker, disposable, currentChannel); // Act @@ -210,7 +206,8 @@ public async Task ApplySuccessfulRecreate_DefersDisposalOfPreviousWorkerOwnedCha ref callInvoker, ref address, ref latestObservedChannel, - ref workerOwnedChannelDisposable); + ref workerOwnedChannelDisposable, + TimeSpan.FromMilliseconds(100)); // Assert disposalObserved.Task.IsCompleted.Should().BeFalse(); @@ -222,7 +219,6 @@ public async Task ApplySuccessfulRecreate_DefersDisposalOfPreviousWorkerOwnedCha } finally { - DeferredDisposeGracePeriodField.SetValue(null, originalGracePeriod); await DisposeChannelAsync(currentChannel); await DisposeChannelAsync(recreatedChannel); } @@ -301,9 +297,10 @@ static void InvokeApplySuccessfulRecreate( ref CallInvoker callInvoker, ref string address, ref GrpcChannel? 
latestObservedChannel, - ref AsyncDisposable workerOwnedChannelDisposable) + ref AsyncDisposable workerOwnedChannelDisposable, + TimeSpan deferredDisposeGracePeriod) { - object?[] args = { result, callInvoker, address, latestObservedChannel, workerOwnedChannelDisposable }; + object?[] args = { result, callInvoker, address, latestObservedChannel, workerOwnedChannelDisposable, deferredDisposeGracePeriod }; ApplySuccessfulRecreateMethod.Invoke(worker, args); callInvoker = (CallInvoker)args[1]!; address = (string)args[2]!; From 18e23a3f9f12d3f5f31df130209d5d429a251a7f Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 13:23:08 -0700 Subject: [PATCH 21/27] Add worker disconnect coverage tests Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc.Tests/GrpcDurableTaskWorkerTests.cs | 377 ++++++++++++++++++ 1 file changed, 377 insertions(+) diff --git a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs index e63905887..db6c98da5 100644 --- a/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs +++ b/test/Worker/Grpc.Tests/GrpcDurableTaskWorkerTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. +using System.IO; using System.Reflection; using Google.Protobuf.WellKnownTypes; using Grpc.Core; @@ -21,6 +22,9 @@ public class GrpcDurableTaskWorkerTests .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; static readonly MethodInfo ApplySuccessfulRecreateMethod = typeof(GrpcDurableTaskWorker) .GetMethod("ApplySuccessfulRecreate", BindingFlags.Instance | BindingFlags.NonPublic)!; + static readonly MethodInfo ProcessorExecuteAsyncMethod = typeof(GrpcDurableTaskWorker) + .GetNestedType("Processor", BindingFlags.NonPublic)! + .GetMethod("ExecuteAsync", BindingFlags.Instance | BindingFlags.Public)!; static readonly MethodInfo ProcessorConnectAsyncMethod = typeof(GrpcDurableTaskWorker) .GetNestedType("Processor", BindingFlags.NonPublic)! 
.GetMethod("ConnectAsync", BindingFlags.Instance | BindingFlags.NonPublic)!; @@ -90,6 +94,246 @@ public async Task ExecuteAsync_ConnectFailureThreshold_RecreatesConfiguredChanne } } + [Fact] + public async Task ExecuteAsync_TransportResetDuringHello_RecreatesConfiguredChannel() + { + // Arrange + CallbackHttpMessageHandler initialHandler = new((request, cancellationToken) => + throw new HttpRequestException( + "connection reset by peer", + new IOException("An existing connection was forcibly closed by the remote host."))); + TaskCompletionSource recreatedTransportUsed = new(TaskCreationOptions.RunContinuationsAsynchronously); + CallbackHttpMessageHandler recreatedHandler = new(async (request, cancellationToken) => + { + recreatedTransportUsed.TrySetResult(); + await Task.Delay(Timeout.Infinite, cancellationToken); + return CreateFailureResponse(StatusCode.Cancelled, "recreated transport cancelled"); + }); + + GrpcChannel currentChannel = CreateChannel("http://transport-reset.worker.test", initialHandler); + GrpcChannel recreatedChannel = CreateChannel("http://recreated-after-reset.worker.test", recreatedHandler); + GrpcDurableTaskWorkerOptions grpcOptions = new() + { + Channel = currentChannel, + }; + grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + using CancellationTokenSource stoppingToken = new(); + int recreatorCalls = 0; + grpcOptions.SetChannelRecreator((channel, ct) => + { + recreatorCalls++; + return Task.FromResult(recreatedChannel); + }); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + Task? executeTask = null; + + try + { + // Act + executeTask = InvokeExecuteAsync(worker, stoppingToken.Token); + await recreatedTransportUsed.Task.WaitAsync(TimeSpan.FromSeconds(5)); + stoppingToken.Cancel(); + await executeTask; + + // Assert + recreatorCalls.Should().Be(1); + initialHandler.CallCount.Should().Be(1); + recreatedHandler.CallCount.Should().Be(1); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); + logs!.Should().Contain(log => log.Message.Contains("gRPC channel to backend has been recreated")); + } + finally + { + stoppingToken.Cancel(); + if (executeTask is not null) + { + await executeTask; + } + + await DisposeChannelAsync(currentChannel); + await DisposeChannelAsync(recreatedChannel); + } + } + + [Fact] + public async Task ProcessorExecuteAsync_SilentDisconnectBeforeFirstMessage_ReturnsChannelRecreateRequested() + { + // Arrange + GrpcDurableTaskWorkerOptions grpcOptions = new(); + grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + grpcOptions.Internal.SilentDisconnectTimeout = TimeSpan.FromMilliseconds(100); + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + ScriptedWorkerCallInvoker callInvoker = new( + helloFactory: static (callNumber, options) => CreateUnaryCall(Task.FromResult(new Empty())), + getWorkItemsFactory: static (callNumber, options) => CreateServerStreamingCall( + new HangingAsyncStreamReader(throwAsRpc: true))); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + object processor = CreateProcessor(worker, new P.TaskHubSidecarService.TaskHubSidecarServiceClient(callInvoker)); + + // Act + ProcessorExitReason reason = await InvokeProcessorExecuteAsync(processor, CancellationToken.None); + + // Assert + reason.Should().Be(ProcessorExitReason.ChannelRecreateRequested); + callInvoker.HelloCallCount.Should().Be(1); + callInvoker.GetWorkItemsCallCount.Should().Be(1); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); + logs!.Should().Contain(log => log.Message.Contains("Channel to backend has stopped receiving traffic")); + logs.Should().Contain(log => log.Message.Contains("Recreating gRPC channel to backend")); + } + + [Fact] + public async Task ProcessorExecuteAsync_GracefulDrainAfterFirstMessage_ReconnectsWithoutChannelRecreate() + { + // Arrange + GrpcDurableTaskWorkerOptions grpcOptions = new(); + grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + grpcOptions.Internal.SilentDisconnectTimeout = TimeSpan.FromSeconds(5); + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + TaskCompletionSource secondStreamOpened = new(TaskCreationOptions.RunContinuationsAsynchronously); + using CancellationTokenSource stoppingToken = new(); + + ScriptedWorkerCallInvoker callInvoker = new( + helloFactory: static (callNumber, options) => CreateUnaryCall(Task.FromResult(new Empty())), + getWorkItemsFactory: (callNumber, options) => + { + if (callNumber == 1) + { + return CreateServerStreamingCall( + new SequenceAsyncStreamReader(new P.WorkItem { HealthPing = new P.HealthPing() })); + } + + secondStreamOpened.TrySetResult(); + return CreateServerStreamingCall( + new HangingAsyncStreamReader(throwAsRpc: false)); + }); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + object processor = CreateProcessor(worker, new P.TaskHubSidecarService.TaskHubSidecarServiceClient(callInvoker)); + + // Act + Task executeTask = InvokeProcessorExecuteAsync(processor, stoppingToken.Token); + await secondStreamOpened.Task.WaitAsync(TimeSpan.FromSeconds(5)); + stoppingToken.Cancel(); + ProcessorExitReason reason = await executeTask; + + // Assert + reason.Should().Be(ProcessorExitReason.Shutdown); + callInvoker.HelloCallCount.Should().BeGreaterThanOrEqualTo(2); + callInvoker.GetWorkItemsCallCount.Should().BeGreaterThanOrEqualTo(2); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? 
logs).Should().BeTrue(); + logs!.Should().Contain(log => log.Message.Contains("Work-item stream ended by the backend")); + logs.Should().NotContain(log => log.Message.Contains("Recreating gRPC channel to backend")); + } + + [Fact] + public async Task ProcessorExecuteAsync_HelloDeadlineExceeded_ReturnsChannelRecreateRequested() + { + // Arrange + GrpcDurableTaskWorkerOptions grpcOptions = new(); + grpcOptions.SetHelloDeadline(TimeSpan.FromMilliseconds(123)); + grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + ScriptedWorkerCallInvoker callInvoker = new( + helloFactory: static (callNumber, options) => CreateUnaryCall( + Task.FromException(new RpcException(new Status(StatusCode.DeadlineExceeded, "hello timed out")))), + getWorkItemsFactory: static (callNumber, options) => throw new InvalidOperationException("GetWorkItems should not be called.")); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + object processor = CreateProcessor(worker, new P.TaskHubSidecarService.TaskHubSidecarServiceClient(callInvoker)); + + // Act + ProcessorExitReason reason = await InvokeProcessorExecuteAsync(processor, CancellationToken.None); + + // Assert + reason.Should().Be(ProcessorExitReason.ChannelRecreateRequested); + callInvoker.HelloCallCount.Should().Be(1); + callInvoker.GetWorkItemsCallCount.Should().Be(0); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); + logs!.Should().Contain(log => log.Message.Contains("Hello handshake to backend timed out after 00:00:00.123")); + logs.Should().Contain(log => log.Message.Contains("Recreating gRPC channel to backend")); + } + + [Theory] + [InlineData(StatusCode.Cancelled, "Durable Task gRPC worker has disconnected from gRPC server.")] + [InlineData(StatusCode.Unauthenticated, "Authentication failed when connecting to backend. Will retry.")] + [InlineData(StatusCode.NotFound, "Task hub NotFound. 
Will continue retrying.")] + public async Task ProcessorExecuteAsync_NonPoisonHandshakeFailures_RetryWithoutChannelRecreate( + StatusCode statusCode, + string expectedLogMessage) + { + // Arrange + GrpcDurableTaskWorkerOptions grpcOptions = new(); + grpcOptions.Internal.ChannelRecreateFailureThreshold = 1; + grpcOptions.Internal.ReconnectBackoffBase = TimeSpan.Zero; + grpcOptions.Internal.ReconnectBackoffCap = TimeSpan.Zero; + + DurableTaskWorkerOptions workerOptions = new() + { + Logging = { UseLegacyCategories = false }, + }; + TestLogProvider logProvider = new(new NullOutput()); + using CancellationTokenSource stoppingToken = new(); + + ScriptedWorkerCallInvoker callInvoker = new( + helloFactory: (callNumber, options) => + { + if (callNumber == 2) + { + stoppingToken.Cancel(); + } + + return CreateUnaryCall( + Task.FromException(new RpcException(new Status(statusCode, "hello failed")))); + }, + getWorkItemsFactory: static (callNumber, options) => throw new InvalidOperationException("GetWorkItems should not be called.")); + + GrpcDurableTaskWorker worker = CreateWorker(grpcOptions, workerOptions, new SimpleLoggerFactory(logProvider)); + object processor = CreateProcessor(worker, new P.TaskHubSidecarService.TaskHubSidecarServiceClient(callInvoker)); + + // Act + ProcessorExitReason reason = await InvokeProcessorExecuteAsync(processor, stoppingToken.Token); + + // Assert + reason.Should().Be(ProcessorExitReason.Shutdown); + callInvoker.HelloCallCount.Should().BeGreaterThanOrEqualTo(2); + callInvoker.GetWorkItemsCallCount.Should().Be(0); + logProvider.TryGetLogs(Category, out IReadOnlyCollection? logs).Should().BeTrue(); + logs!.Should().Contain(log => log.Message.Contains(expectedLogMessage)); + logs.Should().NotContain(log => log.Message.Contains("Recreating gRPC channel to backend")); + } + [Fact] public async Task TryRecreateChannelAsync_ConfiguredRecreatorReturningSameChannel_DoesNotRecreate() { @@ -291,6 +535,13 @@ static object CreateProcessor(GrpcDurableTaskWorker worker, P.TaskHubSidecarServ return (AsyncServerStreamingCall)task.GetType().GetProperty("Result")!.GetValue(task)!; } + static async Task InvokeProcessorExecuteAsync(object processor, CancellationToken cancellationToken) + { + Task task = (Task)ProcessorExecuteAsyncMethod.Invoke(processor, new object?[] { cancellationToken })!; + await task; + return (ProcessorExitReason)task.GetType().GetProperty("Result")!.GetValue(task)!; + } + static void InvokeApplySuccessfulRecreate( GrpcDurableTaskWorker worker, object result, @@ -357,6 +608,26 @@ static HttpResponseMessage CreateFailureResponse(StatusCode statusCode, string d return response; } + static AsyncUnaryCall CreateUnaryCall(Task responseTask) + { + return new AsyncUnaryCall( + responseTask, + Task.FromResult(new Metadata()), + () => new Status(StatusCode.OK, string.Empty), + () => new Metadata(), + () => { }); + } + + static AsyncServerStreamingCall CreateServerStreamingCall(IAsyncStreamReader reader) + { + return new AsyncServerStreamingCall( + reader, + Task.FromResult(new Metadata()), + () => new Status(StatusCode.OK, string.Empty), + () => new Metadata(), + () => (reader as IDisposable)?.Dispose()); + } + sealed class CallbackHttpMessageHandler : HttpMessageHandler { readonly Func> callback; @@ -434,4 +705,110 @@ sealed class EmptyAsyncStreamReader : IAsyncStreamReader public Task MoveNext(CancellationToken cancellationToken) => Task.FromResult(false); } + + sealed class SequenceAsyncStreamReader : IAsyncStreamReader + { + readonly Queue items; + + public 
SequenceAsyncStreamReader(params T[] items) + { + this.items = new Queue(items); + } + + public T Current { get; private set; } = default!; + + public Task MoveNext(CancellationToken cancellationToken) + { + if (this.items.Count == 0) + { + return Task.FromResult(false); + } + + this.Current = this.items.Dequeue(); + return Task.FromResult(true); + } + } + + sealed class HangingAsyncStreamReader : IAsyncStreamReader + { + readonly bool throwAsRpc; + + public HangingAsyncStreamReader(bool throwAsRpc) + { + this.throwAsRpc = throwAsRpc; + } + + public T Current => default!; + + public async Task MoveNext(CancellationToken cancellationToken) + { + try + { + await Task.Delay(Timeout.Infinite, cancellationToken); + } + catch (OperationCanceledException) when (this.throwAsRpc) + { + throw new RpcException(new Status(StatusCode.Cancelled, "stream cancelled")); + } + + return false; + } + } + + sealed class ScriptedWorkerCallInvoker : CallInvoker + { + readonly Func> helloFactory; + readonly Func> getWorkItemsFactory; + int helloCallCount; + int getWorkItemsCallCount; + + public ScriptedWorkerCallInvoker( + Func> helloFactory, + Func> getWorkItemsFactory) + { + this.helloFactory = helloFactory; + this.getWorkItemsFactory = getWorkItemsFactory; + } + + public int HelloCallCount => Volatile.Read(ref this.helloCallCount); + + public int GetWorkItemsCallCount => Volatile.Read(ref this.getWorkItemsCallCount); + + public override TResponse BlockingUnaryCall(Method method, string? host, CallOptions options, TRequest request) + { + throw new NotSupportedException(); + } + + public override AsyncUnaryCall AsyncUnaryCall(Method method, string? host, CallOptions options, TRequest request) + { + if (method.FullName == "/TaskHubSidecarService/Hello") + { + AsyncUnaryCall call = this.helloFactory(Interlocked.Increment(ref this.helloCallCount), options); + return (AsyncUnaryCall)(object)call; + } + + throw new NotSupportedException($"Unexpected unary method {method.FullName}."); + } + + public override AsyncServerStreamingCall AsyncServerStreamingCall(Method method, string? host, CallOptions options, TRequest request) + { + if (method.FullName == "/TaskHubSidecarService/GetWorkItems") + { + AsyncServerStreamingCall call = this.getWorkItemsFactory(Interlocked.Increment(ref this.getWorkItemsCallCount), options); + return (AsyncServerStreamingCall)(object)call; + } + + throw new NotSupportedException($"Unexpected server-streaming method {method.FullName}."); + } + + public override AsyncClientStreamingCall AsyncClientStreamingCall(Method method, string? host, CallOptions options) + { + throw new NotSupportedException(); + } + + public override AsyncDuplexStreamingCall AsyncDuplexStreamingCall(Method method, string? 
host, CallOptions options) + { + throw new NotSupportedException(); + } + } } From dd9ad77f8e6cd39767841dbafbbbea38f1ab3608 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 13:31:11 -0700 Subject: [PATCH 22/27] Fix fatal deferred-dispose filters Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs | 2 ++ src/Client/Grpc/ChannelRecreatingCallInvoker.cs | 2 ++ src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs | 2 ++ 3 files changed, 6 insertions(+) diff --git a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs index aa7a80a78..efe79748c 100644 --- a/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs +++ b/src/Client/AzureManaged/DurableTaskSchedulerClientExtensions.cs @@ -264,6 +264,7 @@ static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { if (ex is not OperationCanceledException and not ObjectDisposedException) @@ -291,6 +292,7 @@ public async ValueTask DisposeAsync() } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { // Swallow disposal exceptions - disposal should be best-effort to ensure diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index fcf10023e..95b3d95ed 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -356,6 +356,7 @@ and not StackOverflowException } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { this.logger.ChannelRecreateFailed(ex); @@ -380,6 +381,7 @@ static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { if (ex is not OperationCanceledException and not ObjectDisposedException) diff --git a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs index 69f4fa6fe..3b9d4e55c 100644 --- a/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs +++ b/src/Worker/AzureManaged/DurableTaskSchedulerWorkerExtensions.cs @@ -252,6 +252,7 @@ static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { if (ex is not OperationCanceledException and not ObjectDisposedException) @@ -286,6 +287,7 @@ public async ValueTask DisposeAsync() } catch (Exception ex) when (ex is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { // Swallow disposal exceptions - disposal should be best-effort to ensure From 4d30b9b983eaa0d4e06b2bd17ae130b553ee6616 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 13:58:17 -0700 Subject: [PATCH 23/27] Fix client recreate dispose race Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Grpc/ChannelRecreatingCallInvoker.cs | 19 +++++- 
...DurableTaskClientChannelRecreationTests.cs | 58 +++++++++++++++++++ 2 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 95b3d95ed..13e11acd0 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -299,8 +299,7 @@ async Task RecreateAsync(int observedCount) // Link to the disposal CTS so DisposeAsync can promptly abort an in-flight recreate. // The 30s timeout keeps a wedged recreator from holding the single-flight slot indefinitely. - using CancellationTokenSource cts = CancellationTokenSource.CreateLinkedTokenSource(this.disposalCts.Token); - cts.CancelAfter(TimeSpan.FromSeconds(30)); + using CancellationTokenSource cts = this.CreateRecreateCancellationSource(); GrpcChannel newChannel = await this.recreator(current.Channel, cts.Token).ConfigureAwait(false); if (!ReferenceEquals(newChannel, current.Channel)) @@ -370,6 +369,22 @@ and not AccessViolationException } } + CancellationTokenSource CreateRecreateCancellationSource() + { + try + { + CancellationTokenSource cts = CancellationTokenSource.CreateLinkedTokenSource(this.disposalCts.Token); + cts.CancelAfter(TimeSpan.FromSeconds(30)); + return cts; + } + catch (ObjectDisposedException) when (Volatile.Read(ref this.disposed) != 0) + { + CancellationTokenSource cts = new(); + cts.Cancel(); + return cts; + } + } + static async Task ScheduleDeferredDisposeAsync(GrpcChannel channel) { try diff --git a/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs index df3487459..29d3e9105 100644 --- a/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs +++ b/test/Client/Grpc.Tests/GrpcDurableTaskClientChannelRecreationTests.cs @@ -117,6 +117,50 @@ public async Task GetCallInvoker_WithAddressAndRecreator_UsesWrapperThatOwnsCrea } } + [Fact] + public async Task CreateRecreateCancellationSource_WhenDisposedDuringRecreateWindow_ReturnsCanceledTokenSource() + { + // Arrange + GrpcChannel channel = GrpcChannel.ForAddress("http://disposed-race.client.test"); + GrpcDurableTaskClientOptions options = new() + { + Channel = channel, + }; + options.SetChannelRecreator((existingChannel, ct) => Task.FromResult(existingChannel)); + + try + { + (AsyncDisposable disposable, CallInvoker callInvoker) = InvokeGetCallInvoker(options); + + try + { + ChannelRecreatingCallInvoker wrapper = callInvoker.Should().BeOfType().Subject; + MethodInfo? method = typeof(ChannelRecreatingCallInvoker).GetMethod( + "CreateRecreateCancellationSource", + BindingFlags.Instance | BindingFlags.NonPublic); + method.Should().NotBeNull(); + + SetDisposed(wrapper, 1); + GetDisposalCancellationSource(wrapper).Dispose(); + + // Act + using CancellationTokenSource recreateCts = + (CancellationTokenSource)method!.Invoke(wrapper, Array.Empty())!; + + // Assert + recreateCts.IsCancellationRequested.Should().BeTrue(); + } + finally + { + await disposable.DisposeAsync(); + } + } + finally + { + await DisposeChannelAsync(channel); + } + } + [Theory] [InlineData(0, 0)] [InlineData(-1, 0)] @@ -160,6 +204,20 @@ static bool GetOwnsChannel(CallInvoker callInvoker) .GetValue(callInvoker)!; } + static CancellationTokenSource GetDisposalCancellationSource(CallInvoker callInvoker) + { + return (CancellationTokenSource)typeof(ChannelRecreatingCallInvoker) + .GetField("disposalCts", BindingFlags.Instance | BindingFlags.NonPublic)! 
+ .GetValue(callInvoker)!; + } + + static void SetDisposed(CallInvoker callInvoker, int value) + { + typeof(ChannelRecreatingCallInvoker) + .GetField("disposed", BindingFlags.Instance | BindingFlags.NonPublic)! + .SetValue(callInvoker, value); + } + static long InvokeToStopwatchTicks(TimeSpan interval) { return (long)ToStopwatchTicksMethod.Invoke(null, new object?[] { interval })!; From caf307b6a5745760762e35266ec4dc125cb2224b Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 14:10:41 -0700 Subject: [PATCH 24/27] Ignore local git worktrees Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index f49bbfa2f..7d510eec8 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ bld/ # Visual Studio 2015/2017 cache/options directory .vs/ +.worktrees/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ From c85234de8718fba1803c0443c7c5e77bf5b26e88 Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 14:14:37 -0700 Subject: [PATCH 25/27] Move wrapper changes to separate PR Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../Shims/TaskOrchestrationContextWrapper.cs | 48 +++++-------------- .../TaskOrchestrationContextWrapperTests.cs | 36 +------------- 2 files changed, 13 insertions(+), 71 deletions(-) diff --git a/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs b/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs index e91ba4261..bfbf1e47f 100644 --- a/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs +++ b/src/Worker/Core/Shims/TaskOrchestrationContextWrapper.cs @@ -30,7 +30,6 @@ sealed partial class TaskOrchestrationContextWrapper : TaskOrchestrationContext int newGuidCounter; object? customStatus; - bool preserveUnprocessedEventsOnContinueAsNew; TaskOrchestrationEntityContext? entityFeature; /// @@ -350,31 +349,24 @@ public override void ContinueAsNew(ContinueAsNewOptions options) { Check.NotNull(options); - this.preserveUnprocessedEventsOnContinueAsNew = options.PreserveUnprocessedEvents; - - try + if (!string.IsNullOrWhiteSpace(options.NewVersion)) { - if (!string.IsNullOrWhiteSpace(options.NewVersion)) - { - this.innerContext.ContinueAsNew(options.NewVersion, options.NewInput); - } - else - { - this.innerContext.ContinueAsNew(options.NewInput); - } + this.innerContext.ContinueAsNew(options.NewVersion, options.NewInput); } - catch + else { - this.preserveUnprocessedEventsOnContinueAsNew = false; - throw; + this.innerContext.ContinueAsNew(options.NewInput); } if (options.PreserveUnprocessedEvents) { // Send all the buffered external events to ourself. + OrchestrationInstance instance = new() { InstanceId = this.InstanceId }; foreach ((string eventName, string eventPayload) in this.externalEventBuffer.TakeAll()) { - this.ForwardBufferedExternalEvent(eventName, eventPayload); +#pragma warning disable CS0618 // Type or member is obsolete -- 'internal' usage. + this.innerContext.SendEvent(instance, eventName, new RawInput(eventPayload)); +#pragma warning restore CS0618 // Type or member is obsolete } } } @@ -485,30 +477,12 @@ internal void CompleteExternalEvent(string eventName, string rawEventPayload) } else { - if (this.preserveUnprocessedEventsOnContinueAsNew) - { - // ContinueAsNew has already been scheduled with event preservation enabled. 
- // Forward late-arriving events directly to the next execution instead of buffering - // them on the current wrapper instance, which is about to be discarded. - this.ForwardBufferedExternalEvent(eventName, rawEventPayload); - } - else - { - // The orchestrator isn't waiting for this event (yet?). Save it in case - // the orchestrator wants it later. - this.externalEventBuffer.Add(eventName, rawEventPayload); - } + // The orchestrator isn't waiting for this event (yet?). Save it in case + // the orchestrator wants it later. + this.externalEventBuffer.Add(eventName, rawEventPayload); } } - void ForwardBufferedExternalEvent(string eventName, string eventPayload) - { - OrchestrationInstance instance = new() { InstanceId = this.InstanceId }; -#pragma warning disable CS0618 // Type or member is obsolete -- 'internal' usage. - this.innerContext.SendEvent(instance, eventName, new RawInput(eventPayload)); -#pragma warning restore CS0618 // Type or member is obsolete - } - /// /// Gets the serialized custom status. /// diff --git a/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs b/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs index fa894e9bb..06df43170 100644 --- a/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs +++ b/test/Worker/Core.Tests/Shims/TaskOrchestrationContextWrapperTests.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. -using System.Reflection; using DurableTask.Core; using Microsoft.Extensions.Logging.Abstractions; @@ -9,9 +8,6 @@ namespace Microsoft.DurableTask.Worker.Shims; public class TaskOrchestrationContextWrapperTests { - static readonly MethodInfo CompleteExternalEventMethod = typeof(TaskOrchestrationContextWrapper) - .GetMethod("CompleteExternalEvent", BindingFlags.Instance | BindingFlags.NonPublic)!; - [Fact] public void Ctor_NullParent_Populates() { @@ -107,30 +103,6 @@ public void ContinueAsNew_WithOptionsNoVersion_CallsInnerContextWithoutVersion() innerContext.LastContinueAsNewVersion.Should().BeNull(); } - [Fact] - public void ContinueAsNew_WithPreserveUnprocessedEvents_ForwardsLateArrivingEventsToNextExecution() - { - // Arrange - TrackingOrchestrationContext innerContext = new(); - OrchestrationInvocationContext invocationContext = new("Test", new(), NullLoggerFactory.Instance, null); - TaskOrchestrationContextWrapper wrapper = new(innerContext, invocationContext, "input"); - - // Act - wrapper.ContinueAsNew("new-input", preserveUnprocessedEvents: true); - InvokeCompleteExternalEvent(wrapper, "Event", "\"payload\""); - - // Assert - innerContext.SentEvents.Should().ContainSingle(); - innerContext.SentEvents[0].InstanceId.Should().Be(wrapper.InstanceId); - innerContext.SentEvents[0].EventName.Should().Be("Event"); - innerContext.LastContinueAsNewInput.Should().Be("new-input"); - } - - static void InvokeCompleteExternalEvent(TaskOrchestrationContextWrapper wrapper, string eventName, string rawEventPayload) - { - CompleteExternalEventMethod.Invoke(wrapper, [eventName, rawEventPayload]); - } - sealed class TrackingOrchestrationContext : OrchestrationContext { public TrackingOrchestrationContext() @@ -146,8 +118,6 @@ public TrackingOrchestrationContext() public string? 
LastContinueAsNewVersion { get; private set; } - public List<(string InstanceId, string EventName, object EventData)> SentEvents { get; } = []; - public override void ContinueAsNew(object input) { this.LastContinueAsNewInput = input; @@ -179,9 +149,7 @@ public override Task ScheduleTask(string name, string version, => throw new NotImplementedException(); public override void SendEvent(OrchestrationInstance orchestrationInstance, string eventName, object eventData) - { - this.SentEvents.Add((orchestrationInstance.InstanceId, eventName, eventData)); - } + => throw new NotImplementedException(); } class TestOrchestrationContext : OrchestrationContext @@ -242,4 +210,4 @@ public override void SendEvent(OrchestrationInstance orchestrationInstance, stri throw new NotImplementedException(); } } -} +} \ No newline at end of file From f246e90b4c9a8624bdf0bf4ff5976f3b59beb64a Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 14:22:27 -0700 Subject: [PATCH 26/27] Keep worktree ignore local Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index 7d510eec8..f49bbfa2f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,7 +33,6 @@ bld/ # Visual Studio 2015/2017 cache/options directory .vs/ -.worktrees/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ From d291efde01b8f094626ee30a0e5689d35823834b Mon Sep 17 00:00:00 2001 From: Bernd Verst Date: Thu, 23 Apr 2026 14:50:40 -0700 Subject: [PATCH 27/27] Address latest PR feedback on reconnect cleanup Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- src/Client/Grpc/ChannelRecreatingCallInvoker.cs | 1 + .../Grpc/GrpcDurableTaskWorker.Processor.cs | 2 +- src/Worker/Grpc/ReconnectBackoff.cs | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs index 13e11acd0..f68ccfe9b 100644 --- a/src/Client/Grpc/ChannelRecreatingCallInvoker.cs +++ b/src/Client/Grpc/ChannelRecreatingCallInvoker.cs @@ -317,6 +317,7 @@ async Task RecreateAsync(int observedCount) } catch (Exception shutdownEx) when (shutdownEx is not OutOfMemoryException and not StackOverflowException + and not AccessViolationException and not ThreadAbortException) { // Best-effort cleanup. diff --git a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs index f4b6dafd7..7f61b68a9 100644 --- a/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs +++ b/src/Worker/Grpc/GrpcDurableTaskWorker.Processor.cs @@ -62,7 +62,7 @@ public async Task ExecuteAsync(CancellationToken cancellati // Tracks consecutive retry attempts for backoff calculation. Reset on first stream message. int reconnectAttempt = 0; - Random backoffRandom = new(); + Random backoffRandom = ReconnectBackoff.CreateRandom(); while (!cancellation.IsCancellationRequested) { diff --git a/src/Worker/Grpc/ReconnectBackoff.cs b/src/Worker/Grpc/ReconnectBackoff.cs index cfa01f1b0..dd08a58ce 100644 --- a/src/Worker/Grpc/ReconnectBackoff.cs +++ b/src/Worker/Grpc/ReconnectBackoff.cs @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT License. 
+using System.Security.Cryptography;
+
 namespace Microsoft.DurableTask.Worker.Grpc;
 
 /// <summary>
@@ -8,6 +10,19 @@ namespace Microsoft.DurableTask.Worker.Grpc;
 static class ReconnectBackoff
 {
+    /// <summary>
+    /// Creates a random source for reconnect jitter using an explicit random seed so multiple workers on
+    /// older runtimes don't converge on the same time-based seed.
+    /// </summary>
+    /// <returns>A random source suitable for reconnect jitter.</returns>
+    public static Random CreateRandom()
+    {
+        byte[] seedBytes = new byte[sizeof(int)];
+        using RandomNumberGenerator randomNumberGenerator = RandomNumberGenerator.Create();
+        randomNumberGenerator.GetBytes(seedBytes);
+        return new Random(BitConverter.ToInt32(seedBytes, 0));
+    }
+
     /// <summary>
     /// Computes a full-jitter exponential backoff delay: a uniformly random TimeSpan in
     /// [0, min(cap, base * 2^attempt)]. Returns when