From 5901c4faa7b2e1defd743a96610caf1ba1c171cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 00:03:23 +0900 Subject: [PATCH 01/37] Implement graceful shutdown for Garnet server Adds a graceful shutdown mechanism to the Garnet server, ensuring new connections are stopped, active connections are awaited, and data is safely persisted (AOF commit and checkpoint) before exit. Updates include new ShutdownAsync logic in GarnetServer, StopListening support in server classes, and integration of shutdown handling in both Windows service and console entry points. --- hosting/Windows/Garnet.worker/Program.cs | 8 ++ hosting/Windows/Garnet.worker/Worker.cs | 21 ++- libs/host/GarnetServer.cs | 175 +++++++++++++++++++++++ libs/server/Servers/GarnetServerBase.cs | 6 + libs/server/Servers/GarnetServerTcp.cs | 22 +++ libs/server/Servers/IGarnetServer.cs | 6 + libs/server/Servers/StoreApi.cs | 29 ++++ main/GarnetServer/Program.cs | 54 ++++++- 8 files changed, 316 insertions(+), 5 deletions(-) diff --git a/hosting/Windows/Garnet.worker/Program.cs b/hosting/Windows/Garnet.worker/Program.cs index 8418da86716..ed14257d136 100644 --- a/hosting/Windows/Garnet.worker/Program.cs +++ b/hosting/Windows/Garnet.worker/Program.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +using System; using Garnet; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; @@ -12,6 +13,13 @@ static void Main(string[] args) var builder = Host.CreateApplicationBuilder(args); builder.Services.AddHostedService(_ => new Worker(args)); + // Configure Host shutdown timeout + builder.Services.Configure(options => + { + // Set graceful shutdown timeout to 15 seconds + options.ShutdownTimeout = TimeSpan.FromSeconds(5); + }); + builder.Services.AddWindowsService(options => { options.ServiceName = "Microsoft Garnet Server"; diff --git a/hosting/Windows/Garnet.worker/Worker.cs b/hosting/Windows/Garnet.worker/Worker.cs index d69adb7e3c0..133aeea966a 100644 --- a/hosting/Windows/Garnet.worker/Worker.cs +++ b/hosting/Windows/Garnet.worker/Worker.cs @@ -43,8 +43,23 @@ protected override async Task ExecuteAsync(CancellationToken stoppingToken) /// Indicates that the shutdown process should no longer be graceful. public override async Task StopAsync(CancellationToken cancellationToken) { - Dispose(); - await base.StopAsync(cancellationToken).ConfigureAwait(false); + try + { + if (server != null) + { + // Perform graceful shutdown with AOF commit and checkpoint + await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5), token: cancellationToken).ConfigureAwait(false); + } + } + catch (OperationCanceledException) + { + // Force shutdown requested - proceed to dispose + } + finally + { + await base.StopAsync(cancellationToken).ConfigureAwait(false); + Dispose(); + } } public override void Dispose() @@ -55,6 +70,8 @@ public override void Dispose() } server?.Dispose(); _isDisposed = true; + base.Dispose(); + GC.SuppressFinalize(this); } } } \ No newline at end of file diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 2d12f43a0d4..4286725debf 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -10,6 +10,7 @@ using System.Runtime.InteropServices; using System.Text; using System.Threading; +using System.Threading.Tasks; using Garnet.cluster; using Garnet.common; using Garnet.networking; @@ -422,6 +423,180 @@ public void Start() Console.WriteLine("* Ready to accept connections"); } + /// + /// Performs graceful shutdown of the server. + /// Stops accepting new connections, waits for active connections to complete, commits AOF, and takes checkpoint if needed. + /// + /// Timeout for waiting on active connections (default: 30 seconds) + /// Cancellation token + /// Task representing the async shutdown operation + public async Task ShutdownAsync(TimeSpan? timeout = null, CancellationToken token = default) + { + var shutdownTimeout = timeout ?? TimeSpan.FromSeconds(30); + + try + { + // Stop accepting new connections first + StopListening(); + + // Wait for existing connections to complete + await WaitForActiveConnectionsAsync(shutdownTimeout, token).ConfigureAwait(false); + + // Commit AOF and take checkpoint if needed + await FinalizeDataAsync(token).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + // Force shutdown requested + } + catch (Exception ex) + { + logger?.LogError(ex, "Error during graceful shutdown"); + } + } + + /// + /// Stop all servers from accepting new connections. + /// + private void StopListening() + { + if (servers == null) return; + + logger?.LogInformation("Stopping listeners to prevent new connections..."); + foreach (var server in servers) + { + try + { + server?.StopListening(); + } + catch (Exception ex) + { + logger?.LogWarning(ex, "Error stopping listener"); + } + } + } + + /// + /// Waits for active connections to complete within the specified timeout. + /// + private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationToken token) + { + if (Metrics == null) return; + + var stopwatch = Stopwatch.StartNew(); + var delays = new[] { 50, 300, 1000 }; + var delayIndex = 0; + + while (stopwatch.Elapsed < timeout && !token.IsCancellationRequested) + { + try + { + var activeConnections = GetActiveConnectionCount(); + + if (activeConnections == 0) + { + logger?.LogInformation("All connections have been closed gracefully."); + return; + } + + logger?.LogInformation("Waiting for {ActiveConnections} active connections to complete...", activeConnections); + + var currentDelay = delays[delayIndex]; + if (delayIndex < delays.Length - 1) delayIndex++; + + await Task.Delay(currentDelay, token).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + throw; + } + catch (Exception ex) + { + logger?.LogWarning(ex, "Error checking active connections"); + delayIndex = 0; + await Task.Delay(500, token).ConfigureAwait(false); + } + } + + if (stopwatch.Elapsed >= timeout) + { + logger?.LogWarning("Timeout reached after {TimeoutSeconds} seconds. Some connections may still be active.", + timeout.TotalSeconds); + } + } + + /// + /// Gets the current number of active connections directly from server instances. + /// + private int GetActiveConnectionCount() + { + int count = 0; + if (servers != null) + { + foreach (var server in servers) + { + if (server is GarnetServerBase garnetServerBase) + { + count += (int)garnetServerBase.get_conn_active(); + } + } + } + return count; + } + + /// + /// Commits AOF and takes checkpoint for data durability during shutdown. + /// + private async Task FinalizeDataAsync(CancellationToken token) + { + var enableAOF = opts.EnableAOF; + var enableStorageTier = opts.EnableStorageTier; + + // Commit AOF before checkpoint/shutdown + if (enableAOF) + { + logger?.LogInformation("Committing AOF before shutdown..."); + try + { + var commitSuccess = await Store.CommitAOFAsync(token).ConfigureAwait(false); + if (commitSuccess) + { + logger?.LogInformation("AOF committed successfully."); + } + else + { + logger?.LogInformation("AOF commit skipped (another commit in progress or replica mode)."); + } + } + catch (Exception ex) + { + logger?.LogError(ex, "Error committing AOF during shutdown"); + } + } + + // Take checkpoint for tiered storage + if (enableStorageTier) + { + logger?.LogInformation("Taking checkpoint for tiered storage..."); + try + { + var checkpointSuccess = Store.TakeCheckpoint(background: false, token); + if (checkpointSuccess) + { + logger?.LogInformation("Checkpoint completed successfully."); + } + else + { + logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode)."); + } + } + catch (Exception ex) + { + logger?.LogError(ex, "Error taking checkpoint during shutdown"); + } + } + } + /// /// Dispose store (including log and checkpoint directory) /// diff --git a/libs/server/Servers/GarnetServerBase.cs b/libs/server/Servers/GarnetServerBase.cs index 5bfd1ff62ff..7f6386ac523 100644 --- a/libs/server/Servers/GarnetServerBase.cs +++ b/libs/server/Servers/GarnetServerBase.cs @@ -154,6 +154,12 @@ public bool AddSession(WireFormat protocol, ref ISessionProvider provider, INetw /// public abstract void Start(); + /// + public virtual void StopListening() + { + // Base implementation does nothing; derived classes should override + } + /// public virtual void Dispose() { diff --git a/libs/server/Servers/GarnetServerTcp.cs b/libs/server/Servers/GarnetServerTcp.cs index c681e09befa..59e4c8f762a 100644 --- a/libs/server/Servers/GarnetServerTcp.cs +++ b/libs/server/Servers/GarnetServerTcp.cs @@ -28,6 +28,7 @@ public class GarnetServerTcp : GarnetServerBase, IServerHook readonly int networkConnectionLimit; readonly string unixSocketPath; readonly UnixFileMode unixSocketPermission; + volatile bool isListening; /// public override IEnumerable ActiveConsumers() @@ -117,10 +118,31 @@ public override void Start() } listenSocket.Listen(512); + isListening = true; if (!listenSocket.AcceptAsync(acceptEventArg)) AcceptEventArg_Completed(null, acceptEventArg); } + /// + public override void StopListening() + { + if (!isListening) + return; + + isListening = false; + try + { + // Close the listen socket to stop accepting new connections + // This will cause any pending AcceptAsync to complete with an error + listenSocket.Close(); + logger?.LogInformation("Stopped accepting new connections on {endpoint}", EndPoint); + } + catch (Exception ex) + { + logger?.LogWarning(ex, "Error closing listen socket on {endpoint}", EndPoint); + } + } + private void AcceptEventArg_Completed(object sender, SocketAsyncEventArgs e) { try diff --git a/libs/server/Servers/IGarnetServer.cs b/libs/server/Servers/IGarnetServer.cs index 9e197451627..639e4a2a259 100644 --- a/libs/server/Servers/IGarnetServer.cs +++ b/libs/server/Servers/IGarnetServer.cs @@ -46,5 +46,11 @@ public interface IGarnetServer : IDisposable /// Start server /// public void Start(); + + /// + /// Stop accepting new connections (for graceful shutdown). + /// Existing connections remain active until they complete or are disposed. + /// + public void StopListening(); } } \ No newline at end of file diff --git a/libs/server/Servers/StoreApi.cs b/libs/server/Servers/StoreApi.cs index 5ff169c9fd5..2ed70ef7a3e 100644 --- a/libs/server/Servers/StoreApi.cs +++ b/libs/server/Servers/StoreApi.cs @@ -130,6 +130,35 @@ public bool FlushDB(int dbId = 0, bool unsafeTruncateLog = false) } } + /// + /// Take checkpoint for all active databases + /// + /// True if method can return before checkpoint is taken + /// Cancellation token + /// false if checkpoint was skipped due to node state or another checkpoint in progress + public bool TakeCheckpoint(bool background = false, CancellationToken token = default) + { + using (PreventRoleChange(out var acquired)) + { + if (!acquired || IsReplica) + { + return false; + } + + return storeWrapper.TakeCheckpoint(background, logger: null, token: token); + } + } + + /// + /// Check if storage tier is enabled + /// + public bool IsStorageTierEnabled => storeWrapper.serverOptions.EnableStorageTier; + + /// + /// Check if AOF is enabled + /// + public bool IsAOFEnabled => storeWrapper.serverOptions.EnableAOF; + /// /// Helper to disable role changes during a using block. /// diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index 7b2673ebc41..c9c19e46571 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -10,23 +10,71 @@ namespace Garnet /// public class Program { - static void Main(string[] args) + static async Task Main(string[] args) { + GarnetServer server = null; + var shutdownCts = new CancellationTokenSource(); + int shutdownInitiated = 0; // Guard to ensure single shutdown/dispose + try { - using var server = new GarnetServer(args); + server = new GarnetServer(args); // Optional: register custom extensions RegisterExtensions(server); + // Set up graceful shutdown handlers for Ctrl+C and SIGTERM + Console.CancelKeyPress += (sender, e) => + { + e.Cancel = true; // Prevent immediate termination + Console.WriteLine("Shutdown signal received. Starting graceful shutdown..."); + shutdownCts.Cancel(); + }; + + AppDomain.CurrentDomain.ProcessExit += (sender, e) => + { + // Only initiate shutdown if not already done + if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) + { + Console.WriteLine("Process exit signal received. Starting graceful shutdown..."); + shutdownCts.Cancel(); + // Wait for graceful shutdown with timeout + server?.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) + .GetAwaiter().GetResult(); + server?.Dispose(); + } + }; + // Start the server server.Start(); - Thread.Sleep(Timeout.Infinite); + // Wait for shutdown signal + try + { + await Task.Delay(Timeout.Infinite, shutdownCts.Token).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + // Normal shutdown path + } + + // Only initiate shutdown if not already done by ProcessExit handler + if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) + { + // Block synchronously for shutdown - ensures cleanup completes before process exits + server.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) + .GetAwaiter().GetResult(); + server?.Dispose(); + } } catch (Exception ex) { Console.WriteLine($"Unable to initialize server due to exception: {ex.Message}"); + // Ensure cleanup on exception if shutdown wasn't initiated + if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) + { + server?.Dispose(); + } } } From 9ea93398bdb5f00a9f0f8561321fa0e6d0861592 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:06:45 +0900 Subject: [PATCH 02/37] Update hosting/Windows/Garnet.worker/Program.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- hosting/Windows/Garnet.worker/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hosting/Windows/Garnet.worker/Program.cs b/hosting/Windows/Garnet.worker/Program.cs index ed14257d136..bb2e46173ea 100644 --- a/hosting/Windows/Garnet.worker/Program.cs +++ b/hosting/Windows/Garnet.worker/Program.cs @@ -16,7 +16,7 @@ static void Main(string[] args) // Configure Host shutdown timeout builder.Services.Configure(options => { - // Set graceful shutdown timeout to 15 seconds + // Set graceful shutdown timeout to 5 seconds options.ShutdownTimeout = TimeSpan.FromSeconds(5); }); From 7731df6ec4130f5f1cd8a773e045ef5be539cdf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:07:27 +0900 Subject: [PATCH 03/37] Update libs/host/GarnetServer.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- libs/host/GarnetServer.cs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 4286725debf..62a1b659a5a 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -533,12 +533,9 @@ private int GetActiveConnectionCount() int count = 0; if (servers != null) { - foreach (var server in servers) + foreach (var garnetServerBase in servers.OfType()) { - if (server is GarnetServerBase garnetServerBase) - { - count += (int)garnetServerBase.get_conn_active(); - } + count += (int)garnetServerBase.get_conn_active(); } } return count; From 597ec5745f6a17764a213df5ccd2caf043962bca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:08:14 +0900 Subject: [PATCH 04/37] Update main/GarnetServer/Program.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- main/GarnetServer/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index c9c19e46571..7860988e34f 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -39,7 +39,7 @@ static async Task Main(string[] args) Console.WriteLine("Process exit signal received. Starting graceful shutdown..."); shutdownCts.Cancel(); // Wait for graceful shutdown with timeout - server?.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) + server?.ShutdownAsync(TimeSpan.FromSeconds(3), CancellationToken.None) .GetAwaiter().GetResult(); server?.Dispose(); } From 9e168f4d05728cd299303c77b37c9b97f525c233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:08:22 +0900 Subject: [PATCH 05/37] Update main/GarnetServer/Program.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- main/GarnetServer/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index 7860988e34f..6c1427500c8 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -64,7 +64,7 @@ static async Task Main(string[] args) // Block synchronously for shutdown - ensures cleanup completes before process exits server.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) .GetAwaiter().GetResult(); - server?.Dispose(); + server.Dispose(); } } catch (Exception ex) From 67a7f2e6b1f711a6fc6de7176d531bcd14c27aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:08:44 +0900 Subject: [PATCH 06/37] Update main/GarnetServer/Program.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- main/GarnetServer/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index 6c1427500c8..214814d93f1 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -13,7 +13,7 @@ public class Program static async Task Main(string[] args) { GarnetServer server = null; - var shutdownCts = new CancellationTokenSource(); + using var shutdownCts = new CancellationTokenSource(); int shutdownInitiated = 0; // Guard to ensure single shutdown/dispose try From 5b25407c256c60fb4d79327f9811649e4f598cfb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:08:55 +0900 Subject: [PATCH 07/37] Update libs/host/GarnetServer.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- libs/host/GarnetServer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 62a1b659a5a..9cf79c6d2d3 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -481,7 +481,7 @@ private void StopListening() /// private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationToken token) { - if (Metrics == null) return; + if (servers == null) return; var stopwatch = Stopwatch.StartNew(); var delays = new[] { 50, 300, 1000 }; From 476d629528f7ee9c2f71bc8b82331d9a06eac9ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:32:15 +0900 Subject: [PATCH 08/37] =?UTF-8?q?=F0=9F=90=9B=20Resolve=20Race=20Condition?= =?UTF-8?q?=20risk=20in=20"StopListening"=20impl=20at=20GarnetServerTcp.cs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libs/server/Servers/GarnetServerTcp.cs | 5 +- test/Garnet.test/GarnetServerTcpTests.cs | 139 +++++++++++++++++++++++ 2 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 test/Garnet.test/GarnetServerTcpTests.cs diff --git a/libs/server/Servers/GarnetServerTcp.cs b/libs/server/Servers/GarnetServerTcp.cs index 59e4c8f762a..be0ea105a4b 100644 --- a/libs/server/Servers/GarnetServerTcp.cs +++ b/libs/server/Servers/GarnetServerTcp.cs @@ -149,9 +149,12 @@ private void AcceptEventArg_Completed(object sender, SocketAsyncEventArgs e) { do { + // Check isListening flag before processing and before calling AcceptAsync again + if (!isListening) break; + if (!HandleNewConnection(e)) break; e.AcceptSocket = null; - } while (!listenSocket.AcceptAsync(e)); + } while (isListening && !listenSocket.AcceptAsync(e)); } // socket disposed catch (ObjectDisposedException) { } diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs new file mode 100644 index 00000000000..551dbf6db32 --- /dev/null +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -0,0 +1,139 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using Garnet.server; +using NUnit.Framework; +using NUnit.Framework.Legacy; +using StackExchange.Redis; + +namespace Garnet.test +{ + [TestFixture] + public class GarnetServerTcpTests + { + private GarnetServer server; + + [SetUp] + public void Setup() + { + TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true); + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); + server.Start(); + } + + [TearDown] + public void TearDown() + { + server?.Dispose(); + TestUtils.DeleteDirectory(TestUtils.MethodTestDir); + } + + [Test] + public void StopListeningPreventsNewConnections() + { + // Arrange - Establish a working connection first + using var redis1 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db1 = redis1.GetDatabase(0); + db1.StringSet("test", "value"); + ClassicAssert.AreEqual("value", (string)db1.StringGet("test")); + + // Act - Stop listening on all servers + foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) + { + tcpServer.StopListening(); + } + + Thread.Sleep(100); // Brief delay to ensure socket is closed + + // Assert - New connections should fail + Assert.Throws(() => + { + using var redis2 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + redis2.GetDatabase(0).Ping(); + }); + + // Existing connection should still work + ClassicAssert.AreEqual("value", (string)db1.StringGet("test")); + } + + [Test] + public void StopListeningIdempotent() + { + // Arrange + foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) + { + tcpServer.StopListening(); + } + + // Act & Assert - Calling StopListening again should not throw + Assert.DoesNotThrow(() => + { + foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) + { + tcpServer.StopListening(); + } + }); + } + + [Test] + public void StopListeningLogsInformation() + { + // This test verifies that StopListening logs appropriate information + // You would need to set up a logger and verify the log output + // For now, we just verify no exceptions are thrown + + Assert.DoesNotThrow(() => + { + foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) + { + tcpServer.StopListening(); + } + }); + } + + [Test] + public async Task StopListeningDuringActiveConnectionAttempts() + { + // Arrange - Start multiple connection attempts + var connectionTasks = new System.Collections.Generic.List(); + var cts = new CancellationTokenSource(); + + for (int i = 0; i < 10; i++) + { + connectionTasks.Add(Task.Run(async () => + { + while (!cts.Token.IsCancellationRequested) + { + try + { + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + await redis.GetDatabase(0).PingAsync(); + await Task.Delay(10); + } + catch + { + // Connection failures are expected after StopListening + } + } + }, cts.Token)); + } + + await Task.Delay(50); // Let some connections establish + + // Act + foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) + { + tcpServer.StopListening(); + } + + await Task.Delay(100); + cts.Cancel(); + + // Assert - All tasks should complete without unhandled exceptions + Assert.DoesNotThrowAsync(async () => await Task.WhenAll(connectionTasks)); + } + } +} \ No newline at end of file From 9bf52de2d7da266ddcef15d6912a7dc80dd09dda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:39:45 +0900 Subject: [PATCH 09/37] =?UTF-8?q?=E2=9C=85=20add=20test=20for=20gracefulsh?= =?UTF-8?q?utdown=20about=20main/garnetserver?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/RespAdminCommandsTests.cs | 157 +++++++++++++++++++++ 1 file changed, 157 insertions(+) diff --git a/test/Garnet.test/RespAdminCommandsTests.cs b/test/Garnet.test/RespAdminCommandsTests.cs index f7819b7a776..5123d6944bd 100644 --- a/test/Garnet.test/RespAdminCommandsTests.cs +++ b/test/Garnet.test/RespAdminCommandsTests.cs @@ -660,5 +660,162 @@ public void ConfigGetWrongNumberOfArguments() ClassicAssert.AreEqual(expectedMessage, ex.Message); } #endregion + + #region GracefulShutdownTests + [Test] + public async Task ShutdownAsyncStopsAcceptingNewConnections() + { + // Arrange + server.Dispose(); + var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_shutdown"); + testServer.Start(); + + using var redis1 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db1 = redis1.GetDatabase(0); + db1.StringSet("test", "value"); + + // Act - Initiate shutdown (no need for Task.Run, ShutdownAsync is already async) + var shutdownTask = testServer.ShutdownAsync(TimeSpan.FromSeconds(5)); + + // Give shutdown a moment to stop listening + await Task.Delay(200); + + // Assert - New connections should fail + var ex = Assert.ThrowsAsync(async () => + { + using var redis2 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + await redis2.GetDatabase(0).PingAsync(); + }); + ClassicAssert.IsNotNull(ex, "Expected connection to fail after shutdown initiated"); + + await shutdownTask; + testServer.Dispose(); + } + + [Test] + public async Task ShutdownAsyncWaitsForActiveConnections() + { + // Arrange + server.Dispose(); + var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_shutdown2"); + testServer.Start(); + + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + + // Set initial value + db.StringSet("key1", "value1"); + + // Act - Start shutdown while connection is active + var shutdownTask = testServer.ShutdownAsync(TimeSpan.FromSeconds(10)); + + // Connection should still work during grace period + // Perform multiple operations to ensure connection remains active + var result = db.StringGet("key1"); + ClassicAssert.AreEqual("value1", (string)result); + + // Verify we can still perform operations during grace period + db.StringSet("key2", "value2"); + var result2 = db.StringGet("key2"); + ClassicAssert.AreEqual("value2", (string)result2); + + await shutdownTask; + testServer.Dispose(); + } + + [Test] + public async Task ShutdownAsyncCommitsAOF() + { + // Arrange + server.Dispose(); + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig(allowAdmin: true))) + { + var db = redis.GetDatabase(0); + db.StringSet("aofKey", "aofValue"); + } + + // Act - Shutdown which should commit AOF + await server.ShutdownAsync(TimeSpan.FromSeconds(5)); + server.Dispose(false); + + // Assert - Recover and verify data persisted + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true, tryRecover: true); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + var recoveredValue = db.StringGet("aofKey"); + ClassicAssert.AreEqual("aofValue", recoveredValue.ToString()); + } + } + + [Test] + public async Task ShutdownAsyncTakesCheckpointWhenStorageTierEnabled() + { + // Arrange + server.Dispose(); + // Storage tier is enabled by default when logCheckpointDir is provided + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig(allowAdmin: true))) + { + var db = redis.GetDatabase(0); + db.StringSet("checkpointKey", "checkpointValue"); + } + + // Act - Shutdown which should take checkpoint + await server.ShutdownAsync(TimeSpan.FromSeconds(5)); + server.Dispose(false); + + // Assert - Recover from checkpoint + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, tryRecover: true); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + var recoveredValue = db.StringGet("checkpointKey"); + ClassicAssert.AreEqual("checkpointValue", recoveredValue.ToString()); + } + } + + [Test] + public async Task ShutdownAsyncRespectsTimeout() + { + // This test verifies that shutdown respects the timeout parameter + // Arrange + server.Dispose(); + var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_timeout"); + testServer.Start(); + + // Create a connection that will remain active + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + db.StringSet("key", "value"); + + // Act - Shutdown with very short timeout (100ms) + // With an active connection, shutdown should timeout quickly rather than waiting indefinitely + var stopwatch = System.Diagnostics.Stopwatch.StartNew(); + await testServer.ShutdownAsync(TimeSpan.FromMilliseconds(100)); + stopwatch.Stop(); + + // Assert - Should complete within reasonable time (timeout + some overhead for AOF/checkpoint) + // The timeout is for waiting on connections, but shutdown also does AOF commit and checkpoint + // So we allow more time than the timeout itself + ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 5000, + $"Shutdown should complete within reasonable time. Actual: {stopwatch.ElapsedMilliseconds}ms"); + + // Verify it completed faster than a longer timeout would take + ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 2000, + "Shutdown with short timeout should be faster than longer timeout"); + + testServer.Dispose(); + } + #endregion } } \ No newline at end of file From 11d115bfee9bcf7262a84211c21aecc9a88401f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:43:40 +0900 Subject: [PATCH 10/37] =?UTF-8?q?=F0=9F=90=9B=20Fix=20risk=20of=20shutdown?= =?UTF-8?q?=20handler=20remaining?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- main/GarnetServer/Program.cs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index 214814d93f1..f7463e4db8f 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -15,6 +15,7 @@ static async Task Main(string[] args) GarnetServer server = null; using var shutdownCts = new CancellationTokenSource(); int shutdownInitiated = 0; // Guard to ensure single shutdown/dispose + int serverStarted = 0; // Guard to track if server started successfully try { @@ -33,8 +34,9 @@ static async Task Main(string[] args) AppDomain.CurrentDomain.ProcessExit += (sender, e) => { - // Only initiate shutdown if not already done - if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) + // Only initiate shutdown if not already done and server has started + if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0 && + Interlocked.CompareExchange(ref serverStarted, 0, 0) == 1) { Console.WriteLine("Process exit signal received. Starting graceful shutdown..."); shutdownCts.Cancel(); @@ -47,6 +49,7 @@ static async Task Main(string[] args) // Start the server server.Start(); + Interlocked.Exchange(ref serverStarted, 1); // Mark server as started // Wait for shutdown signal try @@ -58,8 +61,9 @@ static async Task Main(string[] args) // Normal shutdown path } - // Only initiate shutdown if not already done by ProcessExit handler - if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) + // Only initiate shutdown if not already done by ProcessExit handler and server has started + if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0 && + Interlocked.CompareExchange(ref serverStarted, 0, 0) == 1) { // Block synchronously for shutdown - ensures cleanup completes before process exits server.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) From 3b3df07df93ebb56a4a981371f4026a74ee4bb63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Thu, 27 Nov 2025 01:43:48 +0900 Subject: [PATCH 11/37] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20fix=20by=20dotnet=20?= =?UTF-8?q?format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/RespAdminCommandsTests.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/Garnet.test/RespAdminCommandsTests.cs b/test/Garnet.test/RespAdminCommandsTests.cs index 5123d6944bd..44b4aac9804 100644 --- a/test/Garnet.test/RespAdminCommandsTests.cs +++ b/test/Garnet.test/RespAdminCommandsTests.cs @@ -713,7 +713,7 @@ public async Task ShutdownAsyncWaitsForActiveConnections() // Perform multiple operations to ensure connection remains active var result = db.StringGet("key1"); ClassicAssert.AreEqual("value1", (string)result); - + // Verify we can still perform operations during grace period db.StringSet("key2", "value2"); var result2 = db.StringGet("key2"); @@ -807,11 +807,11 @@ public async Task ShutdownAsyncRespectsTimeout() // Assert - Should complete within reasonable time (timeout + some overhead for AOF/checkpoint) // The timeout is for waiting on connections, but shutdown also does AOF commit and checkpoint // So we allow more time than the timeout itself - ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 5000, + ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 5000, $"Shutdown should complete within reasonable time. Actual: {stopwatch.ElapsedMilliseconds}ms"); - + // Verify it completed faster than a longer timeout would take - ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 2000, + ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 2000, "Shutdown with short timeout should be faster than longer timeout"); testServer.Dispose(); From d77bd6bdbee6851fec2ae2dff4c2adf0bbe205e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 26 Jan 2026 02:04:36 +0900 Subject: [PATCH 12/37] =?UTF-8?q?=E2=9C=85=F0=9F=94=80=20Fix=20Test=20with?= =?UTF-8?q?=20Allure=20related=20Requirements=20(reflect=20#1457)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/GarnetServerTcpTests.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs index 551dbf6db32..852f0332a6c 100644 --- a/test/Garnet.test/GarnetServerTcpTests.cs +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -4,6 +4,7 @@ using System.Linq; using System.Threading; using System.Threading.Tasks; +using Allure.NUnit; using Garnet.server; using NUnit.Framework; using NUnit.Framework.Legacy; @@ -11,8 +12,9 @@ namespace Garnet.test { - [TestFixture] - public class GarnetServerTcpTests + [AllureNUnit] + [TestFixture, NonParallelizable] + public class GarnetServerTcpTests : AllureTestBase { private GarnetServer server; From 3177001d74260450002abdd8a3795dec42f42ab2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 12:38:35 +0900 Subject: [PATCH 13/37] =?UTF-8?q?=E2=8F=AA=20Revert=20:=20Windows=20Servic?= =?UTF-8?q?e=20shutdown=20timeout=20custom?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit default 30 seconds value will be more torelant in production operations (our company's small size is enough 5 seconds, but large scale production has risks) --- hosting/Windows/Garnet.worker/Program.cs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/hosting/Windows/Garnet.worker/Program.cs b/hosting/Windows/Garnet.worker/Program.cs index bb2e46173ea..8418da86716 100644 --- a/hosting/Windows/Garnet.worker/Program.cs +++ b/hosting/Windows/Garnet.worker/Program.cs @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -using System; using Garnet; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; @@ -13,13 +12,6 @@ static void Main(string[] args) var builder = Host.CreateApplicationBuilder(args); builder.Services.AddHostedService(_ => new Worker(args)); - // Configure Host shutdown timeout - builder.Services.Configure(options => - { - // Set graceful shutdown timeout to 5 seconds - options.ShutdownTimeout = TimeSpan.FromSeconds(5); - }); - builder.Services.AddWindowsService(options => { options.ServiceName = "Microsoft Garnet Server"; From 920c328e8aed9dfa926277f62a7043e1b0c74860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 12:48:12 +0900 Subject: [PATCH 14/37] Use long for active connection count Change GetActiveConnectionCount return type and local accumulator from int to long and remove the redundant cast when adding garnetServerBase.get_conn_active(). This prevents potential integer overflow when summing active connections across multiple server instances; callers may need to handle the updated long return value. --- libs/host/GarnetServer.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 782f80e2469..2014db71061 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -537,14 +537,14 @@ private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationT /// /// Gets the current number of active connections directly from server instances. /// - private int GetActiveConnectionCount() + private long GetActiveConnectionCount() { - int count = 0; + long count = 0; if (servers != null) { foreach (var garnetServerBase in servers.OfType()) { - count += (int)garnetServerBase.get_conn_active(); + count += garnetServerBase.get_conn_active(); } } return count; From 8ea0c241a6e07bc146dbba89e59ab944b80ef7b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 13:05:58 +0900 Subject: [PATCH 15/37] minor perf fix in connection counting replace OfType to is --- libs/host/GarnetServer.cs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 2014db71061..8e9eebeae59 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -542,9 +542,12 @@ private long GetActiveConnectionCount() long count = 0; if (servers != null) { - foreach (var garnetServerBase in servers.OfType()) + foreach (var garnetServer in servers) { - count += garnetServerBase.get_conn_active(); + if (garnetServer is GarnetServerBase garnetServerBase) + { + count += garnetServerBase.get_conn_active(); + } } } return count; From bf05d232f7cb8293aff5dd98ac1eec18f2a7e86d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 13:21:12 +0900 Subject: [PATCH 16/37] Use linked CancellationTokenSource for shutdown wait Replace manual Stopwatch-based timeout logic with a linked CancellationTokenSource (linked to the external token) and CancelAfter(timeout). The loop now observes cts.Token for both external cancellation and timeout, and delay calls use the linked token. Improved exception handling: rethrow when the external token is canceled, log a warning when the timeout triggers, and centralize other error logging/retry behavior. This ensures correct timeout semantics and clearer error handling while waiting for active connections to close. --- libs/host/GarnetServer.cs | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 8e9eebeae59..db47d798777 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -492,16 +492,18 @@ private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationT { if (servers == null) return; - var stopwatch = Stopwatch.StartNew(); + // Linked Token : between external token and timeout + using var cts = CancellationTokenSource.CreateLinkedTokenSource(token); + cts.CancelAfter(timeout); + var delays = new[] { 50, 300, 1000 }; var delayIndex = 0; - while (stopwatch.Elapsed < timeout && !token.IsCancellationRequested) + try { - try + while (!cts.Token.IsCancellationRequested) { var activeConnections = GetActiveConnectionCount(); - if (activeConnections == 0) { logger?.LogInformation("All connections have been closed gracefully."); @@ -513,25 +515,25 @@ private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationT var currentDelay = delays[delayIndex]; if (delayIndex < delays.Length - 1) delayIndex++; - await Task.Delay(currentDelay, token).ConfigureAwait(false); - } - catch (OperationCanceledException) - { - throw; - } - catch (Exception ex) - { - logger?.LogWarning(ex, "Error checking active connections"); - delayIndex = 0; - await Task.Delay(500, token).ConfigureAwait(false); + await Task.Delay(currentDelay, cts.Token).ConfigureAwait(false); } } - - if (stopwatch.Elapsed >= timeout) + catch (OperationCanceledException) when (token.IsCancellationRequested) + { + throw; + } + catch (OperationCanceledException) { + // timeout reached error logging logger?.LogWarning("Timeout reached after {TimeoutSeconds} seconds. Some connections may still be active.", timeout.TotalSeconds); } + catch (Exception ex) + { + logger?.LogWarning(ex, "Error checking active connections"); + delayIndex = 0; + await Task.Delay(500, token).ConfigureAwait(false); + } } /// From 01a2547e5d857ca2f08267b1ba35276bf2e22268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 13:41:59 +0900 Subject: [PATCH 17/37] update log levels infomation -> debug for perfomance inhancing --- libs/host/GarnetServer.cs | 17 +++++++---------- libs/server/Servers/GarnetServerTcp.cs | 4 ++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index db47d798777..6a44e606638 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -471,7 +471,7 @@ private void StopListening() { if (servers == null) return; - logger?.LogInformation("Stopping listeners to prevent new connections..."); + logger?.LogDebug("Stopping listeners to prevent new connections..."); foreach (var server in servers) { try @@ -560,19 +560,16 @@ private long GetActiveConnectionCount() /// private async Task FinalizeDataAsync(CancellationToken token) { - var enableAOF = opts.EnableAOF; - var enableStorageTier = opts.EnableStorageTier; - // Commit AOF before checkpoint/shutdown - if (enableAOF) + if (opts.EnableAOF) { - logger?.LogInformation("Committing AOF before shutdown..."); + logger?.LogDebug("Committing AOF before shutdown..."); try { var commitSuccess = await Store.CommitAOFAsync(token).ConfigureAwait(false); if (commitSuccess) { - logger?.LogInformation("AOF committed successfully."); + logger?.LogDebug("AOF committed successfully."); } else { @@ -586,15 +583,15 @@ private async Task FinalizeDataAsync(CancellationToken token) } // Take checkpoint for tiered storage - if (enableStorageTier) + if (opts.EnableStorageTier) { - logger?.LogInformation("Taking checkpoint for tiered storage..."); + logger?.LogDebug("Taking checkpoint for tiered storage..."); try { var checkpointSuccess = Store.TakeCheckpoint(background: false, token); if (checkpointSuccess) { - logger?.LogInformation("Checkpoint completed successfully."); + logger?.LogDebug("Checkpoint completed successfully."); } else { diff --git a/libs/server/Servers/GarnetServerTcp.cs b/libs/server/Servers/GarnetServerTcp.cs index be0ea105a4b..1a1979f170d 100644 --- a/libs/server/Servers/GarnetServerTcp.cs +++ b/libs/server/Servers/GarnetServerTcp.cs @@ -135,11 +135,11 @@ public override void StopListening() // Close the listen socket to stop accepting new connections // This will cause any pending AcceptAsync to complete with an error listenSocket.Close(); - logger?.LogInformation("Stopped accepting new connections on {endpoint}", EndPoint); + logger?.LogDebug("Stopped accepting new connections on {endpoint}", EndPoint); } catch (Exception ex) { - logger?.LogWarning(ex, "Error closing listen socket on {endpoint}", EndPoint); + logger?.LogDebug(ex, "Error closing listen socket on {endpoint}", EndPoint); } } From 65b13919958d960bdd0fe2d494daf617bdafe8fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 13:43:51 +0900 Subject: [PATCH 18/37] minor bug risk of cancelled canclationToken provide into finally --- hosting/Windows/Garnet.worker/Worker.cs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hosting/Windows/Garnet.worker/Worker.cs b/hosting/Windows/Garnet.worker/Worker.cs index 133aeea966a..82d47408aa4 100644 --- a/hosting/Windows/Garnet.worker/Worker.cs +++ b/hosting/Windows/Garnet.worker/Worker.cs @@ -57,7 +57,8 @@ public override async Task StopAsync(CancellationToken cancellationToken) } finally { - await base.StopAsync(cancellationToken).ConfigureAwait(false); + // Ensure base class cleanup although cancellationToken is cancelled + await base.StopAsync(CancellationToken.None).ConfigureAwait(false); Dispose(); } } From 514509405dafcc5a53f5e06358397708980fa026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 13:51:26 +0900 Subject: [PATCH 19/37] =?UTF-8?q?=F0=9F=A7=AA=20temporal=20test=20code=20s?= =?UTF-8?q?ave=20(test=20for=20gracefull=20shutown=20in=20single=20instanc?= =?UTF-8?q?e)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/GarnetServerTcpTests.cs | 82 ++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs index 852f0332a6c..13bdb9f7b3d 100644 --- a/test/Garnet.test/GarnetServerTcpTests.cs +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +using System; using System.Linq; using System.Threading; using System.Threading.Tasks; @@ -137,5 +138,86 @@ public async Task StopListeningDuringActiveConnectionAttempts() // Assert - All tasks should complete without unhandled exceptions Assert.DoesNotThrowAsync(async () => await Task.WhenAll(connectionTasks)); } + + [Test] + public async Task ShutdownAsyncCompletesGracefully() + { + // Arrange - Write data that should survive shutdown + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + db.StringSet("shutdown-test", "data"); + ClassicAssert.AreEqual("data", (string)db.StringGet("shutdown-test")); + + // Act - Graceful shutdown + await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5)).ConfigureAwait(false); + + // Assert - New connections should fail after shutdown + Assert.Throws(() => + { + using var redis2 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + redis2.GetDatabase(0).Ping(); + }); + } + + [Test] + public async Task ShutdownAsyncRespectsTimeout() + { + // Arrange - Establish a connection that will stay open + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + db.Ping(); + + // Act - Shutdown with a very short timeout + var sw = System.Diagnostics.Stopwatch.StartNew(); + await server.ShutdownAsync(timeout: TimeSpan.FromMilliseconds(200)).ConfigureAwait(false); + sw.Stop(); + + // Assert - Should complete without hanging indefinitely + // Allow generous upper bound for CI environments + ClassicAssert.Less(sw.ElapsedMilliseconds, 10_000, + "ShutdownAsync should complete within a reasonable time even with active connections"); + } + + [Test] + public async Task ShutdownAsyncRespectsCancellation() + { + // Arrange + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + redis.GetDatabase(0).Ping(); + + using var cts = new CancellationTokenSource(); + + // Act - Cancel immediately + cts.Cancel(); + Assert.DoesNotThrowAsync(async () => + { + await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(30), token: cts.Token).ConfigureAwait(false); + }); + } + + [Test] + public async Task ShutdownAsyncWithAofCommit() + { + // Arrange - Create server with AOF enabled + server?.Dispose(); + TestUtils.DeleteDirectory(TestUtils.MethodTestDir); + server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true); + server.Start(); + + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + + // Write some data + for (int i = 0; i < 100; i++) + { + db.StringSet($"aof-key-{i}", $"value-{i}"); + } + + // Act - Shutdown should commit AOF without errors + Assert.DoesNotThrowAsync(async () => + { + await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5)).ConfigureAwait(false); + }); + } } } \ No newline at end of file From cf9c997e84affb1437ab51406e2c47437c1efcdc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:10:40 +0900 Subject: [PATCH 20/37] Fix Program.CS main method refine minimal fix for ensure graceful shutodown (exsiting PR's is too much and complex option) - I didn't know ManualResetEventSlim.... --- main/GarnetServer/Program.cs | 63 +++++++----------------------------- 1 file changed, 11 insertions(+), 52 deletions(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index f7463e4db8f..12edd434605 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -10,75 +10,34 @@ namespace Garnet /// public class Program { - static async Task Main(string[] args) + static void Main(string[] args) { - GarnetServer server = null; - using var shutdownCts = new CancellationTokenSource(); - int shutdownInitiated = 0; // Guard to ensure single shutdown/dispose - int serverStarted = 0; // Guard to track if server started successfully - try { - server = new GarnetServer(args); + using var server = new GarnetServer(args); // Optional: register custom extensions RegisterExtensions(server); - // Set up graceful shutdown handlers for Ctrl+C and SIGTERM - Console.CancelKeyPress += (sender, e) => - { - e.Cancel = true; // Prevent immediate termination - Console.WriteLine("Shutdown signal received. Starting graceful shutdown..."); - shutdownCts.Cancel(); - }; - - AppDomain.CurrentDomain.ProcessExit += (sender, e) => - { - // Only initiate shutdown if not already done and server has started - if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0 && - Interlocked.CompareExchange(ref serverStarted, 0, 0) == 1) - { - Console.WriteLine("Process exit signal received. Starting graceful shutdown..."); - shutdownCts.Cancel(); - // Wait for graceful shutdown with timeout - server?.ShutdownAsync(TimeSpan.FromSeconds(3), CancellationToken.None) - .GetAwaiter().GetResult(); - server?.Dispose(); - } - }; - // Start the server server.Start(); - Interlocked.Exchange(ref serverStarted, 1); // Mark server as started - // Wait for shutdown signal - try - { - await Task.Delay(Timeout.Infinite, shutdownCts.Token).ConfigureAwait(false); - } - catch (OperationCanceledException) - { - // Normal shutdown path - } + var shutdownEvent = new ManualResetEventSlim(false); - // Only initiate shutdown if not already done by ProcessExit handler and server has started - if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0 && - Interlocked.CompareExchange(ref serverStarted, 0, 0) == 1) + Console.CancelKeyPress += (sender, e) => { - // Block synchronously for shutdown - ensures cleanup completes before process exits - server.ShutdownAsync(TimeSpan.FromSeconds(5), CancellationToken.None) + e.Cancel = true; + // Graceful shutdown: drain connections, commit AOF, take checkpoint + server.ShutdownAsync(TimeSpan.FromSeconds(5)) .GetAwaiter().GetResult(); - server.Dispose(); - } + shutdownEvent.Set(); + }; + + shutdownEvent.Wait(); } catch (Exception ex) { Console.WriteLine($"Unable to initialize server due to exception: {ex.Message}"); - // Ensure cleanup on exception if shutdown wasn't initiated - if (Interlocked.Exchange(ref shutdownInitiated, 1) == 0) - { - server?.Dispose(); - } } } From 514456c39db2bff56c7214a4df01b278e588b2c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:30:39 +0900 Subject: [PATCH 21/37] Update libs/host/GarnetServer.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- libs/host/GarnetServer.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 6a44e606638..de3a532a359 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -531,7 +531,6 @@ private async Task WaitForActiveConnectionsAsync(TimeSpan timeout, CancellationT catch (Exception ex) { logger?.LogWarning(ex, "Error checking active connections"); - delayIndex = 0; await Task.Delay(500, token).ConfigureAwait(false); } } From db633e3711193c8dc87bc75c025c51ee45df77c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:31:08 +0900 Subject: [PATCH 22/37] Update test/Garnet.test/GarnetServerTcpTests.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- test/Garnet.test/GarnetServerTcpTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs index 13bdb9f7b3d..eafb9709475 100644 --- a/test/Garnet.test/GarnetServerTcpTests.cs +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -102,7 +102,7 @@ public async Task StopListeningDuringActiveConnectionAttempts() { // Arrange - Start multiple connection attempts var connectionTasks = new System.Collections.Generic.List(); - var cts = new CancellationTokenSource(); + using var cts = new CancellationTokenSource(); for (int i = 0; i < 10; i++) { From de4e462a7d5af5336e8fd320941d3454544c06e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:31:20 +0900 Subject: [PATCH 23/37] Update main/GarnetServer/Program.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- main/GarnetServer/Program.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/GarnetServer/Program.cs b/main/GarnetServer/Program.cs index 12edd434605..1b145f2a83d 100644 --- a/main/GarnetServer/Program.cs +++ b/main/GarnetServer/Program.cs @@ -22,7 +22,7 @@ static void Main(string[] args) // Start the server server.Start(); - var shutdownEvent = new ManualResetEventSlim(false); + using var shutdownEvent = new ManualResetEventSlim(false); Console.CancelKeyPress += (sender, e) => { From 74f1c0a5740f5e39e53efec4ac2d976e38aee9da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:39:36 +0900 Subject: [PATCH 24/37] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- libs/host/GarnetServer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index de3a532a359..25e4d5cd32f 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -587,7 +587,7 @@ private async Task FinalizeDataAsync(CancellationToken token) logger?.LogDebug("Taking checkpoint for tiered storage..."); try { - var checkpointSuccess = Store.TakeCheckpoint(background: false, token); + var checkpointSuccess = Store.TakeCheckpoint(background: false, token: token); if (checkpointSuccess) { logger?.LogDebug("Checkpoint completed successfully."); From 7bb2ab20925789eac75d3e3438942f8f6d1d3261 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:54:57 +0900 Subject: [PATCH 25/37] Fix about copilot's concern about race conditions https://github.com/microsoft/garnet/pull/1551/changes/BASE..cf9c997e84affb1437ab51406e2c47437c1efcdc#r2778598912 --- libs/server/Servers/GarnetServerTcp.cs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libs/server/Servers/GarnetServerTcp.cs b/libs/server/Servers/GarnetServerTcp.cs index 1a1979f170d..f00f181303b 100644 --- a/libs/server/Servers/GarnetServerTcp.cs +++ b/libs/server/Servers/GarnetServerTcp.cs @@ -150,7 +150,13 @@ private void AcceptEventArg_Completed(object sender, SocketAsyncEventArgs e) do { // Check isListening flag before processing and before calling AcceptAsync again - if (!isListening) break; + if (!isListening) + { + // Dispose any accepted socket that won't be handled + e.AcceptSocket?.Dispose(); + e.AcceptSocket = null; + break; + } if (!HandleNewConnection(e)) break; e.AcceptSocket = null; From 7cb1e9cdccd2d1f8f3e47c60a69a03c4665668d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 14:59:41 +0900 Subject: [PATCH 26/37] fix issue claimed by copilot https://github.com/microsoft/garnet/pull/1551/changes#r2778598905 --- libs/host/GarnetServer.cs | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 25e4d5cd32f..fc1b6d6c208 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -448,20 +448,35 @@ public async Task ShutdownAsync(TimeSpan? timeout = null, CancellationToken toke // Stop accepting new connections first StopListening(); - // Wait for existing connections to complete - await WaitForActiveConnectionsAsync(shutdownTimeout, token).ConfigureAwait(false); - - // Commit AOF and take checkpoint if needed - await FinalizeDataAsync(token).ConfigureAwait(false); - } - catch (OperationCanceledException) - { - // Force shutdown requested + // Wait for existing connections to complete (cancellable) + try + { + await WaitForActiveConnectionsAsync(shutdownTimeout, token).ConfigureAwait(false); + } + catch (OperationCanceledException) + { + logger?.LogWarning("Connection draining was cancelled. Proceeding with data finalization..."); + } } catch (Exception ex) { logger?.LogError(ex, "Error during graceful shutdown"); } + finally + { + // Always attempt AOF commit and checkpoint as best-effort, + // even if connection draining was cancelled or failed. + // Use a bounded timeout instead of the caller's token to ensure completion. + using var finalizeCts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); + try + { + await FinalizeDataAsync(finalizeCts.Token).ConfigureAwait(false); + } + catch (Exception ex) + { + logger?.LogError(ex, "Error during data finalization"); + } + } } /// From 253026083f987629f783c98561b40bf0e06e1930 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 15:03:38 +0900 Subject: [PATCH 27/37] =?UTF-8?q?=F0=9F=94=A5=20remove=20duplicated=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/GarnetServerTcpTests.cs | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs index eafb9709475..fc78119c504 100644 --- a/test/Garnet.test/GarnetServerTcpTests.cs +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -81,22 +81,6 @@ public void StopListeningIdempotent() }); } - [Test] - public void StopListeningLogsInformation() - { - // This test verifies that StopListening logs appropriate information - // You would need to set up a logger and verify the log output - // For now, we just verify no exceptions are thrown - - Assert.DoesNotThrow(() => - { - foreach (var tcpServer in server.Provider.StoreWrapper.Servers.OfType()) - { - tcpServer.StopListening(); - } - }); - } - [Test] public async Task StopListeningDuringActiveConnectionAttempts() { From 784f6308fea2cfa797c44ecc49e063cd314a2f99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 15:07:11 +0900 Subject: [PATCH 28/37] Fix Test code running flow --- test/Garnet.test/GarnetServerTcpTests.cs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/test/Garnet.test/GarnetServerTcpTests.cs b/test/Garnet.test/GarnetServerTcpTests.cs index fc78119c504..9591243a938 100644 --- a/test/Garnet.test/GarnetServerTcpTests.cs +++ b/test/Garnet.test/GarnetServerTcpTests.cs @@ -126,13 +126,15 @@ public async Task StopListeningDuringActiveConnectionAttempts() [Test] public async Task ShutdownAsyncCompletesGracefully() { - // Arrange - Write data that should survive shutdown - using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); - var db = redis.GetDatabase(0); - db.StringSet("shutdown-test", "data"); - ClassicAssert.AreEqual("data", (string)db.StringGet("shutdown-test")); + // Arrange - Write data and then close the connection + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + db.StringSet("shutdown-test", "data"); + ClassicAssert.AreEqual("data", (string)db.StringGet("shutdown-test")); + } - // Act - Graceful shutdown + // Act - Graceful shutdown (no active connections) await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5)).ConfigureAwait(false); // Assert - New connections should fail after shutdown From bc6580d19b237f0097d60cdbf9e6cdce1f72067d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Sun, 8 Feb 2026 15:13:24 +0900 Subject: [PATCH 29/37] Fix test codes to reflect copilot's suggestions --- test/Garnet.test/RespAdminCommandsTests.cs | 92 +++++++--------------- 1 file changed, 28 insertions(+), 64 deletions(-) diff --git a/test/Garnet.test/RespAdminCommandsTests.cs b/test/Garnet.test/RespAdminCommandsTests.cs index dae1f31577a..9047da6422e 100644 --- a/test/Garnet.test/RespAdminCommandsTests.cs +++ b/test/Garnet.test/RespAdminCommandsTests.cs @@ -667,62 +667,42 @@ public void ConfigGetWrongNumberOfArguments() [Test] public async Task ShutdownAsyncStopsAcceptingNewConnections() { - // Arrange - server.Dispose(); - var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_shutdown"); - testServer.Start(); - - using var redis1 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); - var db1 = redis1.GetDatabase(0); - db1.StringSet("test", "value"); - - // Act - Initiate shutdown (no need for Task.Run, ShutdownAsync is already async) - var shutdownTask = testServer.ShutdownAsync(TimeSpan.FromSeconds(5)); + // Arrange - write data then close connection + using (var redis1 = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db1 = redis1.GetDatabase(0); + db1.StringSet("test", "value"); + } - // Give shutdown a moment to stop listening - await Task.Delay(200); + // Act - Initiate shutdown + await server.ShutdownAsync(TimeSpan.FromSeconds(5)); // Assert - New connections should fail - var ex = Assert.ThrowsAsync(async () => + Assert.Throws(() => { using var redis2 = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); - await redis2.GetDatabase(0).PingAsync(); + redis2.GetDatabase(0).Ping(); }); - ClassicAssert.IsNotNull(ex, "Expected connection to fail after shutdown initiated"); - - await shutdownTask; - testServer.Dispose(); } [Test] public async Task ShutdownAsyncWaitsForActiveConnections() { - // Arrange - server.Dispose(); - var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_shutdown2"); - testServer.Start(); - + // Arrange - keep connection open to test timeout behavior using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); - - // Set initial value db.StringSet("key1", "value1"); - // Act - Start shutdown while connection is active - var shutdownTask = testServer.ShutdownAsync(TimeSpan.FromSeconds(10)); - - // Connection should still work during grace period - // Perform multiple operations to ensure connection remains active - var result = db.StringGet("key1"); - ClassicAssert.AreEqual("value1", (string)result); + // Act - Start shutdown with short timeout; active connection will force timeout + var sw = System.Diagnostics.Stopwatch.StartNew(); + await server.ShutdownAsync(TimeSpan.FromMilliseconds(500)); + sw.Stop(); - // Verify we can still perform operations during grace period - db.StringSet("key2", "value2"); - var result2 = db.StringGet("key2"); - ClassicAssert.AreEqual("value2", (string)result2); - - await shutdownTask; - testServer.Dispose(); + // Assert - Should wait approximately the timeout duration before proceeding + ClassicAssert.GreaterOrEqual(sw.ElapsedMilliseconds, 400, + "Shutdown should wait for the timeout duration when connections are active"); + ClassicAssert.Less(sw.ElapsedMilliseconds, 5_000, + "Shutdown should not hang beyond a reasonable bound"); } [Test] @@ -760,7 +740,6 @@ public async Task ShutdownAsyncTakesCheckpointWhenStorageTierEnabled() { // Arrange server.Dispose(); - // Storage tier is enabled by default when logCheckpointDir is provided server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); server.Start(); @@ -789,34 +768,19 @@ public async Task ShutdownAsyncTakesCheckpointWhenStorageTierEnabled() [Test] public async Task ShutdownAsyncRespectsTimeout() { - // This test verifies that shutdown respects the timeout parameter - // Arrange - server.Dispose(); - var testServer = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir + "_timeout"); - testServer.Start(); - - // Create a connection that will remain active + // Arrange - keep connection open to force timeout path using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); db.StringSet("key", "value"); - // Act - Shutdown with very short timeout (100ms) - // With an active connection, shutdown should timeout quickly rather than waiting indefinitely - var stopwatch = System.Diagnostics.Stopwatch.StartNew(); - await testServer.ShutdownAsync(TimeSpan.FromMilliseconds(100)); - stopwatch.Stop(); - - // Assert - Should complete within reasonable time (timeout + some overhead for AOF/checkpoint) - // The timeout is for waiting on connections, but shutdown also does AOF commit and checkpoint - // So we allow more time than the timeout itself - ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 5000, - $"Shutdown should complete within reasonable time. Actual: {stopwatch.ElapsedMilliseconds}ms"); - - // Verify it completed faster than a longer timeout would take - ClassicAssert.Less(stopwatch.ElapsedMilliseconds, 2000, - "Shutdown with short timeout should be faster than longer timeout"); + // Act - Shutdown with very short timeout + var sw = System.Diagnostics.Stopwatch.StartNew(); + await server.ShutdownAsync(TimeSpan.FromMilliseconds(100)); + sw.Stop(); - testServer.Dispose(); + // Assert - Should complete within reasonable time + ClassicAssert.Less(sw.ElapsedMilliseconds, 5_000, + $"Shutdown should complete within reasonable time. Actual: {sw.ElapsedMilliseconds}ms"); } #endregion } From e9b0a3e5e04abf0c58af3e1b9f18b6ceb1f1c4e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 21:14:31 +0900 Subject: [PATCH 30/37] =?UTF-8?q?=E2=9C=A8Add=20noSave=20Arg=20to=20Server?= =?UTF-8?q?.ShutdownAsync()?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit for skip availiablity about Data Persistence making during Shutdown Process : To Reflect PR Comment from @badrishc --- libs/host/GarnetServer.cs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index fc1b6d6c208..3d53b6c46a1 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -437,9 +437,10 @@ public void Start() /// Stops accepting new connections, waits for active connections to complete, commits AOF, and takes checkpoint if needed. /// /// Timeout for waiting on active connections (default: 30 seconds) + /// If true, skip data persistence (AOF commit and checkpoint) during shutdown /// Cancellation token /// Task representing the async shutdown operation - public async Task ShutdownAsync(TimeSpan? timeout = null, CancellationToken token = default) + public async Task ShutdownAsync(TimeSpan? timeout = null, bool noSave = false, CancellationToken token = default) { var shutdownTimeout = timeout ?? TimeSpan.FromSeconds(30); @@ -464,17 +465,24 @@ public async Task ShutdownAsync(TimeSpan? timeout = null, CancellationToken toke } finally { - // Always attempt AOF commit and checkpoint as best-effort, - // even if connection draining was cancelled or failed. - // Use a bounded timeout instead of the caller's token to ensure completion. - using var finalizeCts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); - try + if (!noSave) { - await FinalizeDataAsync(finalizeCts.Token).ConfigureAwait(false); + // Attempt AOF commit and checkpoint as best-effort, + // even if connection draining was cancelled or failed. + // Use a bounded timeout instead of the caller's token to ensure completion. + using var finalizeCts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); + try + { + await FinalizeDataAsync(finalizeCts.Token).ConfigureAwait(false); + } + catch (Exception ex) + { + logger?.LogError(ex, "Error during data finalization"); + } } - catch (Exception ex) + else { - logger?.LogError(ex, "Error during data finalization"); + logger?.LogInformation("Shutdown with noSave flag - skipping data persistence."); } } } From aaaf45bdf6b0596231c1abad9483990ace8d14f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 22:21:54 +0900 Subject: [PATCH 31/37] Add shutdown data consistency tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new NUnit test suite ShutdownDataConsistencyTests to exercise data recovery across various shutdown/finalization sequences. The tests create a GarnetServer with AOF and storage enabled, populate main (string) and object (sorted set) stores, perform combinations of checkpoint and AOF commit (including interleaved writes), recover the server, and assert correctness. Covers scenarios: checkpoint→AOF, AOF→checkpoint, AOF only, checkpoint only, no finalization (baseline), checkpoint→more writes→AOF, and AOF→more writes→checkpoint. Uses StackExchange.Redis, TestUtils helpers, and verifies both main and object store recovery for KeyCount entries. --- .../ShutdownDataConsistencyTests.cs | 420 ++++++++++++++++++ 1 file changed, 420 insertions(+) create mode 100644 test/Garnet.test/ShutdownDataConsistencyTests.cs diff --git a/test/Garnet.test/ShutdownDataConsistencyTests.cs b/test/Garnet.test/ShutdownDataConsistencyTests.cs new file mode 100644 index 00000000000..8b2b4618fba --- /dev/null +++ b/test/Garnet.test/ShutdownDataConsistencyTests.cs @@ -0,0 +1,420 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +using Allure.NUnit; +using NUnit.Framework; +using NUnit.Framework.Legacy; +using StackExchange.Redis; + +namespace Garnet.test +{ + /// + /// Tests to investigate data consistency under different shutdown finalization sequences: + /// 1. Checkpoint first, then AOF commit + /// 2. AOF commit first, then Checkpoint (current production order) + /// 3. AOF commit only + /// 4. Checkpoint only + /// + /// Each test writes data to main store (string keys) and object store (sorted sets), + /// performs the finalization sequence, disposes without cleanup, recovers, and verifies data. + /// + [AllureNUnit] + [TestFixture] + public class ShutdownDataConsistencyTests : AllureTestBase + { + private GarnetServer server; + + private const int KeyCount = 50; + private const string MainStoreKeyPrefix = "shutdowntest:key:"; + private const string MainStoreValuePrefix = "shutdowntest:value:"; + private const string ObjectStoreKeyPrefix = "shutdowntest:zset:"; + + [SetUp] + public void Setup() + { + TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true); + } + + [TearDown] + public void TearDown() + { + server?.Dispose(); + TestUtils.DeleteDirectory(TestUtils.MethodTestDir); + } + + /// + /// Creates a server with both AOF and storage tier enabled (low memory forces spill to disk). + /// + private GarnetServer CreateServerWithAofAndStorage(bool tryRecover = false) + { + return TestUtils.CreateGarnetServer( + TestUtils.MethodTestDir, + enableAOF: true, + lowMemory: true, + tryRecover: tryRecover); + } + + /// + /// Populates main store with string key-value pairs and object store with sorted sets. + /// + private void PopulateData() + { + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig(allowAdmin: true)); + var db = redis.GetDatabase(0); + + // Main store: string keys + for (var i = 0; i < KeyCount; i++) + { + db.StringSet($"{MainStoreKeyPrefix}{i}", $"{MainStoreValuePrefix}{i}"); + } + + // Object store: sorted sets + for (var i = 0; i < KeyCount; i++) + { + var entries = new SortedSetEntry[] + { + new($"member_a_{i}", i * 10), + new($"member_b_{i}", i * 10 + 1), + new($"member_c_{i}", i * 10 + 2), + }; + db.SortedSetAdd($"{ObjectStoreKeyPrefix}{i}", entries); + } + } + + /// + /// Verifies all main store and object store data is recovered correctly. + /// Returns (mainStoreRecovered, objectStoreRecovered) counts. + /// + private (int mainStoreRecovered, int objectStoreRecovered) VerifyRecoveredData() + { + using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); + var db = redis.GetDatabase(0); + + var mainStoreRecovered = 0; + for (var i = 0; i < KeyCount; i++) + { + var value = db.StringGet($"{MainStoreKeyPrefix}{i}"); + if (value.HasValue) + { + ClassicAssert.AreEqual($"{MainStoreValuePrefix}{i}", value.ToString(), + $"Main store key {MainStoreKeyPrefix}{i} has wrong value after recovery"); + mainStoreRecovered++; + } + } + + var objectStoreRecovered = 0; + for (var i = 0; i < KeyCount; i++) + { + var members = db.SortedSetRangeByScoreWithScores($"{ObjectStoreKeyPrefix}{i}"); + if (members.Length > 0) + { + ClassicAssert.AreEqual(3, members.Length, + $"Object store key {ObjectStoreKeyPrefix}{i} should have 3 members"); + ClassicAssert.AreEqual($"member_a_{i}", members[0].Element.ToString()); + ClassicAssert.AreEqual(i * 10, members[0].Score); + ClassicAssert.AreEqual($"member_b_{i}", members[1].Element.ToString()); + ClassicAssert.AreEqual(i * 10 + 1, members[1].Score); + ClassicAssert.AreEqual($"member_c_{i}", members[2].Element.ToString()); + ClassicAssert.AreEqual(i * 10 + 2, members[2].Score); + objectStoreRecovered++; + } + } + + return (mainStoreRecovered, objectStoreRecovered); + } + + /// + /// Scenario 1: Checkpoint → AOF commit sequence. + /// Takes checkpoint first, then commits AOF. + /// + [Test] + public void CheckpointThenAofCommit_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + PopulateData(); + + // Sequence: Checkpoint first, then AOF commit + server.Store.TakeCheckpoint(background: false); + server.Store.CommitAOF(spinWait: true); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + var (mainRecovered, objRecovered) = VerifyRecoveredData(); + + TestContext.Progress.WriteLine( + $"[Checkpoint→AOF] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); + + ClassicAssert.AreEqual(KeyCount, mainRecovered, + "Checkpoint→AOF: Not all main store keys recovered"); + ClassicAssert.AreEqual(KeyCount, objRecovered, + "Checkpoint→AOF: Not all object store keys recovered"); + } + + /// + /// Scenario 2: AOF commit → Checkpoint sequence (current production order). + /// Commits AOF first, then takes checkpoint. + /// + [Test] + public void AofCommitThenCheckpoint_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + PopulateData(); + + // Sequence: AOF commit first, then Checkpoint (matches current FinalizeDataAsync order) + server.Store.CommitAOF(spinWait: true); + server.Store.TakeCheckpoint(background: false); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + var (mainRecovered, objRecovered) = VerifyRecoveredData(); + + TestContext.Progress.WriteLine( + $"[AOF→Checkpoint] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); + + ClassicAssert.AreEqual(KeyCount, mainRecovered, + "AOF→Checkpoint: Not all main store keys recovered"); + ClassicAssert.AreEqual(KeyCount, objRecovered, + "AOF→Checkpoint: Not all object store keys recovered"); + } + + /// + /// Scenario 3: AOF commit only (no checkpoint). + /// Only commits AOF before shutdown. + /// + [Test] + public void AofCommitOnly_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + PopulateData(); + + // Sequence: AOF commit only + server.Store.CommitAOF(spinWait: true); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + var (mainRecovered, objRecovered) = VerifyRecoveredData(); + + TestContext.Progress.WriteLine( + $"[AOF Only] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); + + ClassicAssert.AreEqual(KeyCount, mainRecovered, + "AOF Only: Not all main store keys recovered"); + ClassicAssert.AreEqual(KeyCount, objRecovered, + "AOF Only: Not all object store keys recovered"); + } + + /// + /// Scenario 4: Checkpoint only (no AOF commit). + /// Only takes checkpoint before shutdown. + /// + [Test] + public void CheckpointOnly_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + PopulateData(); + + // Sequence: Checkpoint only (no AOF commit) + server.Store.TakeCheckpoint(background: false); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + var (mainRecovered, objRecovered) = VerifyRecoveredData(); + + TestContext.Progress.WriteLine( + $"[Checkpoint Only] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); + + // Note: With checkpoint only (no AOF commit), data written after the last + // checkpoint but before the AOF commit may be lost. This test documents + // the actual behavior for investigation purposes. + ClassicAssert.AreEqual(KeyCount, mainRecovered, + "Checkpoint Only: Not all main store keys recovered"); + ClassicAssert.AreEqual(KeyCount, objRecovered, + "Checkpoint Only: Not all object store keys recovered"); + } + + /// + /// Scenario 5: No finalization at all (baseline - expect potential data loss). + /// Neither AOF commit nor checkpoint before shutdown. + /// This serves as a negative baseline to confirm that finalization is actually needed. + /// + [Test] + public void NoFinalization_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + PopulateData(); + + // No finalization at all - just dispose + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + var (mainRecovered, objRecovered) = VerifyRecoveredData(); + + TestContext.Progress.WriteLine( + $"[No Finalization] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); + + // This is a baseline test - data loss is expected here. + // The purpose is to show that finalization (AOF commit and/or checkpoint) is required. + TestContext.Progress.WriteLine( + $"[No Finalization] Data loss: main store={KeyCount - mainRecovered}, object store={KeyCount - objRecovered}"); + } + + /// + /// Scenario 6: Interleaved writes with checkpoint then additional writes with AOF commit. + /// Simulates the case where new writes happen between checkpoint and AOF commit. + /// + [Test] + public void CheckpointThenMoreWritesThenAofCommit_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + // Phase 1: Initial data + PopulateData(); + + // Take checkpoint + server.Store.TakeCheckpoint(background: false); + + // Phase 2: Write additional data AFTER checkpoint + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + for (var i = KeyCount; i < KeyCount * 2; i++) + { + db.StringSet($"{MainStoreKeyPrefix}{i}", $"{MainStoreValuePrefix}{i}"); + } + } + + // Now commit AOF (should capture phase 2 writes) + server.Store.CommitAOF(spinWait: true); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + + var phase1Recovered = 0; + for (var i = 0; i < KeyCount; i++) + { + var value = db.StringGet($"{MainStoreKeyPrefix}{i}"); + if (value.HasValue && value.ToString() == $"{MainStoreValuePrefix}{i}") + phase1Recovered++; + } + + var phase2Recovered = 0; + for (var i = KeyCount; i < KeyCount * 2; i++) + { + var value = db.StringGet($"{MainStoreKeyPrefix}{i}"); + if (value.HasValue && value.ToString() == $"{MainStoreValuePrefix}{i}") + phase2Recovered++; + } + + TestContext.Progress.WriteLine( + $"[Checkpoint→Writes→AOF] Phase1: {phase1Recovered}/{KeyCount}, Phase2 (post-checkpoint): {phase2Recovered}/{KeyCount}"); + + ClassicAssert.AreEqual(KeyCount, phase1Recovered, + "Checkpoint→Writes→AOF: Not all phase 1 keys recovered"); + ClassicAssert.AreEqual(KeyCount, phase2Recovered, + "Checkpoint→Writes→AOF: Not all phase 2 (post-checkpoint) keys recovered"); + } + } + + /// + /// Scenario 7: AOF commit then additional writes then checkpoint. + /// Simulates the case where new writes happen between AOF commit and checkpoint. + /// + [Test] + public void AofCommitThenMoreWritesThenCheckpoint_DataConsistencyTest() + { + server = CreateServerWithAofAndStorage(); + server.Start(); + + // Phase 1: Initial data + PopulateData(); + + // Commit AOF + server.Store.CommitAOF(spinWait: true); + + // Phase 2: Write additional data AFTER AOF commit + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + for (var i = KeyCount; i < KeyCount * 2; i++) + { + db.StringSet($"{MainStoreKeyPrefix}{i}", $"{MainStoreValuePrefix}{i}"); + } + } + + // Now take checkpoint (should capture phase 2 writes) + server.Store.TakeCheckpoint(background: false); + + server.Dispose(false); + + // Recover and verify + server = CreateServerWithAofAndStorage(tryRecover: true); + server.Start(); + + using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) + { + var db = redis.GetDatabase(0); + + var phase1Recovered = 0; + for (var i = 0; i < KeyCount; i++) + { + var value = db.StringGet($"{MainStoreKeyPrefix}{i}"); + if (value.HasValue && value.ToString() == $"{MainStoreValuePrefix}{i}") + phase1Recovered++; + } + + var phase2Recovered = 0; + for (var i = KeyCount; i < KeyCount * 2; i++) + { + var value = db.StringGet($"{MainStoreKeyPrefix}{i}"); + if (value.HasValue && value.ToString() == $"{MainStoreValuePrefix}{i}") + phase2Recovered++; + } + + TestContext.Progress.WriteLine( + $"[AOF→Writes→Checkpoint] Phase1: {phase1Recovered}/{KeyCount}, Phase2 (post-AOF): {phase2Recovered}/{KeyCount}"); + + ClassicAssert.AreEqual(KeyCount, phase1Recovered, + "AOF→Writes→Checkpoint: Not all phase 1 keys recovered"); + ClassicAssert.AreEqual(KeyCount, phase2Recovered, + "AOF→Writes→Checkpoint: Not all phase 2 (post-AOF) keys recovered"); + } + } + } +} From 714c2f6ff856870efae36783901997a03ba2409a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 22:28:22 +0900 Subject: [PATCH 32/37] Rearrange AOF commit and checkpoint during Data saving : To refelect PR Review comment from @badrishc --- libs/host/GarnetServer.cs | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 3d53b6c46a1..2b87bc66160 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -582,49 +582,49 @@ private long GetActiveConnectionCount() /// private async Task FinalizeDataAsync(CancellationToken token) { - // Commit AOF before checkpoint/shutdown - if (opts.EnableAOF) + // Take checkpoint for tiered storage + if (opts.EnableStorageTier) { - logger?.LogDebug("Committing AOF before shutdown..."); + logger?.LogDebug("Taking checkpoint for tiered storage..."); try { - var commitSuccess = await Store.CommitAOFAsync(token).ConfigureAwait(false); - if (commitSuccess) + var checkpointSuccess = Store.TakeCheckpoint(background: false, token: token); + if (checkpointSuccess) { - logger?.LogDebug("AOF committed successfully."); + logger?.LogDebug("Checkpoint completed successfully."); } else { - logger?.LogInformation("AOF commit skipped (another commit in progress or replica mode)."); + logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode)."); } } catch (Exception ex) { - logger?.LogError(ex, "Error committing AOF during shutdown"); + logger?.LogError(ex, "Error taking checkpoint during shutdown"); } } - // Take checkpoint for tiered storage - if (opts.EnableStorageTier) + // Commit AOF after checkpoint to ensure all data is persisted, but only if AOF is enabled and we're not in replica mode (since replicas should not commit AOF) + if (opts.EnableAOF) { - logger?.LogDebug("Taking checkpoint for tiered storage..."); + logger?.LogDebug("Committing AOF before shutdown..."); try { - var checkpointSuccess = Store.TakeCheckpoint(background: false, token: token); - if (checkpointSuccess) + var commitSuccess = await Store.CommitAOFAsync(token).ConfigureAwait(false); + if (commitSuccess) { - logger?.LogDebug("Checkpoint completed successfully."); + logger?.LogDebug("AOF committed successfully."); } else { - logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode)."); + logger?.LogInformation("AOF commit skipped (another commit in progress or replica mode)."); } } catch (Exception ex) { - logger?.LogError(ex, "Error taking checkpoint during shutdown"); + logger?.LogError(ex, "Error committing AOF during shutdown"); } - } + } } /// From 08e48a283aeaf1e8af166823bcd733e9b4635b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 23:01:19 +0900 Subject: [PATCH 33/37] Save data only once during single shutdown process To reflect @badrishc 's Comments AOF or Checkpoint single call is enough to prevent data loss --- libs/host/GarnetServer.cs | 52 +++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/libs/host/GarnetServer.cs b/libs/host/GarnetServer.cs index 2b87bc66160..2a1bf56254e 100644 --- a/libs/host/GarnetServer.cs +++ b/libs/host/GarnetServer.cs @@ -467,7 +467,7 @@ public async Task ShutdownAsync(TimeSpan? timeout = null, bool noSave = false, C { if (!noSave) { - // Attempt AOF commit and checkpoint as best-effort, + // Attempt AOF commit or checkpoint as best-effort, // even if connection draining was cancelled or failed. // Use a bounded timeout instead of the caller's token to ensure completion. using var finalizeCts = new CancellationTokenSource(TimeSpan.FromSeconds(15)); @@ -578,33 +578,10 @@ private long GetActiveConnectionCount() } /// - /// Commits AOF and takes checkpoint for data durability during shutdown. + /// Persists data during shutdown using AOF or checkpoint based on configuration. /// private async Task FinalizeDataAsync(CancellationToken token) { - // Take checkpoint for tiered storage - if (opts.EnableStorageTier) - { - logger?.LogDebug("Taking checkpoint for tiered storage..."); - try - { - var checkpointSuccess = Store.TakeCheckpoint(background: false, token: token); - if (checkpointSuccess) - { - logger?.LogDebug("Checkpoint completed successfully."); - } - else - { - logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode)."); - } - } - catch (Exception ex) - { - logger?.LogError(ex, "Error taking checkpoint during shutdown"); - } - } - - // Commit AOF after checkpoint to ensure all data is persisted, but only if AOF is enabled and we're not in replica mode (since replicas should not commit AOF) if (opts.EnableAOF) { logger?.LogDebug("Committing AOF before shutdown..."); @@ -624,7 +601,30 @@ private async Task FinalizeDataAsync(CancellationToken token) { logger?.LogError(ex, "Error committing AOF during shutdown"); } - } + + return; + } + + if (!opts.EnableStorageTier) + return; + + logger?.LogDebug("Taking checkpoint for tiered storage..."); + try + { + var checkpointSuccess = Store.TakeCheckpoint(background: false, token: token); + if (checkpointSuccess) + { + logger?.LogDebug("Checkpoint completed successfully."); + } + else + { + logger?.LogInformation("Checkpoint skipped (another checkpoint in progress or replica mode)."); + } + } + catch (Exception ex) + { + logger?.LogError(ex, "Error taking checkpoint during shutdown"); + } } /// From c6fcb198ff2273366cceab88db7c4c2088b2b008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 23:43:39 +0900 Subject: [PATCH 34/37] Remove isListening Flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I test with test code and we don't needs isListnening Flag following test codes proved ``` @ -0,0 +1,295 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. using System; using System.Net; using System.Net.Sockets; using System.Threading; using System.Threading.Tasks; using Allure.NUnit; using NUnit.Framework; using NUnit.Framework.Legacy; using StackExchange.Redis; namespace Garnet.test { /// /// Tests validating graceful shutdown behavior: /// - StopListening prevents new connections (via listenSocket.Close + ObjectDisposedException/SocketError) /// - Existing connections remain functional after StopListening /// - ShutdownAsync performs end-to-end graceful shutdown /// - Data written before shutdown is preserved after recovery /// /// These tests demonstrate that the socket-close mechanism (ObjectDisposedException catch /// and SocketError.OperationAborted in HandleNewConnection) is sufficient to stop accepting /// new connections — the isListening flag is not strictly required for correctness. /// [AllureNUnit] [TestFixture] public class GracefulShutdownTests : AllureTestBase { private GarnetServer server; [SetUp] public void Setup() { TestUtils.DeleteDirectory(TestUtils.MethodTestDir, wait: true); } [TearDown] public void TearDown() { server?.Dispose(); TestUtils.DeleteDirectory(TestUtils.MethodTestDir); } /// /// Validates that after calling ShutdownAsync (which calls StopListening internally), /// no new TCP connections can be established to the server port. /// This proves that listenSocket.Close() alone is sufficient to reject new connections. /// [Test] public async Task StopListening_RejectsNewConnections() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); server.Start(); // Verify server is accepting connections using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); db.StringSet("before-shutdown", "value1"); ClassicAssert.AreEqual("value1", db.StringGet("before-shutdown").ToString()); } // Perform graceful shutdown (calls StopListening → listenSocket.Close()) await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5)).ConfigureAwait(false); // Verify new TCP connections are refused after StopListening // Use raw socket to avoid SE.Redis retry/reconnect logic using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); var connectException = Assert.ThrowsAsync(async () => { await socket.ConnectAsync(new IPEndPoint(IPAddress.Loopback, TestUtils.TestPort)).ConfigureAwait(false); }); // Connection should be refused (or reset) because the listen socket is closed ClassicAssert.IsTrue( connectException.SocketErrorCode == SocketError.ConnectionRefused || connectException.SocketErrorCode == SocketError.ConnectionReset, $"Expected ConnectionRefused or ConnectionReset but got {connectException.SocketErrorCode}"); } /// /// Validates that an existing connection established before StopListening /// continues to function until the server is fully disposed. /// This proves that StopListening only affects the accept loop, not active handlers. /// [Test] public async Task StopListening_ExistingConnectionsContinueWorking() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); server.Start(); // Establish a connection before shutdown using var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig()); var db = redis.GetDatabase(0); // Verify connection works db.StringSet("key1", "value1"); ClassicAssert.AreEqual("value1", db.StringGet("key1").ToString()); // Stop listening (but don't dispose yet — just stop accepting new connections) // ShutdownAsync calls StopListening internally, but with a short timeout // and noSave=true to avoid waiting for data finalization await server.ShutdownAsync(timeout: TimeSpan.FromMilliseconds(100), noSave: true).ConfigureAwait(false); // The existing connection should still be able to execute commands // because StopListening only closes the listen socket, not active connections try { var result = db.StringGet("key1"); // If we get here, the existing connection is still alive ClassicAssert.AreEqual("value1", result.ToString()); } catch (RedisConnectionException) { // If the server disposed active handlers as part of shutdown, // a connection exception is also acceptable behavior } } /// /// End-to-end graceful shutdown test: writes data, calls ShutdownAsync with data persistence, /// disposes, recovers, and verifies all data is intact. /// This validates the full shutdown sequence: StopListening → WaitForConnections → FinalizeData. /// [Test] public async Task ShutdownAsync_PreservesDataAfterRecovery() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true); server.Start(); // Write data using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); for (var i = 0; i < 20; i++) { db.StringSet($"graceful:key:{i}", $"graceful:value:{i}"); } } // Perform graceful shutdown (includes AOF commit) await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5)).ConfigureAwait(false); server.Dispose(false); // Recover and verify server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true, tryRecover: true); server.Start(); using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); for (var i = 0; i < 20; i++) { var value = db.StringGet($"graceful:key:{i}"); ClassicAssert.IsTrue(value.HasValue, $"Key graceful:key:{i} should exist after recovery"); ClassicAssert.AreEqual($"graceful:value:{i}", value.ToString(), $"Key graceful:key:{i} has wrong value after recovery"); } } } /// /// Validates that calling ShutdownAsync with cancellation token works correctly. /// The shutdown should stop gracefully when cancelled and still leave the server /// in a consistent state for dispose. /// [Test] public async Task ShutdownAsync_CancellationStopsGracefully() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true); server.Start(); // Write some data using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); db.StringSet("cancel-test", "some-value"); } // Cancel immediately using var cts = new CancellationTokenSource(); cts.Cancel(); // ShutdownAsync should handle cancellation gracefully (not throw) await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5), token: cts.Token).ConfigureAwait(false); // Server should still be disposable without error server.Dispose(false); // Recover — data may or may not be persisted (cancellation interrupted finalization), // but the server should start cleanly server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir, enableAOF: true, tryRecover: true); Assert.DoesNotThrow(() => server.Start()); } /// /// Validates that multiple rapid calls to ShutdownAsync / StopListening are idempotent /// and do not cause exceptions or deadlocks. /// Without isListening, this is handled by ObjectDisposedException on the already-closed socket. /// [Test] public async Task ShutdownAsync_MultipleCallsAreIdempotent() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); server.Start(); // Verify server works using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); db.StringSet("idempotent-test", "value"); ClassicAssert.AreEqual("value", db.StringGet("idempotent-test").ToString()); } // Call ShutdownAsync multiple times — should not throw or deadlock await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(2), noSave: true).ConfigureAwait(false); await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(2), noSave: true).ConfigureAwait(false); await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(2), noSave: true).ConfigureAwait(false); // Should still dispose cleanly server.Dispose(); server = null; } /// /// Validates that the accept loop terminates correctly when the listen socket is closed, /// even under concurrent connection attempts. /// This simulates the race condition where connections arrive while StopListening is called. /// [Test] public async Task StopListening_UnderConcurrentConnectionAttempts() { server = TestUtils.CreateGarnetServer(TestUtils.MethodTestDir); server.Start(); // Verify server is working using (var redis = ConnectionMultiplexer.Connect(TestUtils.GetConfig())) { var db = redis.GetDatabase(0); db.StringSet("concurrent-test", "value"); } // Start concurrent connection attempts var connectionAttempts = 0; var connectionFailures = 0; using var stopCts = new CancellationTokenSource(); var connectTask = Task.Run(async () => { while (!stopCts.Token.IsCancellationRequested) { try { using var socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); socket.ReceiveTimeout = 1000; socket.SendTimeout = 1000; await socket.ConnectAsync(new IPEndPoint(IPAddress.Loopback, TestUtils.TestPort)).ConfigureAwait(false); Interlocked.Increment(ref connectionAttempts); socket.Close(); } catch { Interlocked.Increment(ref connectionFailures); } await Task.Delay(10, CancellationToken.None).ConfigureAwait(false); } }); // Give some time for connection attempts to start await Task.Delay(100).ConfigureAwait(false); // Now shut down await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(3), noSave: true).ConfigureAwait(false); // Stop connection attempts stopCts.Cancel(); await connectTask.ConfigureAwait(false); // Verify that at least some connections succeeded before shutdown // and at least some failed after shutdown TestContext.Progress.WriteLine( $"Connection attempts: {connectionAttempts} succeeded, {connectionFailures} failed"); ClassicAssert.IsTrue(connectionAttempts > 0, "Should have had at least some successful connections before shutdown"); // Clean dispose server.Dispose(); server = null; } } } ``` --- libs/server/Servers/GarnetServerTcp.cs | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/libs/server/Servers/GarnetServerTcp.cs b/libs/server/Servers/GarnetServerTcp.cs index f00f181303b..b7485724dda 100644 --- a/libs/server/Servers/GarnetServerTcp.cs +++ b/libs/server/Servers/GarnetServerTcp.cs @@ -28,7 +28,6 @@ public class GarnetServerTcp : GarnetServerBase, IServerHook readonly int networkConnectionLimit; readonly string unixSocketPath; readonly UnixFileMode unixSocketPermission; - volatile bool isListening; /// public override IEnumerable ActiveConsumers() @@ -118,7 +117,6 @@ public override void Start() } listenSocket.Listen(512); - isListening = true; if (!listenSocket.AcceptAsync(acceptEventArg)) AcceptEventArg_Completed(null, acceptEventArg); } @@ -126,10 +124,6 @@ public override void Start() /// public override void StopListening() { - if (!isListening) - return; - - isListening = false; try { // Close the listen socket to stop accepting new connections @@ -149,18 +143,9 @@ private void AcceptEventArg_Completed(object sender, SocketAsyncEventArgs e) { do { - // Check isListening flag before processing and before calling AcceptAsync again - if (!isListening) - { - // Dispose any accepted socket that won't be handled - e.AcceptSocket?.Dispose(); - e.AcceptSocket = null; - break; - } - if (!HandleNewConnection(e)) break; e.AcceptSocket = null; - } while (isListening && !listenSocket.AcceptAsync(e)); + } while (!listenSocket.AcceptAsync(e)); } // socket disposed catch (ObjectDisposedException) { } From 4ccad8e3988dc3fc627c1b7b4a93d28231d6974f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Mon, 16 Feb 2026 23:51:46 +0900 Subject: [PATCH 35/37] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20Fix=20format?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/Garnet.test/ShutdownDataConsistencyTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Garnet.test/ShutdownDataConsistencyTests.cs b/test/Garnet.test/ShutdownDataConsistencyTests.cs index 8b2b4618fba..a009155202c 100644 --- a/test/Garnet.test/ShutdownDataConsistencyTests.cs +++ b/test/Garnet.test/ShutdownDataConsistencyTests.cs @@ -417,4 +417,4 @@ public void AofCommitThenMoreWritesThenCheckpoint_DataConsistencyTest() } } } -} +} \ No newline at end of file From 60faf96f50d88fc5b4ac6b14a2018e8bf4191d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Tue, 17 Feb 2026 00:33:22 +0900 Subject: [PATCH 36/37] =?UTF-8?q?=E2=9C=8F=EF=B8=8F=20Fix=20comment=20abou?= =?UTF-8?q?t=20infomational=20Test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit NoFinalization isn't ensure data can't fully recovery If already backgroud aof or checkpoint created thus recoveryable so this test scenario 5 at Test\ShutdownDataConsistencyTests is only perfoming test for infomational --- test/Garnet.test/ShutdownDataConsistencyTests.cs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/Garnet.test/ShutdownDataConsistencyTests.cs b/test/Garnet.test/ShutdownDataConsistencyTests.cs index a009155202c..9d42853949a 100644 --- a/test/Garnet.test/ShutdownDataConsistencyTests.cs +++ b/test/Garnet.test/ShutdownDataConsistencyTests.cs @@ -257,9 +257,9 @@ public void CheckpointOnly_DataConsistencyTest() } /// - /// Scenario 5: No finalization at all (baseline - expect potential data loss). + /// Scenario 5: No finalization at all (baseline - result is configuration-dependent). /// Neither AOF commit nor checkpoint before shutdown. - /// This serves as a negative baseline to confirm that finalization is actually needed. + /// Recovery results may vary if data was already persisted by background AOF or storage tier spill. /// [Test] public void NoFinalization_DataConsistencyTest() @@ -269,7 +269,7 @@ public void NoFinalization_DataConsistencyTest() PopulateData(); - // No finalization at all - just dispose + // No explicit finalization before shutdown server.Dispose(false); // Recover and verify @@ -281,8 +281,7 @@ public void NoFinalization_DataConsistencyTest() TestContext.Progress.WriteLine( $"[No Finalization] Main store: {mainRecovered}/{KeyCount}, Object store: {objRecovered}/{KeyCount}"); - // This is a baseline test - data loss is expected here. - // The purpose is to show that finalization (AOF commit and/or checkpoint) is required. + // Baseline observation only: recovery depends on prior persistence behavior. TestContext.Progress.WriteLine( $"[No Finalization] Data loss: main store={KeyCount - mainRecovered}, object store={KeyCount - objRecovered}"); } From 36a85b9f7223d17c17dc8ca3fb369e4df2860c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B9=80=EC=9C=A0=EC=84=9D=28Yu=20Seok=20Kim=29?= Date: Tue, 17 Feb 2026 00:43:21 +0900 Subject: [PATCH 37/37] Skip graceful save on forced shutdown Detect cancellation requests and pass a noSave flag to server.ShutdownAsync so AOF commit/checkpoint are skipped during forced (OS) shutdowns. Keeps the existing 5s timeout and cancellation token, but avoids a lengthy graceful save when cancellationToken.IsCancellationRequested, allowing faster disposal while preserving normal graceful shutdown otherwise. --- hosting/Windows/Garnet.worker/Worker.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hosting/Windows/Garnet.worker/Worker.cs b/hosting/Windows/Garnet.worker/Worker.cs index 82d47408aa4..7f8839a55ee 100644 --- a/hosting/Windows/Garnet.worker/Worker.cs +++ b/hosting/Windows/Garnet.worker/Worker.cs @@ -47,8 +47,10 @@ public override async Task StopAsync(CancellationToken cancellationToken) { if (server != null) { - // Perform graceful shutdown with AOF commit and checkpoint - await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5), token: cancellationToken).ConfigureAwait(false); + // If cancellation is requested, we will skip the graceful shutdown and proceed to dispose immediately + bool isForceShutdown = cancellationToken.IsCancellationRequested; + // Perform graceful shutdown with AOF commit and checkpoint when not forced Shutdown From OS. + await server.ShutdownAsync(timeout: TimeSpan.FromSeconds(5), noSave: isForceShutdown, token: cancellationToken).ConfigureAwait(false); } } catch (OperationCanceledException)