From e018705a082815d435ac2c6672a21027c88c6707 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Mon, 21 Aug 2023 11:36:42 -0700 Subject: [PATCH 1/3] Fixed deadlock when checkAndSync was being called as part of storing message. We violated the locking pattern, so we now make sure we do this in a separate Go routine and put checks to only run it once. Signed-off-by: Derek Collison --- server/jetstream.go | 14 +++++++++++++- server/jetstream_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/server/jetstream.go b/server/jetstream.go index d4f642ed3d..7143c384e9 100644 --- a/server/jetstream.go +++ b/server/jetstream.go @@ -148,6 +148,9 @@ type jsAccount struct { // From server sendq *ipQueue[*pubMsg] + // For limiting only running one checkAndSync at a time. + sync atomic.Bool + // Usage/limits related fields that will be protected by usageMu usageMu sync.RWMutex limits map[string]JetStreamAccountLimits // indexed by tierName @@ -1811,6 +1814,12 @@ func (jsa *jsAccount) remoteUpdateUsage(sub *subscription, c *client, _ *Account // When we detect a skew of some sort this will verify the usage reporting is correct. // No locks should be held. func (jsa *jsAccount) checkAndSyncUsage(tierName string, storeType StorageType) { + // This will run in a separate go routine, so check that we are only running once. + if !jsa.sync.CompareAndSwap(false, true) { + return + } + defer jsa.sync.Store(true) + // Hold the account read lock and the usage lock while we calculate. // We scope by tier and storage type, but if R3 File has 200 streams etc. could // show a pause. I did test with > 100 non-active streams and was 80-200ns or so. @@ -1916,7 +1925,10 @@ func (jsa *jsAccount) updateUsage(tierName string, storeType StorageType, delta jsa.usageMu.Unlock() if needsCheck { - jsa.checkAndSyncUsage(tierName, storeType) + // We could be holding the stream lock from up in the stack, and this + // will want the jsa lock, which would violate locking order. + // So do this in a Go routine. The function will check if it is already running. + go jsa.checkAndSyncUsage(tierName, storeType) } } diff --git a/server/jetstream_test.go b/server/jetstream_test.go index 9151c53367..b2214f56a5 100644 --- a/server/jetstream_test.go +++ b/server/jetstream_test.go @@ -20408,3 +20408,30 @@ func TestJetStreamLastSequenceBySubjectConcurrent(t *testing.T) { }) } } + +func TestJetStreamUsageSyncDeadlock(t *testing.T) { + s := RunBasicJetStreamServer(t) + defer s.Shutdown() + + nc, js := jsClientConnect(t, s) + defer nc.Close() + + _, err := js.AddStream(&nats.StreamConfig{ + Name: "TEST", + Subjects: []string{"*"}, + }) + require_NoError(t, err) + + sendStreamMsg(t, nc, "foo", "hello") + + // Now purposely mess up the usage that will force a sync. + // Without the fix this will deadlock. + jsa := s.getJetStream().lookupAccount(s.GlobalAccount()) + jsa.usageMu.Lock() + st, ok := jsa.usage[_EMPTY_] + require_True(t, ok) + st.local.store = -1000 + jsa.usageMu.Unlock() + + sendStreamMsg(t, nc, "foo", "hello") +} From 10f73e888e8491e6885f9bcda006738eb23a1dfd Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Mon, 21 Aug 2023 12:02:28 -0700 Subject: [PATCH 2/3] Remove 1.18 compile build, support 1.19 and above Signed-off-by: Derek Collison --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c4d99e35e6..75e7d2b18d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,7 +37,7 @@ jobs: - name: "Run all tests from all other packages" env: TEST_SUITE=non_srv_pkg_tests - name: "Compile with older Go release" - go: 1.18.x + go: 1.19.12 env: TEST_SUITE=build_only script: ./scripts/runTestsOnTravis.sh $TEST_SUITE From 0a86bf4a9a8fa4ee775a426b90cecd05da531f6c Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Mon, 21 Aug 2023 14:57:17 -0700 Subject: [PATCH 3/3] Should reset to false, not true when done Signed-off-by: Derek Collison --- server/jetstream.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/jetstream.go b/server/jetstream.go index 7143c384e9..de79b9d4ea 100644 --- a/server/jetstream.go +++ b/server/jetstream.go @@ -1818,7 +1818,7 @@ func (jsa *jsAccount) checkAndSyncUsage(tierName string, storeType StorageType) if !jsa.sync.CompareAndSwap(false, true) { return } - defer jsa.sync.Store(true) + defer jsa.sync.Store(false) // Hold the account read lock and the usage lock while we calculate. // We scope by tier and storage type, but if R3 File has 200 streams etc. could