From 45859e64760cdb1fdae06acfd8dd5784025aeba1 Mon Sep 17 00:00:00 2001 From: Derek Collison Date: Thu, 23 Feb 2023 12:15:57 -0800 Subject: [PATCH] Make sure preferred peer for stepdown is healthy. Signed-off-by: Derek Collison --- server/jetstream_cluster_2_test.go | 4 +-- server/raft.go | 40 ++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/server/jetstream_cluster_2_test.go b/server/jetstream_cluster_2_test.go index 2b38c45fe6..bd530ca100 100644 --- a/server/jetstream_cluster_2_test.go +++ b/server/jetstream_cluster_2_test.go @@ -199,10 +199,10 @@ func TestJetStreamClusterMultiRestartBug(t *testing.T) { if err != nil { t.Fatalf("Unexpected error: %v", err) } - checkFor(t, 10*time.Second, 250*time.Millisecond, func() error { + checkFor(t, 20*time.Second, 250*time.Millisecond, func() error { si, _ := js2.StreamInfo("TEST") if si == nil || si.Cluster == nil { - t.Fatalf("Did not get stream info") + return fmt.Errorf("No stream info or cluster") } for _, pi := range si.Cluster.Replicas { if !pi.Current { diff --git a/server/raft.go b/server/raft.go index 97b2f00ea2..017f894797 100644 --- a/server/raft.go +++ b/server/raft.go @@ -1281,7 +1281,6 @@ func (n *raft) StepDown(preferred ...string) error { n.debug("Being asked to stepdown") // See if we have up to date followers. - nowts := time.Now().UnixNano() maybeLeader := noLeader if len(preferred) > 0 { if preferred[0] != _EMPTY_ { @@ -1290,21 +1289,42 @@ func (n *raft) StepDown(preferred ...string) error { preferred = nil } } + // Can't pick ourselves. + if maybeLeader == n.id { + maybeLeader = noLeader + preferred = nil + } - for peer, ps := range n.peers { - // If not us and alive and caughtup. - if peer != n.id && (nowts-ps.ts) < int64(hbInterval*3) { - if maybeLeader != noLeader && maybeLeader != peer { + nowts := time.Now().UnixNano() + + // If we have a preferred check it first. + if maybeLeader != noLeader { + var isHealthy bool + if ps, ok := n.peers[maybeLeader]; ok { + si, ok := n.s.nodeToInfo.Load(maybeLeader) + isHealthy = ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) + } + if !isHealthy { + maybeLeader = noLeader + } + } + + // If we do not have a preferred at this point pick the first healthy one. + // Make sure not ourselves. + if maybeLeader == noLeader { + for peer, ps := range n.peers { + if peer == n.id { continue } - if si, ok := n.s.nodeToInfo.Load(peer); !ok || si.(nodeInfo).offline { - continue + si, ok := n.s.nodeToInfo.Load(peer) + isHealthy := ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) + if isHealthy { + maybeLeader = peer + break } - n.debug("Looking at %q which is %v behind", peer, time.Duration(nowts-ps.ts)) - maybeLeader = peer - break } } + stepdown := n.stepdown n.Unlock()