From 8dcb8010887516f3dd4235504883f60602360643 Mon Sep 17 00:00:00 2001 From: Marco Primi Date: Thu, 14 Dec 2023 14:44:10 -0800 Subject: [PATCH] WIP failing test investigation --- server/raft_helpers_test.go | 17 +++++++-- server/raft_test.go | 71 ++++++++++++++++++++++++++++--------- 2 files changed, 69 insertions(+), 19 deletions(-) diff --git a/server/raft_helpers_test.go b/server/raft_helpers_test.go index bff2c202abb..d733fd84e8d 100644 --- a/server/raft_helpers_test.go +++ b/server/raft_helpers_test.go @@ -301,6 +301,7 @@ type raftChainStateMachine struct { hash hash.Hash blocksApplied uint64 blocksAppliedSinceSnapshot uint64 + stopped bool } // Block is just a random array of bytes, but contains a little extra metadata to track its source @@ -317,10 +318,14 @@ func (sm *raftChainStateMachine) logDebug(format string, args ...any) { } func (sm *raftChainStateMachine) server() *Server { + sm.Lock() + defer sm.Unlock() return sm.s } func (sm *raftChainStateMachine) node() RaftNode { + sm.Lock() + defer sm.Unlock() return sm.n } @@ -364,6 +369,9 @@ func (sm *raftChainStateMachine) leaderChange(isLeader bool) { sm.logDebug("Leader change") } sm.leader = isLeader + if isLeader != sm.node().Leader() { + sm.logDebug("⚠️ Leader state out of sync with underlying node") + } } func (sm *raftChainStateMachine) stop() { @@ -372,8 +380,10 @@ func (sm *raftChainStateMachine) stop() { sm.n.Stop() // Clear state, on restart it will be recovered from snapshot or peers + sm.stopped = true sm.blocksApplied = 0 sm.hash.Reset() + sm.leader = false sm.logDebug("Stopped") } @@ -383,6 +393,7 @@ func (sm *raftChainStateMachine) restart() { sm.logDebug("Restarting") + sm.stopped = false if sm.n.State() != Closed { return } @@ -455,12 +466,12 @@ func (sm *raftChainStateMachine) applyBlock(data []byte) { sm.logDebug("Hash after %d blocks: %X ", sm.blocksApplied, sm.hash.Sum(nil)) } -func (sm *raftChainStateMachine) getCurrentHash() (uint64, string) { +func (sm *raftChainStateMachine) getCurrentHash() (bool, uint64, string) { sm.Lock() defer sm.Unlock() - // Return the number of blocks applied and the current running hash - return sm.blocksApplied, fmt.Sprintf("%X", sm.hash.Sum(nil)) + // Return running, the number of blocks applied and the current running hash + return !sm.stopped, sm.blocksApplied, fmt.Sprintf("%X", sm.hash.Sum(nil)) } type chainHashSnapshot struct { diff --git a/server/raft_test.go b/server/raft_test.go index c0657cb4d6d..5b48bdc709f 100644 --- a/server/raft_test.go +++ b/server/raft_test.go @@ -322,7 +322,15 @@ func TestRaftChainOneBlockInLockstep(t *testing.T) { stateMachine.node().ID(), ) checkFor(t, timeout, 500*time.Millisecond, func() error { - blocksCount, currentHash := stateMachine.getCurrentHash() + running, blocksCount, currentHash := stateMachine.getCurrentHash() + // All nodes always running + if !running { + return fmt.Errorf( + "node %s is not running", + nodeName, + ) + } + // Node is behind if blocksCount != iteration { return fmt.Errorf( "node %s applied %d blocks out of %d expected", @@ -413,7 +421,15 @@ func TestRaftChainStopAndCatchUp(t *testing.T) { stateMachine.node().ID(), ) checkFor(t, timeout, 500*time.Millisecond, func() error { - blocksCount, currentHash := stateMachine.getCurrentHash() + running, blocksCount, currentHash := stateMachine.getCurrentHash() + // All nodes should be running + if !running { + return fmt.Errorf( + "node %s not running", + nodeName, + ) + } + // Node is behind if blocksCount != expectedBlocks { return fmt.Errorf( "node %s applied %d blocks out of %d expected", @@ -466,10 +482,10 @@ func FuzzRaftChain(f *testing.F) { checkConvergenceTimeout = 30 * time.Second ) - //RaftChainOptions.verbose = true + RaftChainOptions.verbose = true // Cases to run when executed as unit test: - f.Add(100, int64(123456)) + //f.Add(100, int64(123456)) f.Add(1000, int64(123456)) // Run in Fuzz mode to repeat maximizing coverage and looking for failing cases @@ -560,16 +576,27 @@ func FuzzRaftChain(f *testing.F) { b := strings.Builder{} for _, sm := range rg { csm := sm.(*raftChainStateMachine) - blocksCount, blockHash := csm.getCurrentHash() - b.WriteString( - fmt.Sprintf( - " [%s (%s): %d blocks, hash=%s],", - csm.server().Name(), - csm.node().ID(), - blocksCount, - blockHash, - ), - ) + running, blocksCount, blockHash := csm.getCurrentHash() + if running { + b.WriteString( + fmt.Sprintf( + " [%s (%s): %d blocks, hash=%s],", + csm.server().Name(), + csm.node().ID(), + blocksCount, + blockHash, + ), + ) + } else { + b.WriteString( + fmt.Sprintf( + " [%s (%s): STOPPED],", + csm.server().Name(), + csm.node().ID(), + ), + ) + + } } return b.String() } @@ -657,10 +684,22 @@ func FuzzRaftChain(f *testing.F) { checkConvergenceTimeout, 1*time.Second, func() error { - referenceBlocksCount, referenceHash := rg[0].(*raftChainStateMachine).getCurrentHash() + referenceRunning, referenceBlocksCount, referenceHash := rg[0].(*raftChainStateMachine).getCurrentHash() + if !referenceRunning { + return fmt.Errorf( + "reference node not running", + ) + } for _, n := range rg { sm := n.(*raftChainStateMachine) - blocksCount, blockHash := sm.getCurrentHash() + running, blocksCount, blockHash := sm.getCurrentHash() + if !running { + return fmt.Errorf( + "node not running: %s (%s)", + sm.server().Name(), + sm.node().ID(), + ) + } // Track the highest block seen if blocksCount > highestBlocksCount { t.Logf(