Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

go/consensus/roothash: Track runtime proposer liveness #5334

Merged
merged 1 commit into from Aug 3, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 10 additions & 0 deletions .changelog/5334.breaking.md
@@ -0,0 +1,10 @@
go/consensus/roothash: Track runtime proposer liveness

The roothash application now monitors the runtime proposer liveness, which
runtimes can utilize to penalize proposers with insufficient commitments.
To activate penalties for such nodes, the executor committee parameters
need to be updated by configuring the following setting:

- `MaxMissedProposalsPercent`: The maximum percentage of proposed rounds
in an epoch that can fail for a node to be considered live. Zero disables
the check, meaning that all proposed rounds can fail.
31 changes: 26 additions & 5 deletions go/consensus/cometbft/apps/roothash/liveness.go
Expand Up @@ -2,6 +2,7 @@ package roothash

import (
"fmt"
"math"

beacon "github.com/oasisprotocol/oasis-core/go/beacon/api"
"github.com/oasisprotocol/oasis-core/go/common/crypto/signature"
Expand Down Expand Up @@ -31,6 +32,7 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
if maxFailures == 0 {
maxFailures = 255
}
maxMissedProposalsPercent := uint64(rtState.Runtime.Executor.MaxMissedProposalsPercent)
slashParams := rtState.Runtime.Staking.Slashing[staking.SlashRuntimeLiveness]

ctx.Logger().Debug("evaluating node liveness",
Expand All @@ -41,14 +43,26 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
)

// Collect per node liveness statistics as a single node can have multiple roles.
goodRoundsPerNode := make(map[signature.PublicKey]uint64)
type Stats struct {
liveRounds uint64
finalizedProposals uint64
missedProposals uint64
}
statsPerNode := make(map[signature.PublicKey]*Stats)
for i, member := range rtState.ExecutorPool.Committee.Members {
goodRoundsPerNode[member.PublicKey] += rtState.LivenessStatistics.LiveRounds[i]
stats, ok := statsPerNode[member.PublicKey]
if !ok {
stats = &Stats{}
statsPerNode[member.PublicKey] = stats
}
stats.liveRounds += rtState.LivenessStatistics.LiveRounds[i]
stats.finalizedProposals += rtState.LivenessStatistics.FinalizedProposals[i]
stats.missedProposals += rtState.LivenessStatistics.MissedProposals[i]
}

// Penalize nodes that were not live enough.
regState := registryState.NewMutableState(ctx.State())
for nodeID, liveRounds := range goodRoundsPerNode {
for nodeID, stats := range statsPerNode {
status, err := regState.NodeStatus(ctx, nodeID)
if err != nil {
return fmt.Errorf("failed to retrieve status for node %s: %w", nodeID, err)
Expand All @@ -57,16 +71,23 @@ func processLivenessStatistics(ctx *tmapi.Context, epoch beacon.EpochTime, rtSta
continue
}

maxMissedProposals := ((stats.missedProposals + stats.finalizedProposals) * maxMissedProposalsPercent) / 100
if maxMissedProposalsPercent == 0 {
maxMissedProposals = math.MaxUint64
}

switch {
case liveRounds >= minLiveRounds:
case stats.liveRounds >= minLiveRounds && stats.missedProposals <= maxMissedProposals:
// Node is live.
status.RecordSuccess(rtState.Runtime.ID, epoch)
default:
// Node is faulty.
ctx.Logger().Debug("node deemed faulty",
"node_id", nodeID,
"live_rounds", liveRounds,
"live_rounds", stats.liveRounds,
"min_live_rounds", minLiveRounds,
"missed_proposals", stats.missedProposals,
"max_missed_proposals", maxMissedProposals,
)

status.RecordFailure(rtState.Runtime.ID, epoch)
Expand Down
66 changes: 64 additions & 2 deletions go/consensus/cometbft/apps/roothash/liveness_test.go
Expand Up @@ -39,6 +39,7 @@ func TestLivenessProcessing(t *testing.T) {
Executor: registry.ExecutorParameters{
MinLiveRoundsForEvaluation: 10,
MinLiveRoundsPercent: 90,
MaxMissedProposalsPercent: 0, // Disabled.
MaxLivenessFailures: 4,
},
}
Expand Down Expand Up @@ -74,8 +75,10 @@ func TestLivenessProcessing(t *testing.T) {
Round: 0,
},
LivenessStatistics: &roothash.LivenessStatistics{
TotalRounds: 100,
LiveRounds: []uint64{91},
TotalRounds: 100,
LiveRounds: []uint64{91}, // At least 90 required.
FinalizedProposals: []uint64{80},
MissedProposals: []uint64{21}, // At most 20 allowed.
},
}
err = roothashState.SetRuntimeState(ctx, rtState)
Expand Down Expand Up @@ -137,4 +140,63 @@ func TestLivenessProcessing(t *testing.T) {
require.NoError(err, "NodeStatus")
require.False(status.IsSuspended(runtime.ID, epoch), "node should not be suspended")
require.Len(status.Faults, 0, "there should be no faults")

// Start tracking proposer liveness.
rtState.Runtime.Executor.MaxMissedProposalsPercent = 20

// When node is proposing, everything should be left as is, no faults should be recorded.
rtState.LivenessStatistics.MissedProposals[0] = 20 // At most 20 allowed.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.False(status.IsSuspended(runtime.ID, epoch), "node should not be suspended")

// When node is not proposing, it should be suspended, there should be one fault.
rtState.LivenessStatistics.MissedProposals[0] = 21 // At most 20 allowed.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.True(status.IsSuspended(runtime.ID, epoch), "node should be suspended")
require.EqualValues(1, status.Faults[runtime.ID].Failures, "there should be one fault")
require.EqualValues(epoch+2, status.Faults[runtime.ID].SuspendedUntil, "suspension time should be set")

// Bump epoch so the node is no longer suspended.
epoch += 2

// When node is not proposing again, fault counter should increase.
rtState.LivenessStatistics.MissedProposals[0] = 21 // At most 20 allowed.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.True(status.IsSuspended(runtime.ID, epoch), "node should be suspended")
require.EqualValues(2, status.Faults[runtime.ID].Failures, "there should be two faults")
require.EqualValues(epoch+4, status.Faults[runtime.ID].SuspendedUntil, "suspension time should be set")

// Bump epoch so the node is no longer suspended.
epoch += 4

// When node is proposing again, fault counter should decrease.
rtState.LivenessStatistics.MissedProposals[0] = 20 // At most 20 allowed.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.True(status.IsSuspended(runtime.ID, epoch), "node should be suspended")
require.EqualValues(1, status.Faults[runtime.ID].Failures, "there should be one fault")
require.EqualValues(epoch+2, status.Faults[runtime.ID].SuspendedUntil, "suspension time should be set")

// Bump epoch so the node is no longer suspended.
epoch += 2

// When node is proposing again, fault counter should decrease.
rtState.LivenessStatistics.MissedProposals[0] = 0 // At most 20 allowed.
err = processLivenessStatistics(ctx, epoch, rtState)
require.NoError(err, "processLivenessStatistics")
status, err = registryState.NodeStatus(ctx, sk.Public())
require.NoError(err, "NodeStatus")
require.False(status.IsSuspended(runtime.ID, epoch), "node should not be suspended")
require.Len(status.Faults, 0, "there should be no faults")
}
11 changes: 11 additions & 0 deletions go/consensus/cometbft/apps/roothash/roothash.go
Expand Up @@ -540,6 +540,12 @@
round := rtState.CurrentBlock.Header.Round + 1
pool := rtState.ExecutorPool

// Remember the index of the transaction scheduler within the committee.
schedulerIdx, err := pool.Committee.TransactionSchedulerIdx(pool.Round)
if err != nil {
return err

Check warning on line 546 in go/consensus/cometbft/apps/roothash/roothash.go

View check run for this annotation

Codecov / codecov/patch

go/consensus/cometbft/apps/roothash/roothash.go#L546

Added line #L546 was not covered by tests
}

// Initialize per-epoch liveness statistics.
if rtState.LivenessStatistics == nil {
rtState.LivenessStatistics = roothash.NewLivenessStatistics(len(pool.Committee.Members))
Expand Down Expand Up @@ -573,7 +579,9 @@
"round", round,
)

// Record that the round was finalized and that the scheduler received enough commitments.
livenessStats.TotalRounds++
livenessStats.FinalizedProposals[schedulerIdx]++

ec := commit.ToDDResult().(*commitment.ExecutorCommitment)

Expand Down Expand Up @@ -757,6 +765,9 @@
default:
}

// Record that the scheduler did not receive enough commitments.
livenessStats.MissedProposals[schedulerIdx]++

// Something else went wrong, emit empty error block.
ctx.Logger().Debug("round failed",
"round", round,
Expand Down
10 changes: 10 additions & 0 deletions go/consensus/cometbft/apps/roothash/transactions.go
Expand Up @@ -95,6 +95,16 @@
return err
}

// Record that the scheduler did not propose.
schedulerIdx, err := rtState.ExecutorPool.Committee.TransactionSchedulerIdx(rpt.Round)
if err != nil {
return err

Check warning on line 101 in go/consensus/cometbft/apps/roothash/transactions.go

View check run for this annotation

Codecov / codecov/patch

go/consensus/cometbft/apps/roothash/transactions.go#L101

Added line #L101 was not covered by tests
}
if rtState.LivenessStatistics == nil {
rtState.LivenessStatistics = roothash.NewLivenessStatistics(len(rtState.ExecutorPool.Committee.Members))
}
rtState.LivenessStatistics.MissedProposals[schedulerIdx]++

// Timeout triggered by executor node, emit empty error block.
ctx.Logger().Debug("proposer round timeout",
"round", rpt.Round,
Expand Down
5 changes: 5 additions & 0 deletions go/registry/api/runtime.go
Expand Up @@ -101,6 +101,11 @@ type ExecutorParameters struct {
// penalized.
MinLiveRoundsPercent uint8 `json:"min_live_rounds_percent,omitempty"`

// MaxMissedProposalsPercent is the maximum percentage of proposed rounds in an epoch that
// can fail for a node to be considered live. Nodes not satisfying this may be penalized.
// Zero disables the check, meaning that all proposed rounds can fail.
MaxMissedProposalsPercent uint8 `json:"max_missed_proposals_percent,omitempty"`
peternose marked this conversation as resolved.
Show resolved Hide resolved

// MinLiveRoundsForEvaluation is the minimum number of live rounds in an epoch for the liveness
// calculations to be considered for evaluation.
MinLiveRoundsForEvaluation uint64 `json:"min_live_rounds_eval,omitempty"`
Expand Down
32 changes: 19 additions & 13 deletions go/registry/api/runtime_test.go
Expand Up @@ -96,8 +96,9 @@ func TestRuntimeSerialization(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down Expand Up @@ -144,7 +145,7 @@ func TestRuntimeSerialization(t *testing.T) {
RewardSlashEquvocationRuntimePercent: 0,
MinInMessageFee: quantity.Quantity{},
},
}, "r2F2GCpiaWRYIIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAZGtpbmQCZ2dlbmVzaXOiZXJvdW5kGCtqc3RhdGVfcm9vdFggseUhAZ+3vd413IH+55BlYQy937jvXCXihJg2aBkqbQ1nc3Rha2luZ6FycmV3YXJkX2JhZF9yZXN1bHRzCmdzdG9yYWdlo3NjaGVja3BvaW50X2ludGVydmFsGCFzY2hlY2twb2ludF9udW1fa2VwdAZ1Y2hlY2twb2ludF9jaHVua19zaXplGGVoZXhlY3V0b3Koamdyb3VwX3NpemUJbG1heF9tZXNzYWdlcwVtcm91bmRfdGltZW91dAZxZ3JvdXBfYmFja3VwX3NpemUIcmFsbG93ZWRfc3RyYWdnbGVycwdybWF4X2xpdmVuZXNzX2ZhaWxzAnRtaW5fbGl2ZV9yb3VuZHNfZXZhbAN3bWluX2xpdmVfcm91bmRzX3BlcmNlbnQEaWVudGl0eV9pZFggEjRWeJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABrY29uc3RyYWludHOhAaEBo2ltYXhfbm9kZXOhZWxpbWl0Cm1taW5fcG9vbF9zaXploWVsaW1pdAVtdmFsaWRhdG9yX3NldKBrZGVwbG95bWVudHOBpGN0ZWVLdmVyc2lvbiB0ZWVndmVyc2lvbqJlbWFqb3IYLGVwYXRjaAFqdmFsaWRfZnJvbQBvYnVuZGxlX2NoZWNrc3VtWCABAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAWtrZXlfbWFuYWdlclgggAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAFsdGVlX2hhcmR3YXJlAW10eG5fc2NoZWR1bGVypW5tYXhfYmF0Y2hfc2l6ZRknEG9tYXhfaW5fbWVzc2FnZXMYIHNiYXRjaF9mbHVzaF90aW1lb3V0GjuaygB0bWF4X2JhdGNoX3NpemVfYnl0ZXMaAJiWgHVwcm9wb3NlX2JhdGNoX3RpbWVvdXQBcGFkbWlzc2lvbl9wb2xpY3mhcGVudGl0eV93aGl0ZWxpc3ShaGVudGl0aWVzoVggEjRWeJAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAChaW1heF9ub2Rlc6IBAwQBcGdvdmVybmFuY2VfbW9kZWwD"},
}, "r2F2GCpiaWRYIIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAZGtpbmQCZ2dlbmVzaXOiZXJvdW5kGCtqc3RhdGVfcm9vdFggseUhAZ+3vd413IH+55BlYQy937jvXCXihJg2aBkqbQ1nc3Rha2luZ6FycmV3YXJkX2JhZF9yZXN1bHRzCmdzdG9yYWdlo3NjaGVja3BvaW50X2ludGVydmFsGCFzY2hlY2twb2ludF9udW1fa2VwdAZ1Y2hlY2twb2ludF9jaHVua19zaXplGGVoZXhlY3V0b3Kpamdyb3VwX3NpemUJbG1heF9tZXNzYWdlcwVtcm91bmRfdGltZW91dAZxZ3JvdXBfYmFja3VwX3NpemUIcmFsbG93ZWRfc3RyYWdnbGVycwdybWF4X2xpdmVuZXNzX2ZhaWxzAXRtaW5fbGl2ZV9yb3VuZHNfZXZhbAJ3bWluX2xpdmVfcm91bmRzX3BlcmNlbnQEeBxtYXhfbWlzc2VkX3Byb3Bvc2Fsc19wZXJjZW50A2llbnRpdHlfaWRYIBI0VniQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAa2NvbnN0cmFpbnRzoQGhAaNpbWF4X25vZGVzoWVsaW1pdAptbWluX3Bvb2xfc2l6ZaFlbGltaXQFbXZhbGlkYXRvcl9zZXSga2RlcGxveW1lbnRzgaRjdGVlS3ZlcnNpb24gdGVlZ3ZlcnNpb26iZW1ham9yGCxlcGF0Y2gBanZhbGlkX2Zyb20Ab2J1bmRsZV9jaGVja3N1bVggAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQEBAQFra2V5X21hbmFnZXJYIIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABbHRlZV9oYXJkd2FyZQFtdHhuX3NjaGVkdWxlcqVubWF4X2JhdGNoX3NpemUZJxBvbWF4X2luX21lc3NhZ2VzGCBzYmF0Y2hfZmx1c2hfdGltZW91dBo7msoAdG1heF9iYXRjaF9zaXplX2J5dGVzGgCYloB1cHJvcG9zZV9iYXRjaF90aW1lb3V0AXBhZG1pc3Npb25fcG9saWN5oXBlbnRpdHlfd2hpdGVsaXN0oWhlbnRpdGllc6FYIBI0VniQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAoWltYXhfbm9kZXOiAQMEAXBnb3Zlcm5hbmNlX21vZGVsAw=="},
} {
enc := cbor.Marshal(tc.rr)
require.Equal(tc.expectedBase64, base64.StdEncoding.EncodeToString(enc), "serialization should match")
Expand Down Expand Up @@ -200,8 +201,9 @@ func TestVerifyRuntime(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down Expand Up @@ -282,8 +284,9 @@ func TestVerifyRuntime(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down Expand Up @@ -380,8 +383,9 @@ func TestVerifyRuntime(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down Expand Up @@ -479,8 +483,9 @@ func TestVerifyRuntime(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down Expand Up @@ -581,8 +586,9 @@ func TestVerifyRuntime(t *testing.T) {
RoundTimeout: 6,
MaxMessages: 5,
MinLiveRoundsPercent: 4,
MinLiveRoundsForEvaluation: 3,
MaxLivenessFailures: 2,
MaxMissedProposalsPercent: 3,
MinLiveRoundsForEvaluation: 2,
MaxLivenessFailures: 1,
},
TxnScheduler: TxnSchedulerParameters{
BatchFlushTimeout: 1 * time.Second,
Expand Down
20 changes: 18 additions & 2 deletions go/roothash/api/liveness.go
Expand Up @@ -9,12 +9,28 @@ type LivenessStatistics struct {
// LiveRounds is a list of counters, specified in committee order (e.g. counter at index i has
// the value for node i in the committee).
LiveRounds []uint64 `json:"good_rounds"`

// FinalizedProposals is a list that records the number of finalized rounds when a node
// acted as a proposer.
//
// The list is ordered according to the committee arrangement (i.e., the counter at index i
// holds the value for the node at index i in the committee).
FinalizedProposals []uint64 `json:"finalized_proposals"`

// MissedProposals is a list that records the number of failed rounds when a node
// acted as a proposer.
//
// The list is ordered according to the committee arrangement (i.e., the counter at index i
// holds the value for the node at index i in the committee).
MissedProposals []uint64 `json:"missed_proposals"`
}

// NewLivenessStatistics creates a new instance of per-epoch liveness statistics.
func NewLivenessStatistics(numNodes int) *LivenessStatistics {
return &LivenessStatistics{
TotalRounds: 0,
LiveRounds: make([]uint64, numNodes),
TotalRounds: 0,
LiveRounds: make([]uint64, numNodes),
FinalizedProposals: make([]uint64, numNodes),
MissedProposals: make([]uint64, numNodes),
}
}