Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

go/roothash: Optimize and refactor commitment pool processing #5274

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
12 changes: 12 additions & 0 deletions .changelog/5274.breaking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
go/roothash: Optimize and refactor commitment pool processing

The commitment pool processing has been optimized and refactored to improve
code readability:

- The discrepancy detection has been modified to immediately switch to
the resolution mode when two commits differ, eliminating the necessity
to wait for the proposer's commitment.

- The discrepancy resolution process was redesigned to fail as soon
as it becomes evident that no group of votes can attain the majority,
such as when there are too many failures.
247 changes: 88 additions & 159 deletions go/roothash/api/commitment/pool.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,9 @@ var (
// Error code 14 is reserved for future use.
ErrTimeoutNotCorrectRound = errors.New(moduleName, 15, "roothash/commitment: timeout not for correct round")
ErrNodeIsScheduler = errors.New(moduleName, 16, "roothash/commitment: node is scheduler")
ErrMajorityFailure = errors.New(moduleName, 17, "roothash/commitment: majority commitments indicated failure")
ErrInvalidRound = errors.New(moduleName, 18, "roothash/commitment: invalid round")
ErrNoProposerCommitment = errors.New(moduleName, 19, "roothash/commitment: no proposer commitment")
ErrBadProposerCommitment = errors.New(moduleName, 20, "roothash/commitment: bad proposer commitment")
ErrInvalidRound = errors.New(moduleName, 17, "roothash/commitment: invalid round")
ErrNoProposerCommitment = errors.New(moduleName, 18, "roothash/commitment: no proposer commitment")
ErrBadProposerCommitment = errors.New(moduleName, 19, "roothash/commitment: bad proposer commitment")
)

const (
Expand Down Expand Up @@ -158,25 +157,6 @@ func (p *Pool) ResetCommitments(round uint64) {
p.NextTimeout = TimeoutNever
}

func (p *Pool) getCommitment(id signature.PublicKey) (OpenCommitment, bool) {
if p.Committee == nil {
panic("roothash/commitment: query commitments: " + ErrNoCommittee.Error())
}

var (
com OpenCommitment
ok bool
)

switch p.Committee.Kind {
case scheduler.KindComputeExecutor:
com, ok = p.ExecuteCommitments[id]
default:
panic("roothash/commitment: unknown committee kind: " + p.Committee.Kind.String())
}
return com, ok
}

func (p *Pool) addVerifiedExecutorCommitment( // nolint: gocyclo
ctx context.Context,
blk *block.Block,
Expand Down Expand Up @@ -360,164 +340,133 @@ func (p *Pool) AddExecutorCommitment(
// ProcessCommitments performs a single round of commitment checks. If there are enough commitments
// in the pool, it performs discrepancy detection or resolution.
func (p *Pool) ProcessCommitments(didTimeout bool) (OpenCommitment, error) {
if p.Committee == nil {
switch {
case p.Committee == nil:
return nil, ErrNoCommittee
case p.Committee.Kind != scheduler.KindComputeExecutor:
panic("roothash/commitment: unknown committee kind: " + p.Committee.Kind.String())
}

// Determine whether the proposer has submitted a commitment.
proposerCommit, err := p.getProposerCommitment()
switch err {
case nil:
case ErrNoProposerCommitment:
// Proposer has not submitted a commitment yet.
default:
return nil, err
}

type voteEnt struct {
type vote struct {
commit OpenCommitment
tally int
}

var (
commits, required int
failuresTally int
)
votes := make(map[hash.Hash]*voteEnt)
var total, commits, failures int

// Gather votes.
votes := make(map[hash.Hash]*vote)
for _, n := range p.Committee.Members {
var check bool
if !p.Discrepancy {
check = n.Role == scheduler.RoleWorker
} else {
check = n.Role == scheduler.RoleBackupWorker
}
if !check {
switch {
case !p.Discrepancy && n.Role != scheduler.RoleWorker:
continue
case p.Discrepancy && n.Role != scheduler.RoleBackupWorker:
continue
}

required++
commit, ok := p.getCommitment(n.PublicKey)
total++
commit, ok := p.ExecuteCommitments[n.PublicKey]
if !ok {
continue
}
commits++

if commit.IsIndicatingFailure() {
failuresTally++
failures++
continue
}

// Quick check whether the result is already determined.
//
// i) In case of discrepancy detection, as soon as there is a discrepancy we can proceed
// with discrepancy resolution. No need to wait for all commits.
//
// ii) In case of discrepancy resolution, as soon as majority agrees on a result, we can
// proceed with resolving the round.
switch p.Discrepancy {
case false:
// Discrepancy detection.
if proposerCommit != nil && !proposerCommit.MostlyEqual(commit) {
p.Discrepancy = true
return nil, ErrDiscrepancyDetected
}
case true:
// Discrepancy resolution.
k := commit.ToVote()
if ent, ok := votes[k]; !ok {
votes[k] = &voteEnt{
commit: commit,
tally: 1,
}
} else {
ent.tally++
k := commit.ToVote()
if v, ok := votes[k]; !ok {
votes[k] = &vote{
commit: commit,
tally: 1,
}
}
}

allowedStragglers := int(p.Runtime.Executor.AllowedStragglers)

switch p.Discrepancy {
case true:
// Discrepancy resolution.
minVotes := (required / 2) + 1
if failuresTally >= minVotes {
// Majority indicates failure, round will fail regardless of additional commits.
logger.Warn("discrepancy resolution majority failed",
logging.LogEvent, LogEventDiscrepancyMajorityFailure,
)
return nil, ErrMajorityFailure
} else {
v.tally++
}

// If we already have the proposer commitment, we can try to resolve early based on majority
// vote. Otherwise we need to wait for the proposer commitment.
if proposerCommit == nil {
break
// As soon as there is a discrepancy we can proceed with discrepancy resolution.
// No need to wait for all commits.
if !p.Discrepancy && len(votes) > 1 {
p.Discrepancy = true
return nil, ErrDiscrepancyDetected
}
}

for _, ent := range votes {
if ent.tally >= minVotes {
// Majority agrees on a commit, result is determined regardless of additional
// commits.
//
// Make sure that the majority commitment is the same as the proposer commitment. We
// must return the proposer commitment as that one contains additional data.
if !proposerCommit.MostlyEqual(ent.commit) {
return nil, ErrBadProposerCommitment
}

return proposerCommit, nil
}
}
// Determine whether the proposer has submitted a commitment.
proposer, err := GetTransactionScheduler(p.Committee, p.Round)
if err != nil {
return nil, ErrNoCommittee
}
proposerCommit, ok := p.ExecuteCommitments[proposer.PublicKey]
if !ok && didTimeout {
// TODO: Consider slashing for this offense.
return nil, ErrNoProposerCommitment
}

// No majority commitment so far.
switch p.Discrepancy {
case false:
// Discrepancy detection.
allowedStragglers := int(p.Runtime.Executor.AllowedStragglers)

// If it is already known that the number of valid commitments will not exceed the required
// threshold, there is no need to wait for the timer to expire. Instead, proceed directly to
// the discrepancy resolution mode, regardless of any additional commits.
if failuresTally > allowedStragglers {
if failures > allowedStragglers {
p.Discrepancy = true
return nil, ErrDiscrepancyDetected
}
}

// While a timer is running, all nodes are required to answer.
//
// After the timeout has elapsed, a limited number of stragglers are allowed.
if didTimeout {
if p.Discrepancy {
// This was a forced finalization call due to timeout,
// and the round was in the discrepancy state. Give up.
return nil, ErrInsufficientVotes
}
// While a timer is running, all nodes are required to answer.
required := total

switch p.Committee.Kind {
case scheduler.KindComputeExecutor:
// After the timeout has elapsed, a limited number of stragglers are allowed.
if didTimeout {
required -= allowedStragglers
commits -= failuresTally // Since failures count as stragglers.
default:
// Would panic above.
commits -= failures // Since failures count as stragglers.
}

if proposerCommit == nil {
// If we timed out but the proposer did not submit a commitment, fail the round.
// TODO: Consider slashing for this offense.
return nil, ErrNoProposerCommitment
// Check if the majority has been reached.
if commits < required || proposerCommit == nil {
return nil, ErrStillWaiting
}
}

if commits < required || proposerCommit == nil {
return nil, ErrStillWaiting
}
case true:
// Discrepancy resolution.
required := total/2 + 1

switch p.Discrepancy {
case false:
// No discrepancy.
return proposerCommit, nil
default:
// No majority commitment.
return nil, ErrInsufficientVotes
// Find the commit with the highest number of votes.
topVote := &vote{}
for _, v := range votes {
if v.tally > topVote.tally {
topVote = v
}
}

// Fail the round if the majority cannot be reached due to insufficient votes remaining
// (e.g. too many nodes have failed),
remaining := total - commits
if topVote.tally+remaining < required {
return nil, ErrInsufficientVotes
}

// Check if the majority has been reached.
if topVote.tally < required || proposerCommit == nil {
if didTimeout {
return nil, ErrInsufficientVotes
}
return nil, ErrStillWaiting
}

// Make sure that the majority commitment is the same as the proposer commitment.
if !proposerCommit.MostlyEqual(topVote.commit) {
return nil, ErrBadProposerCommitment
}
}

// We must return the proposer commitment as that one contains additional data.
return proposerCommit, nil
}

// CheckProposerTimeout verifies executor timeout request conditions.
Expand Down Expand Up @@ -560,26 +509,6 @@ func (p *Pool) CheckProposerTimeout(
return nil
}

func (p *Pool) getProposerCommitment() (OpenCommitment, error) {
var proposerCommit OpenCommitment
switch p.Committee.Kind {
case scheduler.KindComputeExecutor:
proposer, err := GetTransactionScheduler(p.Committee, p.Round)
if err != nil {
return nil, ErrNoCommittee
}

var ok bool
if proposerCommit, ok = p.ExecuteCommitments[proposer.PublicKey]; !ok {
// No proposer commitment, we cannot proceed.
return nil, ErrNoProposerCommitment
}
default:
panic("roothash/commitment: unknown committee kind while checking commitments: " + p.Committee.Kind.String())
}
return proposerCommit, nil
}

// TryFinalize attempts to finalize the commitments by performing discrepancy
// detection and discrepancy resolution, based on the state of the pool. It may
// request the caller to schedule timeouts by setting NextTimeout appropriately.
Expand Down