Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

En 6839 dump goroutines #2009

Merged
merged 11 commits into from
Jun 25, 2020
61 changes: 56 additions & 5 deletions consensus/chronology/chronology.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
package chronology

import (
"bytes"
"context"
"fmt"
"os"
"runtime/pprof"
"sync"
"time"

Expand All @@ -11,6 +14,7 @@ import (
"github.com/ElrondNetwork/elrond-go/core"
"github.com/ElrondNetwork/elrond-go/core/check"
"github.com/ElrondNetwork/elrond-go/core/close"
"github.com/ElrondNetwork/elrond-go/data/endProcess"
"github.com/ElrondNetwork/elrond-go/display"
"github.com/ElrondNetwork/elrond-go/ntp"
"github.com/ElrondNetwork/elrond-go/statusHandler"
Expand All @@ -24,6 +28,8 @@ var log = logger.GetOrCreate("consensus/chronology")
// srBeforeStartRound defines the state which exist before the start of the round
const srBeforeStartRound = -1

const numRoundsToWaitBeforeSignalingChronologyStuck = 10

// chronology defines the data needed by the chronology
type chronology struct {
genesisTime time.Time
Expand All @@ -38,29 +44,37 @@ type chronology struct {
mutSubrounds sync.RWMutex
appStatusHandler core.AppStatusHandler
cancelFunc func()

alarmScheduler core.TimersScheduler
iulianpascalau marked this conversation as resolved.
Show resolved Hide resolved
chanStopNodeProcess chan endProcess.ArgEndProcess
}

// NewChronology creates a new chronology object
func NewChronology(
genesisTime time.Time,
rounder consensus.Rounder,
syncTimer ntp.SyncTimer,
alarmScheduler core.TimersScheduler,
chanStopNodeProcess chan endProcess.ArgEndProcess,
) (*chronology, error) {

err := checkNewChronologyParams(
rounder,
syncTimer,
alarmScheduler,
chanStopNodeProcess,
)

if err != nil {
return nil, err
}

chr := chronology{
genesisTime: genesisTime,
rounder: rounder,
syncTimer: syncTimer,
appStatusHandler: statusHandler.NewNilStatusHandler(),
genesisTime: genesisTime,
rounder: rounder,
syncTimer: syncTimer,
appStatusHandler: statusHandler.NewNilStatusHandler(),
alarmScheduler: alarmScheduler,
chanStopNodeProcess: chanStopNodeProcess,
}

chr.subroundId = srBeforeStartRound
Expand All @@ -74,6 +88,8 @@ func NewChronology(
func checkNewChronologyParams(
rounder consensus.Rounder,
syncTimer ntp.SyncTimer,
alarmScheduler core.TimersScheduler,
chanStopNodeProcess chan endProcess.ArgEndProcess,
) error {

if check.IfNil(rounder) {
Expand All @@ -82,6 +98,12 @@ func checkNewChronologyParams(
if check.IfNil(syncTimer) {
return ErrNilSyncTimer
}
if check.IfNil(alarmScheduler) {
return ErrNilAlarmScheduler
}
if chanStopNodeProcess == nil {
return ErrNilEndProcessChan
}

return nil
}
Expand Down Expand Up @@ -116,6 +138,33 @@ func (chr *chronology) RemoveAllSubrounds() {
chr.mutSubrounds.Unlock()
}

func (chr *chronology) resetChronologyStuckAlarm() {
duration := chr.rounder.TimeDuration() * numRoundsToWaitBeforeSignalingChronologyStuck
alarmID := "chronologyStuck"

chr.alarmScheduler.Cancel(alarmID)

cb := func(alarmID string) {
buffer := new(bytes.Buffer)
err := pprof.Lookup("goroutine").WriteTo(buffer, 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can also print the stack trace for each go routine, if needed by using

    buff := make([]byte, 1<<27)
    num := runtime.Stack(buff, true)
    dump := string(buff[:num])

This one looks exactly like the one we got when the code panics

if err != nil {
log.Error("could not dump goroutines")
}
log.Debug(buffer.String())
iulianpascalau marked this conversation as resolved.
Show resolved Hide resolved

arg := endProcess.ArgEndProcess{
Reason: "chronology is stuck",
Description: "startRound() was not called before the alarm expired",
}
chr.chanStopNodeProcess <- arg

time.Sleep(core.TimeToGracefullyCloseNode)
iulianpascalau marked this conversation as resolved.
Show resolved Hide resolved
os.Exit(1)
iulianpascalau marked this conversation as resolved.
Show resolved Hide resolved
}

chr.alarmScheduler.Add(cb, duration, alarmID)
}

// StartRounds actually starts the chronology and calls the DoWork() method of the subroundHandlers loaded
func (chr *chronology) StartRounds() {
var ctx context.Context
Expand Down Expand Up @@ -151,6 +200,8 @@ func (chr *chronology) startRound() {
return
}

chr.resetChronologyStuckAlarm()

msg := fmt.Sprintf("SUBROUND %s BEGINS", sr.Name())
log.Debug(display.Headline(msg, chr.syncTimer.FormattedCurrentTime(), "."))
logger.SetCorrelationSubround(sr.Name())
Expand Down
72 changes: 72 additions & 0 deletions consensus/chronology/chronology_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
"github.com/ElrondNetwork/elrond-go/consensus"
"github.com/ElrondNetwork/elrond-go/consensus/chronology"
"github.com/ElrondNetwork/elrond-go/consensus/mock"
"github.com/ElrondNetwork/elrond-go/core/alarm"
"github.com/ElrondNetwork/elrond-go/core/check"
"github.com/ElrondNetwork/elrond-go/data/endProcess"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -36,6 +38,8 @@ func TestChronology_NewChronologyNilRounderShouldFail(t *testing.T) {
genesisTime,
nil,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
iulianpascalau marked this conversation as resolved.
Show resolved Hide resolved
)

assert.Nil(t, chr)
Expand All @@ -50,12 +54,46 @@ func TestChronology_NewChronologyNilSyncerShouldFail(t *testing.T) {
genesisTime,
rounderMock,
nil,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

assert.Nil(t, chr)
assert.Equal(t, err, chronology.ErrNilSyncTimer)
}

func TestChronology_NewChronologyNilAlarmSchedulerShouldFail(t *testing.T) {
t.Parallel()
rounderMock := &mock.RounderMock{}
genesisTime := time.Now()
chr, err := chronology.NewChronology(
genesisTime,
rounderMock,
&mock.SyncTimerMock{},
nil,
make(chan endProcess.ArgEndProcess),
)

assert.Nil(t, chr)
assert.Equal(t, err, chronology.ErrNilAlarmScheduler)
}

func TestChronology_NewChronologyNilChanShouldFail(t *testing.T) {
t.Parallel()
rounderMock := &mock.RounderMock{}
genesisTime := time.Now()
chr, err := chronology.NewChronology(
genesisTime,
rounderMock,
&mock.SyncTimerMock{},
alarm.NewAlarmScheduler(),
nil,
)

assert.Nil(t, chr)
assert.Equal(t, err, chronology.ErrNilEndProcessChan)
}

func TestChronology_NewChronologyShouldWork(t *testing.T) {
t.Parallel()
rounderMock := &mock.RounderMock{}
Expand All @@ -65,6 +103,8 @@ func TestChronology_NewChronologyShouldWork(t *testing.T) {
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

assert.Nil(t, err)
Expand All @@ -80,6 +120,8 @@ func TestChronology_AddSubroundShouldWork(t *testing.T) {
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.AddSubround(initSubroundHandlerMock())
Expand All @@ -98,6 +140,8 @@ func TestChronology_RemoveAllSubroundsShouldReturnEmptySubroundHandlersArray(t *
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.AddSubround(initSubroundHandlerMock())
Expand All @@ -124,6 +168,8 @@ func TestChronology_StartRoundShouldReturnWhenRoundIndexIsNegative(t *testing.T)
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

srm := initSubroundHandlerMock()
Expand All @@ -143,6 +189,8 @@ func TestChronology_StartRoundShouldReturnWhenLoadSubroundHandlerReturnsNil(t *t
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

initSubroundHandlerMock()
Expand All @@ -161,6 +209,8 @@ func TestChronology_StartRoundShouldReturnWhenDoWorkReturnsFalse(t *testing.T) {
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

srm := initSubroundHandlerMock()
Expand All @@ -181,6 +231,8 @@ func TestChronology_StartRoundShouldWork(t *testing.T) {
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

srm := initSubroundHandlerMock()
Expand All @@ -203,6 +255,8 @@ func TestChronology_UpdateRoundShouldInitRound(t *testing.T) {
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

srm := initSubroundHandlerMock()
Expand All @@ -221,6 +275,8 @@ func TestChronology_LoadSubrounderShouldReturnNilWhenSubroundHandlerNotExists(t
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

assert.Nil(t, chr.LoadSubroundHandler(0))
Expand All @@ -235,6 +291,8 @@ func TestChronology_LoadSubrounderShouldReturnNilWhenIndexIsOutOfBound(t *testin
genesisTime,
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.AddSubround(initSubroundHandlerMock())
Expand All @@ -251,6 +309,8 @@ func TestChronology_InitRoundShouldNotSetSubroundWhenRoundIndexIsNegative(t *tes
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.AddSubround(initSubroundHandlerMock())
Expand All @@ -274,6 +334,8 @@ func TestChronology_InitRoundShouldSetSubroundWhenRoundIndexIsPositive(t *testin
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

sr := initSubroundHandlerMock()
Expand All @@ -291,6 +353,8 @@ func TestChronology_StartRoundShouldNotUpdateRoundWhenCurrentRoundIsNotFinished(
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.SetSubroundId(0)
Expand All @@ -308,6 +372,8 @@ func TestChronology_StartRoundShouldUpdateRoundWhenCurrentRoundIsFinished(t *tes
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

chr.SetSubroundId(-1)
Expand All @@ -325,6 +391,8 @@ func TestChronology_SetAppStatusHandlerWithNilValueShouldErr(t *testing.T) {
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)
err := chr.SetAppStatusHandler(nil)

Expand All @@ -340,6 +408,8 @@ func TestChronology_SetAppStatusHandlerWithOkValueShouldPass(t *testing.T) {
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

err := chr.SetAppStatusHandler(&mock.AppStatusHandlerMock{})
Expand All @@ -357,6 +427,8 @@ func TestChronology_CheckIfStatusHandlerWorks(t *testing.T) {
syncTimerMock.CurrentTime(),
rounderMock,
syncTimerMock,
alarm.NewAlarmScheduler(),
make(chan endProcess.ArgEndProcess),
)

err := chr.SetAppStatusHandler(&mock.AppStatusHandlerStub{
Expand Down
6 changes: 6 additions & 0 deletions consensus/chronology/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,9 @@ var ErrNilSyncTimer = errors.New("sync timer is nil")

// ErrNilAppStatusHandler is raised when the AppStatusHandler is nil when setting it
var ErrNilAppStatusHandler = errors.New("nil AppStatusHandler")

// ErrNilAlarmScheduler is raised when a valid alarm scheduler is expected but nil is used
var ErrNilAlarmScheduler = errors.New("nil alarm scheduler")

// ErrNilEndProcessChan is raised when a valid end process chan is expected but nil is used
var ErrNilEndProcessChan = errors.New("nil end process chan")