Skip to content

Commit

Permalink
MGMT-15878: Ensure that hosts emit event showing why preparation failed.
Browse files Browse the repository at this point in the history
When preparation fails for a host, we do not implement any kind of timeout for the host to indicate that this has occurred.

This means that it is sometimes not possible for the user to determine the cause of a cluster timeout (which will inevitably be caused by the timeout of a host during preparation).

Presently, there are two ways in which a host may time out (no result received within the cluster timeout):

    An inconclusive result from the pulling of cluster images
    An inconclusive result from the disk speed check

This PR introduces a timeout to detect these scenarios and report on them in a host timeout event so that the user may have a clue as to what has happened.

This PR is in addition to MGMT-15814, which introduces a cluster condition to track a cluster timeout when there is a failure to configure the preparation of a cluster within a given time frame (for example, if the assisted pod crashes).

Together, these PRs should improve the overall quality of error reporting.
  • Loading branch information
paul-maidment committed Nov 6, 2023
1 parent 73b1599 commit 92f3e38
Show file tree
Hide file tree
Showing 10 changed files with 238 additions and 59 deletions.
8 changes: 8 additions & 0 deletions cmd/graphstatemachine/main.go
Expand Up @@ -35,6 +35,10 @@ func hostStateMachine() stateswitch.StateMachine {
func(_ stateswitch.StateSwitch, _ stateswitch.TransitionArgs) error { return nil },
).AnyTimes()

mockTransitionHandler.EXPECT().PostHostPreparationTimeout().Return(
func(_ stateswitch.StateSwitch, _ stateswitch.TransitionArgs) error { return nil },
).AnyTimes()

mockTransitionHandler.EXPECT().PostRefreshLogsProgress(gomock.Any()).Return(
func(_ stateswitch.StateSwitch, _ stateswitch.TransitionArgs) error { return nil },
).AnyTimes()
Expand Down Expand Up @@ -68,5 +72,9 @@ func poolHostStateMachine() stateswitch.StateMachine {
func(_ stateswitch.StateSwitch, _ stateswitch.TransitionArgs) error { return nil },
).AnyTimes()

mockTransitionHandler.EXPECT().PostHostPreparationTimeout().Return(
func(_ stateswitch.StateSwitch, _ stateswitch.TransitionArgs) error { return nil },
).AnyTimes()

return host.NewPoolHostStateMachine(stateswitch.NewStateMachine(), mockTransitionHandler)
}
2 changes: 2 additions & 0 deletions cmd/main.go
Expand Up @@ -313,6 +313,8 @@ func main() {
// Make sure that prepare for installation timeout is more than the timeouts of all underlying tools + 2m extra
Options.ClusterConfig.PrepareConfig.PrepareForInstallationTimeout = maxDuration(Options.ClusterConfig.PrepareConfig.PrepareForInstallationTimeout,
maxDuration(Options.InstructionConfig.DiskCheckTimeout, Options.InstructionConfig.ImageAvailabilityTimeout)+2*time.Minute)
Options.HostConfig.PrepareConfig.PrepareForInstallationTimeout = maxDuration(Options.HostConfig.PrepareConfig.PrepareForInstallationTimeout,
maxDuration(Options.InstructionConfig.DiskCheckTimeout, Options.InstructionConfig.ImageAvailabilityTimeout)+1*time.Minute)
var lead leader.ElectorInterface
var k8sClient *kubernetes.Clientset
var autoMigrationLeader leader.ElectorInterface
Expand Down
2 changes: 1 addition & 1 deletion internal/cluster/statemachine.go
Expand Up @@ -207,7 +207,7 @@ func NewClusterStateMachine(th TransitionHandler) stateswitch.StateMachine {
sm.AddTransitionRule(stateswitch.TransitionRule{
TransitionType: TransitionTypeRefreshStatus,
SourceStates: []stateswitch.State{stateswitch.State(models.ClusterStatusPreparingForInstallation)},
Condition: th.IsPreparingTimedOut,
Condition: stateswitch.And(th.IsPreparingTimedOut, stateswitch.Not(If(FailedPreparingtHostsExist))),
DestinationState: stateswitch.State(models.ClusterStatusReady),
PostTransition: th.PostPreparingTimedOut,
Documentation: stateswitch.TransitionRuleDoc{
Expand Down
16 changes: 10 additions & 6 deletions internal/host/common.go
Expand Up @@ -17,12 +17,16 @@ import (
)

const (
statusInfoMediaDisconnected = "Unable to read from the discovery media. It was either disconnected or poor network conditions prevented it from being read. Try using the minimal ISO option and be sure to keep the media connected until the installation is completed"
statusInfoDisconnected = "Host has stopped communicating with the installation service"
statusInfoDiscovering = "Waiting for host to send hardware details"
statusInfoInsufficientHardware = "Host does not meet the minimum hardware requirements: $FAILING_VALIDATIONS"
statusInfoPendingForInput = "Waiting for user input: $FAILING_VALIDATIONS"
statusInfoNotReadyForInstall = "Host cannot be installed due to following failing validation(s): $FAILING_VALIDATIONS"
statusInfoMediaDisconnected = "Unable to read from the discovery media. It was either disconnected or poor network conditions prevented it from being read. Try using the minimal ISO option and be sure to keep the media connected until the installation is completed"
statusInfoDisconnected = "Host has stopped communicating with the installation service"
statusInfoDiscovering = "Waiting for host to send hardware details"
statusInfoInsufficientHardware = "Host does not meet the minimum hardware requirements: $FAILING_VALIDATIONS"
statusInfoPendingForInput = "Waiting for user input: $FAILING_VALIDATIONS"
statusInfoNotReadyForInstall = "Host cannot be installed due to following failing validation(s): $FAILING_VALIDATIONS"
statusInfoPreparationTimeout = "The host has encountered a preparation timeout, the following conditions failed: $FAILING_CONDITIONS"
statusInfoPreparationTimeoutDiskSpeed = "the installation disk speed check did not complete within the timeout."
statusInfoPreparationTimeoutImageAvailability = "container availability was not determined within the timeout."

statusInfoKnown = "Host is ready to be installed"
statusInfoInstalling = "Installation is in progress"
statusInfoResettingPendingUserAction = "Host requires booting into the discovery image to complete resetting the installation"
Expand Down
5 changes: 5 additions & 0 deletions internal/host/config.go
Expand Up @@ -10,7 +10,12 @@ import (
"github.com/pkg/errors"
)

// PrepareConfig groups the host-level settings for the
// prepare-for-installation phase.
type PrepareConfig struct {
// PrepareForInstallationTimeout is loaded from the
// PREPARE_FOR_INSTALLATION_HOST_TIMEOUT environment variable and defaults
// to 8 minutes.
// NOTE(review): presumably the longest a host may stay in
// preparing-for-installation before it is treated as timed out — confirm
// against the code that reads this value.
PrepareForInstallationTimeout time.Duration `envconfig:"PREPARE_FOR_INSTALLATION_HOST_TIMEOUT" default:"8m"`
}

type Config struct {
PrepareConfig PrepareConfig
LogTimeoutConfig
EnableAutoAssign bool `envconfig:"ENABLE_AUTO_ASSIGN" default:"true"`
ResetTimeout time.Duration `envconfig:"RESET_CLUSTER_TIMEOUT" default:"3m"`
Expand Down
29 changes: 29 additions & 0 deletions internal/host/mock_transition.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion internal/host/monitor.go
Expand Up @@ -239,7 +239,6 @@ func (m *Manager) HostMonitoring() {
m.log.Debugf("Not a leader, exiting HostMonitoring")
return
}
m.log.Debugf("Running HostMonitoring")
defer commonutils.MeasureOperation("HostMonitoring", m.log, m.metricApi)()
m.initMonitoringQueryGenerator()
monitored += m.clusterHostMonitoring()
Expand Down
14 changes: 14 additions & 0 deletions internal/host/statemachine.go
Expand Up @@ -549,6 +549,20 @@ func NewHostStateMachine(sm stateswitch.StateMachine, th TransitionHandler) stat
},
})

// Host preparation timeout: on refresh, a host in preparing-for-installation
// that is connected (agent and media), has exceeded the host preparation
// timeout, and still has the disk speed check or the image availability
// result unknown is moved to preparing-failed. The post-transition records
// which of those checks did not complete.
// NOTE(review): the Name/Description strings below were corrected to match
// DestinationState; if state machine documentation is generated from them,
// regenerate it.
sm.AddTransitionRule(stateswitch.TransitionRule{
	TransitionType: TransitionTypeRefresh,
	SourceStates: []stateswitch.State{
		stateswitch.State(models.HostStatusPreparingForInstallation),
	},
	Condition:        stateswitch.And(If(IsConnected), If(IsMediaConnected), th.IsPreparingTimedOut, stateswitch.Or(installationDiskSpeedUnknown, imagesAvailabilityUnknown), allConditionsSuccessfulOrUnknown),
	DestinationState: stateswitch.State(models.HostStatusPreparingFailed),
	PostTransition:   th.PostHostPreparationTimeout(),
	Documentation: stateswitch.TransitionRuleDoc{
		Name:        "Preparing timed out host move to preparing failed",
		Description: "A connected host whose installation disk speed check or container image availability is still unknown when the host preparation timeout expires moves to preparing-failed, and its status info lists the checks that did not complete",
	},
})

sm.AddTransitionRule(stateswitch.TransitionRule{
TransitionType: TransitionTypeRefresh,
SourceStates: []stateswitch.State{
Expand Down
45 changes: 45 additions & 0 deletions internal/host/transition.go
Expand Up @@ -60,6 +60,8 @@ type TransitionHandler interface {
PostRegisterHost(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error
PostResettingPendingUserAction(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error
PostUnbindHost(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error
IsPreparingTimedOut(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error)
PostHostPreparationTimeout() stateswitch.PostTransition
}

var resetLogsField = []interface{}{"logs_info", "", "logs_started_at", strfmt.DateTime(time.Time{}), "logs_collected_at", strfmt.DateTime(time.Time{})}
Expand Down Expand Up @@ -665,6 +667,37 @@ func (th *transitionHandler) HasInstallationInProgressTimedOut(sw stateswitch.St
return time.Since(time.Time(sHost.host.Progress.StageUpdatedAt)) > maxDuration, nil
}

// PostHostPreparationTimeout returns the post-transition used when a host's
// preparation for installation has timed out.
//
// The returned function builds a status info message from
// statusInfoPreparationTimeout, replacing $FAILING_CONDITIONS with one line per
// preparation check that is not recorded as true in the refresh conditions
// (installation disk speed check, container image availability). It then calls
// hostutil.UpdateHostStatus with that message, unless the host's status
// already equals the source state and its status info already equals the
// message.
//
// The returned function fails if sw is not a *stateHost, if args is not a
// *TransitionArgsRefreshHost, or if hostutil.UpdateHostStatus returns an error.
func (th *transitionHandler) PostHostPreparationTimeout() stateswitch.PostTransition {
	return func(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error {
		sHost, ok := sw.(*stateHost)
		if !ok {
			return errors.New("PostHostPreparationTimeout incompatible type of StateSwitch")
		}
		params, ok := args.(*TransitionArgsRefreshHost)
		if !ok {
			// Name this function in the message so the failure can be traced
			// to the right post-transition.
			return errors.New("PostHostPreparationTimeout invalid argument")
		}

		// A condition that is absent from the map reads as false and is
		// therefore reported as not completed.
		var failingConditions []string
		if !params.conditions["installation-disk-speed-check-successful"] {
			failingConditions = append(failingConditions, statusInfoPreparationTimeoutDiskSpeed)
		}
		if !params.conditions["successful-container-image-availability"] {
			failingConditions = append(failingConditions, statusInfoPreparationTimeoutImageAvailability)
		}
		statusInfo := strings.Replace(statusInfoPreparationTimeout, "$FAILING_CONDITIONS", strings.Join(failingConditions, "\n"), 1)

		// Nothing to write when neither the status nor the status info changes.
		if sHost.srcState == swag.StringValue(sHost.host.Status) && swag.StringValue(sHost.host.StatusInfo) == statusInfo {
			return nil
		}

		_, err := hostutil.UpdateHostStatus(params.ctx, logutil.FromContext(params.ctx, th.log), params.db,
			th.eventsHandler, sHost.host.InfraEnvID, *sHost.host.ID,
			sHost.srcState, swag.StringValue(sHost.host.Status), statusInfo)
		return err
	}
}

// Return a post transition function with a constant reason
func (th *transitionHandler) PostRefreshHost(reason string) stateswitch.PostTransition {
ret := func(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) error {
Expand Down Expand Up @@ -791,3 +824,15 @@ func (th *transitionHandler) PostRefreshHostRefreshStageUpdateTime(
sHost.srcState)
return err
}

// IsPreparingTimedOut reports whether the time elapsed since the host's
// StatusUpdatedAt exceeds the configured host PrepareForInstallationTimeout.
// It returns an error when sw is not a *stateHost.
func (th *transitionHandler) IsPreparingTimedOut(sw stateswitch.StateSwitch, args stateswitch.TransitionArgs) (bool, error) {
	sHost, isStateHost := sw.(*stateHost)
	if !isStateHost {
		return false, errors.New("IsPreparingTimedOut incompatible type of StateSwitch")
	}
	elapsed := time.Since(time.Time(sHost.host.StatusUpdatedAt))
	limit := th.config.PrepareConfig.PrepareForInstallationTimeout
	return elapsed > limit, nil
}

0 comments on commit 92f3e38

Please sign in to comment.