Skip to content

Commit

Permalink
outofband: rework status poller method
Browse files Browse the repository at this point in the history
- Purges Flasher specific FirmwareInstall states (these were pointless indirection)
- Rename the method since it's a bit more generic - it could be polling uploaded firmware statuses or install firmware statuses.
- Handle the case where the BMC resets itself after a firmware update
  • Loading branch information
joelrebel committed Nov 20, 2023
1 parent a5a8aa7 commit 1ac12d1
Show file tree
Hide file tree
Showing 3 changed files with 109 additions and 83 deletions.
29 changes: 0 additions & 29 deletions internal/model/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,35 +39,6 @@ func (c Components) BySlugModel(cSlug string, cModels []string) *Component {
return nil
}

// ComponentFirmwareInstallStatus is the device component specific firmware install statuses
// returned by the FirmwareInstallStatus method, which is part of the DeviceQueryor interface.
//
// As an example, the BMCs return various firmware install statuses based on the vendor implementation
// and so these statuses defined reduce all of those differences into a few generic status values
//
// Note: these statuses are not related to the Flasher task status.
type ComponentFirmwareInstallStatus string

var (
// StatusInstallRunning is returned by the FirmwareInstallStatus when the device indicates the install is running.
StatusInstallRunning ComponentFirmwareInstallStatus = "running"

// StatusInstallComplete is returned by the FirmwareInstallStatus when the device indicates the install is complete.
StatusInstallComplete ComponentFirmwareInstallStatus = "complete"

// StatusInstallUnknown is returned by the FirmwareInstallStatus when the firmware install status is not known.
StatusInstallUnknown ComponentFirmwareInstallStatus = "unknown"

// StatusInstallFailed is returned by the FirmwareInstallStatus when the device indicates the install has failed.
StatusInstallFailed ComponentFirmwareInstallStatus = "failed"

// StatusInstallPowerCycleHostRequired is returned by the FirmwareInstallStatus when the device indicates the install requires a host power cycle.
StatusInstallPowerCycleHostRequired ComponentFirmwareInstallStatus = "powerCycleHostRequired"

// StatusInstallPowerCycleBMCRequired is returned by the FirmwareInstallStatus when the device indicates the BMC requires a power cycle.
StatusInstallPowerCycleBMCRequired ComponentFirmwareInstallStatus = "powerCycleBMCRequired"
)

// ComponentConverter provides methods to convert a common.Device to its Component equivalents.
type ComponentConverter struct {
deviceVendor string
Expand Down
137 changes: 106 additions & 31 deletions internal/outofband/action_handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ func (h *actionHandler) initiateInstallFirmware(a sw.StateSwitch, c sw.Transitio
// polls firmware install status from the BMC
//
// nolint:gocyclo // for now this is best kept in the same method
func (h *actionHandler) pollFirmwareInstallStatus(a sw.StateSwitch, c sw.TransitionArgs) error {
func (h *actionHandler) pollFirmwareTaskStatus(a sw.StateSwitch, c sw.TransitionArgs) error {
action, tctx, err := actionTaskCtxFromInterfaces(a, c)
if err != nil {
return err
Expand All @@ -387,12 +387,21 @@ func (h *actionHandler) pollFirmwareInstallStatus(a sw.StateSwitch, c sw.Transit

var attemptErrors *multierror.Error

// inventory is set when the loop below determines that
// a new collection should be attempted.
var inventory bool

// componentIsBMC reports whether the given component slug names the BMC.
//
// strings.EqualFold already compares case-insensitively, so the slug is
// passed through as-is instead of being upper-cased first. The parameter is
// named slug to avoid shadowing the enclosing function's c argument.
componentIsBMC := func(slug string) bool {
	return strings.EqualFold(slug, common.SlugBMC)
}

tctx.Logger.WithFields(
logrus.Fields{
"component": action.Firmware.Component,
"version": action.Firmware.Version,
"bmc": tctx.Asset.BmcAddress,
}).Info("polling BMC for firmware install status")
}).Info("polling BMC for firmware task status")

for {
// increment attempts
Expand All @@ -409,20 +418,62 @@ func (h *actionHandler) pollFirmwareInstallStatus(a sw.StateSwitch, c sw.Transit
if attempts >= maxPollStatusAttempts {
attemptErrors = multierror.Append(attemptErrors, errors.Wrapf(
ErrMaxBMCQueryAttempts,
"%d attempts querying FirmwareInstallStatus(), elapsed: %s",
"%d attempts querying FirmwareTaskStatus(), elapsed: %s",
attempts,
time.Since(startTS).String(),
))

return attemptErrors
}

// TODO: break into its own method
if inventory {
err := h.installedEqualsExpected(
tctx,
action.Firmware.Component,
action.Firmware.Version,
action.Firmware.Vendor,
action.Firmware.Models,
)
switch err {
case nil:
tctx.Logger.WithFields(
logrus.Fields{
"bmc": tctx.Asset.BmcAddress,
"component": action.Firmware.Component,
}).Debug("Installed firmware matches expected.")

return nil

case ErrInstalledFirmwareNotEqual:
// if the BMC came online and is still running the previous version
// the install failed
if componentIsBMC(action.Firmware.Component) {
errInstall := errors.New("BMC failed to install expected firmware")
return errInstall
}

default:
// includes errors - ErrInstalledVersionUnknown, ErrComponentNotFound
attemptErrors = multierror.Append(attemptErrors, err)
tctx.Logger.WithFields(
logrus.Fields{
"bmc": tctx.Asset.BmcAddress,
"component": action.Firmware.Component,
"err": err.Error(),
}).Debug("Inventory collection for component returned error")
}

continue
}

// query firmware install status
status, err := tctx.DeviceQueryor.FirmwareInstallStatus(
state, status, err := tctx.DeviceQueryor.FirmwareTaskStatus(
tctx.Ctx,
action.Firmware.Version,
bconsts.FirmwareInstallStep(action.FirmwareInstallStep),
action.Firmware.Component,
action.BMCTaskID,
action.Firmware.Version,
)

tctx.Logger.WithFields(
Expand All @@ -433,66 +484,88 @@ func (h *actionHandler) pollFirmwareInstallStatus(a sw.StateSwitch, c sw.Transit
"bmc": tctx.Asset.BmcAddress,
"elapsed": time.Since(startTS).String(),
"attempts": fmt.Sprintf("attempt %d/%d", attempts, maxPollStatusAttempts),
"taskState": status,
}).Debug("firmware install status query attempt")
"taskState": state,
"bmcTaskID": action.BMCTaskID,
"status": status,
}).Debug("firmware task status query attempt")

// error check returns when maxPollStatusAttempts have been reached
if err != nil {
attemptErrors = multierror.Append(attemptErrors, err)

// no implementations available.
if strings.Contains(err.Error(), "no FirmwareTaskVerifier implementations found") {
return errors.Wrap(
ErrFirmwareInstallFailed,
"Firmware install support for component not available:"+err.Error(),
)
}

// When a BMC is updating its own firmware, it can become unreachable while it
// applies the new firmware, and in most cases the BMC task information is lost.
//
// So if we get an error and it's a BMC component that was being updated, we wait for
// the BMC to be available again and validate that its firmware matches the one expected.
if componentIsBMC(action.Firmware.Component) {
tctx.Logger.WithFields(
logrus.Fields{
"bmc": tctx.Asset.BmcAddress,
"delay": delayBMCReset.String(),
"taskState": state,
"bmcTaskID": action.BMCTaskID,
"status": status,
"err": err.Error(),
}).Debug("BMC task status lookup returned error")

inventory = true
}

continue
}

switch status {
switch state {
// continue polling when install is running
case model.StatusInstallRunning:
case bconsts.FirmwareInstallInitializing, bconsts.FirmwareInstallQueued, bconsts.FirmwareInstallRunning:
continue

// record the unknown status as an error
case model.StatusInstallUnknown:
err = errors.New("firmware install status unknown")
case bconsts.FirmwareInstallUnknown:
err = errors.New("BMC firmware task status unknown")
attemptErrors = multierror.Append(attemptErrors, err)

continue

// return when bmc power cycle is required
case model.StatusInstallPowerCycleBMCRequired:
case bconsts.FirmwareInstallPowerCycleBMC:
action.BMCPowerCycleRequired = true
return nil

// return when host power cycle is required
case model.StatusInstallPowerCycleHostRequired:
case bconsts.FirmwareInstallPowerCycleHost:
action.HostPowerCycleRequired = true
return nil

// return error when install fails
case model.StatusInstallFailed:
case bconsts.FirmwareInstallFailed:
return errors.Wrap(
ErrFirmwareInstallFailed,
"check logs on the BMC for information, bmc task ID: "+action.BMCTaskID,
)

// return nil when install is complete
case model.StatusInstallComplete:
if strings.EqualFold(action.Firmware.Component, common.SlugBMC) {
tctx.Logger.WithFields(
logrus.Fields{
"bmc": tctx.Asset.BmcAddress,
"delay": delayBMCReset.String(),
}).Debug("BMC firmware install completed, added delay to allow the BMC to complete its update process..")

if err := sleepWithContext(tctx.Ctx, delayBMCReset); err != nil {
return errors.Wrap(
ErrFirmwareInstallFailed,
err.Error(),
)
}
case bconsts.FirmwareInstallComplete:
// The BMC would reset itself, and returning now would mean the next install fails;
// wait until the BMC is available again and verify it's on the expected version.
if componentIsBMC(action.Firmware.Component) {
inventory = true

continue
}

return nil

default:
return errors.Wrap(ErrFirmwareInstallStatusUnexpected, string(status))
return errors.Wrap(ErrFirmwareTaskStateUnexpected, "state: "+(state))
}
}
}
Expand Down Expand Up @@ -534,7 +607,8 @@ func (h *actionHandler) resetBMC(a sw.StateSwitch, c sw.TransitionArgs) error {
return nil
}

return h.pollFirmwareInstallStatus(a, c)
// TODO: implement a method that polls for BMC availability instead
return h.pollFirmwareTaskStatus(a, c)
}

func (h *actionHandler) resetDevice(a sw.StateSwitch, c sw.TransitionArgs) error {
Expand Down Expand Up @@ -563,7 +637,8 @@ func (h *actionHandler) resetDevice(a sw.StateSwitch, c sw.TransitionArgs) error
}
}

return h.pollFirmwareInstallStatus(a, c)
// TODO: check if this is required
return h.pollFirmwareTaskStatus(a, c)
}

func (h *actionHandler) conditionPowerOffDevice(action *model.Action, tctx *sm.HandlerContext) (bool, error) {
Expand Down
26 changes: 3 additions & 23 deletions internal/outofband/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,28 +229,8 @@ func (b *bmc) FirmwareInstallStatus(ctx context.Context, installVersion, compone
return model.StatusInstallUnknown, errors.Wrap(ErrBMCQuery, err.Error())
}

switch status {
case bmclibv2consts.FirmwareInstallInitializing, bmclibv2consts.FirmwareInstallQueued, bmclibv2consts.FirmwareInstallRunning:
return model.StatusInstallRunning, nil
case bmclibv2consts.FirmwareInstallPowerCyleHost:
// if the host is under reset (this is the final state only for queueing updates)
// if hostWasReset {
// return false, nil
// }
return model.StatusInstallPowerCycleHostRequired, nil
case bmclibv2consts.FirmwareInstallPowerCycleBMC:
// if BMC is under reset return false (this is the final state only for queuing the update)
// if bmcWasReset {
// return false, nil
// }
return model.StatusInstallPowerCycleBMCRequired, nil
case bmclibv2consts.FirmwareInstallComplete:
return model.StatusInstallComplete, nil
case bmclibv2consts.FirmwareInstallFailed:
return model.StatusInstallFailed, nil
case bmclibv2consts.FirmwareInstallUnknown:
return model.StatusInstallUnknown, nil
default:
return model.StatusInstallUnknown, errors.Wrap(ErrFirmwareInstallStatusUnexpected, status)
func (b *bmc) FirmwareInstallUploaded(ctx context.Context, component, uploadVerifyTaskID string) (installTaskID string, err error) {
if err := b.Open(ctx); err != nil {
return "", err
}
}

0 comments on commit 1ac12d1

Please sign in to comment.