Skip to content

Commit

Permalink
Correctly handle psu FFDC on OpenPower Systems
Browse files Browse the repository at this point in the history
Prior to this change, there was a switch statement that effectively said
"if this RC is found in the psu FFDC, then do this recovery action".
That is not easy to maintain, because for every new error
we need to add the proper action. Instead, we now simply
check whether any GARD records were created as part of the error found in
the FFDC. If a gard was found, Hostboot will stop trying to recover
the SBE and instead enter a reconfig loop to try to IPL with the target
garded out. Again, this only applies to OP systems; in the FSP world we
will commit the error logs with the gard records and then TI, telling HWSV
that it needs to look at the SBE.

Change-Id: I04e03feebf2bbd1eae2d725bee31993062fe7c94
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/66374
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Roland Veloz <rveloz@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
crgeddes authored and dcrowell77 committed Sep 27, 2018
1 parent 0d43552 commit 7511e13
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 148 deletions.
24 changes: 0 additions & 24 deletions src/include/usr/sbeio/sbe_retry_handler.H
Original file line number Diff line number Diff line change
Expand Up @@ -270,17 +270,6 @@ class SbeRetryHandler
*/
errlHndl_t switch_sbe_sides(TARGETING::Target * i_target);


/**
* @brief Maps the RC value found in an SBE FFDC package to the
* recovery action that should be taken for it. Implemented as
* a switch over the known p9_extract_sbe_rc return codes.
*
* @param[in] i_rc - RC value from SBE FFDC package
*
* @return - the P9_EXTRACT_SBE_RC action associated with i_rc, or
* NO_ACTION_FOUND_FOR_THIS_RC if i_rc is not a recognized RC
*/
uint32_t action_for_ffdc_rc( uint32_t i_rc);

/**
* @brief This function handles the call to the p9_get_sbe_msg_handler.
* It will read the sbe msg register (Cfam 2809 or Scom 50009)
Expand Down Expand Up @@ -363,19 +352,6 @@ class SbeRetryHandler
*/
uint8_t iv_currentSideBootAttempts;

/*
* @brief True when the current recovery action was chosen from an RC
* found in the SBE's asyncFFDC; false when it was chosen by
* p9_extract_sbe_rc. It is cleared again once the action has
* been performed.
*
* Background: if the asyncFFDC bit is found to be set on the
* status register, this indicates to Hostboot that the SBE was
* able to collect FFDC about what went wrong in its attempt to
* boot itself. In this case Hostboot will send a FIFO chip op
* to the SBE so the SBE will write the FFDC data out to memory
* where Hostboot can parse it. Note that after the SBE writes
* the data to memory, the asyncFFDC bit on the status register
* will be off.
*/
bool iv_ffdcSetAction;


/*
* @brief The mode of operation that needs to be run through the
* SbeRetryHandler. The different modes are specified in the
Expand Down
173 changes: 49 additions & 124 deletions src/usr/sbeio/common/sbe_retry_handler.C
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@
#include <trace/interface.H>
#include <errl/errlentry.H>
#include <errl/errlmanager.H>
#include <errl/errlreasoncodes.H>
#include <p9_extract_sbe_rc.H>

#include <fapi2/target.H>
#include <fapi2/plat_hwp_invoker.H>
#include <initservice/isteps_trace.H>
#include <initservice/initserviceif.H>
#include <initservice/istepdispatcherif.H>
#include <initservice/initsvcreasoncodes.H>
#include <errl/errludtarget.H>
#include <util/misc.H>
#include <ipmi/ipmiwatchdog.H>
Expand Down Expand Up @@ -100,12 +102,6 @@ constexpr uint8_t MAX_SIDE_BOOT_ATTEMPTS = 2;
// add to an errorlog but otherwise ignores
constexpr uint8_t MAX_EXPECTED_FFDC_PACKAGES = 2;

// action_for_ffdc_rc will figure out what action we should do
// for each p9_extract_sbe_rc return code. If the RC does not match
// any return code from p9_extract_sbe_rc then we want to have a
// known "no action found" value which is defined here
constexpr uint32_t NO_ACTION_FOUND_FOR_THIS_RC = 0xFFFF;

// Set up constants that will be used for setting up the timeout for
// reading the sbe message register
constexpr uint64_t SBE_RETRY_TIMEOUT_HW_SEC = 60; // 60 seconds
Expand All @@ -128,7 +124,6 @@ SbeRetryHandler::SbeRetryHandler(SBE_MODE_OF_OPERATION i_sbeMode,
, iv_currentSBEState(SBE_REG_RETURN::SBE_NOT_AT_RUNTIME)
, iv_shutdownReturnCode(0)
, iv_currentSideBootAttempts(1) // It is safe to assume that the current side has attempted to boot
, iv_ffdcSetAction(false)
, iv_sbeMode(i_sbeMode)
, iv_sbeRestartMethod(SBE_RESTART_METHOD::HRESET)
, iv_initialPowerOn(false)
Expand Down Expand Up @@ -207,7 +202,7 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
ERRORLOG::ERRL_SEV_UNRECOVERABLE,
SBEIO_EXTRACT_RC_HANDLER,
SBEIO_SLAVE_FAILED_TO_BOOT,
this->iv_ffdcSetAction,
this->iv_sbeRegister.asyncFFDC,
TARGETING::get_huid(i_target));

l_errl->collectTrace( "ISTEPS_TRACE", 256);
Expand All @@ -226,12 +221,10 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
}
#endif

// If iv_ffdcSetAction is true, that means that we found ffdc to parse
// this indicates that the SBE already determined what went wrong and
// reported the error via asyncFFDC so there is no need to
// run p9_extract_sbe_rc
// Also if the sbe is not booted at all, extract_rc will fail so we don't want to run it
if(!this->iv_ffdcSetAction && this->iv_sbeRegister.sbeBooted)

// if the sbe is not booted at all extract_rc will fail so we only
// will run extract RC if we know the sbe has at least tried to boot
if(this->iv_sbeRegister.sbeBooted)
{
SBE_TRACF("main_sbe_handler(): No async ffdc found and sbe says it has been booted, running run p9_sbe_extract_rc.");
// Call the function that runs extract_rc, this needs to run to determine
Expand All @@ -241,7 +234,7 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
// If we have determined that the sbe never booted
// then set the current action to be "restart sbe"
// that way we will attempt to start the sbe again
else if(!this->iv_sbeRegister.sbeBooted)
else
{
SBE_TRACF("main_sbe_handler(): SBE reports it was never booted, calling p9_sbe_extract_rc will fail. Setting action to be RESTART_SBE");
this->iv_currentAction = P9_EXTRACT_SBE_RC::RESTART_SBE;
Expand Down Expand Up @@ -553,9 +546,6 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
}
}

// We have performed the action, so make sure that ffdcSetAction is set back to 0
this->iv_ffdcSetAction = 0;

// Get the sbe register (note that if asyncFFDC bit is set in status register then
// we will read it in this call)
if(!this->sbe_run_extract_msg_reg(i_target))
Expand All @@ -565,19 +555,12 @@ void SbeRetryHandler::main_sbe_handler( TARGETING::Target * i_target )
break;
}

// If our retry attempt fail, and we didnt see any asyncFFDC after
// If the currState of the SBE is not RUNTIME then we will assume
// our attempt to boot the SBE has failed, so run extract rc again
// to determine why we have failed
if (this->iv_sbeRegister.currState != SBE_STATE_RUNTIME)
{
// Again, if ffdcSetAction is set, that means we have found FFDC
// already that the SBE saved away prior to failing so we don't need
// to run extract_rc if ffdcSetAction is true
if(!this->iv_ffdcSetAction)
{
SBE_TRACF("main_sbe_handler(): Failed to reach runtime after sbe restart and no asyncFFDC found. Calling p9_sbe_extract_rc.");
// Run extract rc to figure out why the sbe did not make it to
// runtime state
this->sbe_run_extract_rc(i_target);
}
this->sbe_run_extract_rc(i_target);
}

} while((this->iv_sbeRegister).currState != SBE_STATE_RUNTIME);
Expand Down Expand Up @@ -637,8 +620,8 @@ bool SbeRetryHandler::sbe_run_extract_msg_reg(TARGETING::Target * i_target)
(this->iv_sbeRegister.currState != SBE_STATE_RUNTIME) &&
this->iv_sbeRegister.asyncFFDC)
{
SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X . "
"There was asyncFFDC found though so we will run the FFDC parser",
SBE_TRACF("WARNING: sbe_run_extract_msg_reg completed without error for proc 0x%.8X . "
"However, there was asyncFFDC found though so we will run the FFDC parser",
TARGETING::get_huid(i_target));
// The SBE has responded to an asynchronous request that hostboot
// made with FFDC indicating an error has occurred.
Expand Down Expand Up @@ -669,7 +652,7 @@ bool SbeRetryHandler::sbe_run_extract_msg_reg(TARGETING::Target * i_target)
// No guarantees that the SBE made it to runtime
else
{
SBE_TRACF("SUCCESS: sbe_run_extract_msg_reg completed okay for proc 0x%.8X",
SBE_TRACF("sbe_run_extract_msg_reg completed without error for proc 0x%.8X",
TARGETING::get_huid(i_target));
}

Expand Down Expand Up @@ -826,76 +809,20 @@ void SbeRetryHandler::handleFspIplTimeFail(TARGETING::Target * i_target)
}
#endif

/**
 * @brief Translate an RC taken from an SBE FFDC package into the recovery
 *        action Hostboot should attempt.
 *
 * @param[in] i_rc - RC value from SBE FFDC package
 *
 * @return the P9_EXTRACT_SBE_RC action mapped to i_rc, or
 *         NO_ACTION_FOUND_FOR_THIS_RC when i_rc is not one of the
 *         return codes handled below
 */
uint32_t SbeRetryHandler::action_for_ffdc_rc(uint32_t i_rc)
{
    SBE_TRACF(ENTER_MRK "action_for_ffdc_rc()");

    // Start from the "unknown RC" answer; a matching case overrides it.
    uint32_t l_recoveryAction = NO_ACTION_FOUND_FOR_THIS_RC;

    switch(i_rc)
    {
        // Failures where simply restarting the SBE is worth trying
        case fapi2::RC_EXTRACT_SBE_RC_RUNNING:
        case fapi2::RC_EXTRACT_SBE_RC_NEVER_STARTED:
        case fapi2::RC_EXTRACT_SBE_RC_PROGRAM_INTERRUPT:
        case fapi2::RC_EXTRACT_SBE_RC_ADDR_NOT_RECOGNIZED:
        case fapi2::RC_EXTRACT_SBE_RC_PIBMEM_ECC_ERR:
        case fapi2::RC_EXTRACT_SBE_RC_FI2CM_BIT_RATE_ERR_NONSECURE_MODE:
            l_recoveryAction = P9_EXTRACT_SBE_RC::RESTART_SBE;
            break;

        // Failures mapped to re-IPL with an updated SEEPROM
        case fapi2::RC_EXTRACT_SBE_RC_MAGIC_NUMBER_MISMATCH:
        case fapi2::RC_EXTRACT_SBE_RC_FI2C_ECC_ERR:
        case fapi2::RC_EXTRACT_SBE_RC_FI2C_ECC_ERR_NONSECURE_MODE:
            l_recoveryAction = P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
            break;

        // Failures mapped to re-IPL from the backup SEEPROM
        case fapi2::RC_EXTRACT_SBE_RC_FI2C_TIMEOUT:
        case fapi2::RC_EXTRACT_SBE_RC_SBE_L1_LOADER_FAIL:
        case fapi2::RC_EXTRACT_SBE_RC_SBE_L2_LOADER_FAIL:
        case fapi2::RC_EXTRACT_SBE_RC_UNKNOWN_ERROR:
            l_recoveryAction = P9_EXTRACT_SBE_RC::REIPL_BKP_SEEPROM;
            break;

        // Failures mapped to restarting the CBS
        case fapi2::RC_EXTRACT_SBE_RC_OTP_TIMEOUT:
        case fapi2::RC_EXTRACT_SBE_RC_OTP_PIB_ERR:
        case fapi2::RC_EXTRACT_SBE_RC_PIBMEM_PIB_ERR:
        case fapi2::RC_EXTRACT_SBE_RC_FI2C_SPRM_CFG_ERR:
        case fapi2::RC_EXTRACT_SBE_RC_FI2C_PIB_ERR:
            l_recoveryAction = P9_EXTRACT_SBE_RC::RESTART_CBS;
            break;

        // Failures for which no recovery action is available
        case fapi2::RC_EXTRACT_SBE_RC_BRANCH_TO_SEEPROM_FAIL:
        case fapi2::RC_EXTRACT_SBE_RC_UNEXPECTED_OTPROM_HALT:
        case fapi2::RC_EXTRACT_SBE_RC_OTP_ECC_ERR:
            l_recoveryAction = P9_EXTRACT_SBE_RC::NO_RECOVERY_ACTION;
            break;

        // Unrecognized RC: keep NO_ACTION_FOUND_FOR_THIS_RC
        default:
            break;
    }

    SBE_TRACF(EXIT_MRK "action_for_ffdc_rc()");
    return l_recoveryAction;
}

void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
{
SBE_TRACF(ENTER_MRK "sbe_get_ffdc_handler()");
uint32_t l_responseSize = SbeFifoRespBuffer::MSG_BUFFER_SIZE;
uint32_t *l_pFifoResponse =
reinterpret_cast<uint32_t *>(malloc(l_responseSize));

// For OpenPower systems if a piece of HW is garded then we will
// need to force a reconfigure loop and avoid the rest of the
// sbe recovery process. On FSP systems if HW callouts are found in
// the FFDC, we just commit the errorlog and TI telling HWSV to look
// at the failure
bool l_reconfigRequired = false;

#ifndef __HOSTBOOT_RUNTIME
errlHndl_t l_errl = nullptr;
l_errl = getFifoSBEFFDC(i_target,
Expand Down Expand Up @@ -986,10 +913,6 @@ void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
// Get the RC from the FFDC package
uint32_t l_rc = l_ffdc_parser->getPackageRC(i);

// Determine an action for the RC
P9_EXTRACT_SBE_RC::RETURN_ACTION l_action =
static_cast<P9_EXTRACT_SBE_RC::RETURN_ACTION>(action_for_ffdc_rc(l_rc));

//See if HWP error, create another error log with callouts
if (l_rc != fapi2::FAPI2_RC_PLAT_ERR_SEE_DATA)
{
Expand All @@ -1010,8 +933,30 @@ void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
uint32_t l_pos = i_target->getAttr<TARGETING::ATTR_FAPI_POS>();
FAPI_SET_SBE_ERROR(l_fapiRc, l_rc, &l_sbeFfdc, l_pos);
errlHndl_t l_sbeHwpfErr = rcToErrl(l_fapiRc);
// If we created an error successfully we must now commit it
if(l_sbeHwpfErr)
{
// On BMC systems we must do a reconfig loop if gard is found
if(!INITSERVICE::spBaseServicesEnabled())
{
// Iterate over user details sections of the error log to check for UD
// callouts from the HWPF component
// NOTE: rcToErrl will make UD Callouts have ERRL_COMP_ID/ERRL_UDT_CALLOUT
for(const auto l_callout : l_sbeHwpfErr->getUDSections(ERRL_COMP_ID,
ERRORLOG::ERRL_UDT_CALLOUT) )
{
// IF the callout has a gard associated with it we need to do a reconfig loop
if((reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->type == HWAS::HW_CALLOUT &&
reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->gardErrorType != HWAS::GARD_NULL) ||
(reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->type == HWAS::CLOCK_CALLOUT &&
reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->clkGardErrorType != HWAS::GARD_NULL) ||
(reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->type == HWAS::PART_CALLOUT &&
reinterpret_cast<HWAS::callout_ud_t*>(l_callout)->partGardErrorType != HWAS::GARD_NULL))
{
l_reconfigRequired = true;
}
}
}
// Set the PLID of the error log to master PLID
// if the master PLID is set
updatePlids(l_sbeHwpfErr);
Expand All @@ -1029,26 +974,6 @@ void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
SBEIO_UDT_PARAMETERS,
false );
}

if(l_action != NO_ACTION_FOUND_FOR_THIS_RC)
{
// Set the action associated with the RC that we found
this->iv_currentAction = l_action;

// This call will look at what action_for_ffdc_rc had set the return action to
// checks on how many times we have attempted to boot this side,
// and if we have already tried switching sides
//
//
// Note this call is important, if this is not called we could end up in a
// endless loop because this enforces MAX_SWITCH_SIDE_COUNT and MAX_SIDE_BOOT_ATTEMPTS
this->bestEffortCheck();

// Set the instance variable ffdcSetAction to let us
// know that the current action was set from what we
// found in the asyncFFDC
this->iv_ffdcSetAction = true;
}
}

l_errl->collectTrace( SBEIO_COMP_NAME, KILOBYTE/4);
Expand All @@ -1066,6 +991,11 @@ void SbeRetryHandler::sbe_get_ffdc_handler(TARGETING::Target * i_target)
free(l_pFifoResponse);
l_pFifoResponse = nullptr;

if(l_reconfigRequired)
{
INITSERVICE::doShutdown(INITSERVICE::SHUTDOWN_DO_RECONFIG_LOOP);
}

SBE_TRACF(EXIT_MRK "sbe_get_ffdc_handler()");
}

Expand Down Expand Up @@ -1103,11 +1033,6 @@ void SbeRetryHandler::sbe_run_extract_rc(TARGETING::Target * i_target)
l_errl = rcToErrl(l_rc, ERRORLOG::ERRL_SEV_UNRECOVERABLE);
this->iv_currentAction = l_ret;

// Set the instance variable ffdcSetAction to let us
// know that the current action was not set by what
// we found in asyncFFDC
this->iv_ffdcSetAction = false;

// This call will look at what p9_extact_sbe_rc had set the return action to
// checks on how many times we have attempted to boot this side,
// and if we have already tried switching sides
Expand Down

0 comments on commit 7511e13

Please sign in to comment.