Skip to content

Commit

Permalink
Fix SBE error handling for slave sbe start fails
Browse files Browse the repository at this point in the history
Set slave processors to use FSI SCOM until the SBE is running, then
switch to using SBE SCOM. Re-enable calls that were blocked
because they would attempt an SBE SCOM before the slave SBE was
running and would therefore fail.

Change-Id: I4d3e93ac6e53a6073584043c6adc7ecf32a4714b
RTC: 177921
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45063
Reviewed-by: Christian R. Geddes <crgeddes@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Elizabeth K. Liner <eliner@us.ibm.com>
Reviewed-by: Nicholas E. Bofferding <bofferdn@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
mgloff authored and dcrowell77 committed Sep 11, 2017
1 parent 7aed03e commit daa077f
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 108 deletions.
118 changes: 19 additions & 99 deletions src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C
Expand Up @@ -144,105 +144,8 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs )
}
}

TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"SBE 0x%.8X never started, l_sbeReg=0x%.8X",
TARGETING::get_huid(l_cpu_target),l_sbeReg.reg );
/*@
* @errortype
* @reasoncode RC_SBE_SLAVE_TIMEOUT
* @severity ERRORLOG::ERRL_SEV_INFORMATIONAL
* @moduleid MOD_CHECK_SLAVE_SBE_SEEPROM_COMPLETE
* @userdata1 HUID of proc which had SBE timeout
* @userdata2 SBE MSG Register
*
* @devdesc Slave SBE did not get to ready state within
* allotted time
*
* @custdesc A processor in the system has failed to initialize
*/
l_errl = new ErrlEntry(
ERRL_SEV_INFORMATIONAL,
MOD_CHECK_SLAVE_SBE_SEEPROM_COMPLETE,
RC_SBE_SLAVE_TIMEOUT,
TARGETING::get_huid(l_cpu_target),
l_sbeReg.reg);

l_errl->collectTrace( "ISTEPS_TRACE", 256);

//@fixme - RTC:177921
// Do not call p9_extract_sbe_rc because it corrupts
// live debug of fails. Need to make some other
// changes before turning this back on.
#if 1 // get rid of this
// Create IStep error log and cross reference to error
l_stepError.addErrorDetails( l_errl );

// Commit error log
errlCommit( l_errl, HWPF_COMP_ID );
#else

// Commit error and continue, this is not terminating since
// we can still at least boot with master proc
errlCommit(l_errl,ISTEP_COMP_ID);

// Setup for the HWP
P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction =
P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc,
l_fapi2ProcTarget, l_rcAction);

if(l_errl)
{
TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"ERROR : proc_check_slave_sbe_seeprom_complete "
"failed, p9_extract_sbe_rc HWP returning errorlog "
"PLID=0x%x",l_errl->plid());

// capture the target data in the elog
ErrlUserDetailsTarget(l_cpu_target).addToLog( l_errl );

// Create IStep error log and cross reference to error
l_stepError.addErrorDetails( l_errl );

// Commit error log
errlCommit( l_errl, HWPF_COMP_ID );

}
else if(l_rcAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED)
{

if(INITSERVICE::spBaseServicesEnabled())
{
// When we are on an FSP machine, we want to fail out of
// hostboot and give control back to the FSP. They have
// better diagnostics for this type of error.
INITSERVICE::doShutdownWithError(RC_HWSV_COLLECT_SBE_RC,
TARGETING::get_huid(l_cpu_target));
}

// Pull out previous rc error for threshold
uint8_t l_prevError = 0;

// Save the current rc error
(l_cpu_target)->setAttr<
TARGETING::ATTR_PREVIOUS_SBE_ERROR>(l_rcAction);
#ifdef CONFIG_BMC_IPMI
// This could potentially take awhile, reset watchdog
l_errl = IPMIWATCHDOG::resetWatchDogTimer();
if(l_errl)
{
TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"call_proc_check_slave_sbe_seeprom_complete "
"Resetting watchdog before sbe_handler");
l_errl->collectTrace("ISTEPS_TRACE",256);
errlCommit(l_errl,ISTEP_COMP_ID);
}
#endif
proc_extract_sbe_handler( l_cpu_target,
l_prevError, l_rcAction);
}
#endif //@fixme - RTC:177921

// Handle that SBE failed to boot in the allowed time
sbe_boot_fail_handler(l_cpu_target,l_sbeReg,&l_stepError);
}
else if (l_errl)
{
Expand All @@ -265,6 +168,23 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs )
// Set attribute indicating that SBE is started
l_cpu_target->setAttr<ATTR_SBE_IS_STARTED>(1);

// Switch to using SBE SCOM
ScomSwitches l_switches =
l_cpu_target->getAttr<ATTR_SCOM_SWITCHES>();
ScomSwitches l_switches_before = l_switches;

// Turn on SBE SCOM and turn off FSI SCOM.
l_switches.useFsiScom = 0;
l_switches.useSbeScom = 1;

TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"proc_check_slave_sbe_seeprom_complete: changing SCOM "
"switches from 0x%.2X to 0x%.2X for proc 0x%.8X",
l_switches_before,
l_switches,
TARGETING::get_huid(l_cpu_target));
l_cpu_target->setAttr<ATTR_SCOM_SWITCHES>(l_switches);

TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"SUCCESS : proc_check_slave_sbe_seeprom_complete"
" completed ok for proc 0x%.8X",
Expand Down
134 changes: 127 additions & 7 deletions src/usr/isteps/istep08/sbe_extract_rc_handler.C
Expand Up @@ -43,6 +43,7 @@
#include <fapi2/plat_hwp_invoker.H>
#include <isteps/istep_reasoncodes.H>
#include <initservice/isteps_trace.H>
#include <initservice/initserviceif.H>
#include <errl/errludtarget.H>
#include <sys/time.h>
#include <util/misc.H>
Expand All @@ -61,6 +62,7 @@
#include <sbeio/sbeioreasoncodes.H>

using namespace ISTEP;
using namespace ISTEP_ERROR;


/* array and enum must be in sync */
Expand Down Expand Up @@ -532,9 +534,7 @@ SBE_REG_RETURN check_sbe_reg(TARGETING::Target * i_target)
}

// Handle that SBE failed to boot in the allowed time
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"SBE 0x%.8X never started, l_sbeReg=0x%.8X",
TARGETING::get_huid(i_target),l_sbeReg.reg );
sbe_boot_fail_handler(i_target,l_sbeReg);
}
else if (l_errl)
{
Expand Down Expand Up @@ -740,16 +740,20 @@ errlHndl_t sbe_timeout_handler(sbeMsgReg_t * o_sbeReg,
if (l_errl)
{
TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"ERROR : call p9_get_sbe_msg_register, PLID=0x%x",
l_errl->plid() );
"ERROR : call p9_get_sbe_msg_register, PLID=0x%x, "
"on loop %d",
l_errl->plid(),
l_loops );
(*o_returnAction) = SBE_REG_RETURN::FUNCTION_ERROR;
break;
}
else if ((*o_sbeReg).currState == SBE_STATE_RUNTIME)
{
TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"SBE 0x%.8X booted and at runtime, o_sbeReg=0x%.8X",
TARGETING::get_huid(i_target), (*o_sbeReg).reg);
"SBE 0x%.8X booted and at runtime, o_sbeReg=0x%.8X, "
"on loop %d",
TARGETING::get_huid(i_target), (*o_sbeReg).reg,
l_loops);
(*o_returnAction) = SBE_REG_RETURN::SBE_AT_RUNTIME;
break;
}
Expand All @@ -775,6 +779,27 @@ errlHndl_t sbe_timeout_handler(sbeMsgReg_t * o_sbeReg,
nanosleep(0,SBE_WAIT_SLEEP);
}
}

if ((*o_sbeReg).currState != SBE_STATE_RUNTIME)
{
// Switch to using FSI SCOM
TARGETING::ScomSwitches l_switches =
i_target->getAttr<TARGETING::ATTR_SCOM_SWITCHES>();
TARGETING::ScomSwitches l_switches_before = l_switches;

// Turn off SBE SCOM and turn on FSI SCOM.
l_switches.useFsiScom = 1;
l_switches.useSbeScom = 0;

TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
"sbe_timeout_handler: changing SCOM switches from 0x%.2X "
"to 0x%.2X for proc 0x%.8X",
l_switches_before,
l_switches,
TARGETING::get_huid(i_target));
i_target->setAttr<TARGETING::ATTR_SCOM_SWITCHES>(l_switches);
}

return l_errl;
}

Expand Down Expand Up @@ -935,6 +960,101 @@ bool sbe_get_ffdc_handler(TARGETING::Target * i_target)
return l_flowCtrl;
}

/**
 * @brief Handle a processor whose SBE failed to boot in the allowed time.
 *
 * Creates and commits an informational RC_SBE_SLAVE_TIMEOUT error log,
 * invokes the p9_extract_sbe_rc HWP, and then either reports the HWP
 * failure or passes the action it returned to proc_extract_sbe_handler.
 *
 * @param[in]     i_target     - current proc target
 * @param[in]     i_sbeReg     - SBE message register contents
 * @param[in,out] io_stepError - istep error that an HWP failure is cross
 *                               referenced into; skipped when null
 */
void sbe_boot_fail_handler(TARGETING::Target * i_target,
                           sbeMsgReg_t i_sbeReg,
                           IStepError *io_stepError)
{
    TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
               "SBE 0x%.8X never started, sbeReg=0x%.8X",
               TARGETING::get_huid(i_target),i_sbeReg.reg );

    /*@
     * @errortype
     * @reasoncode  RC_SBE_SLAVE_TIMEOUT
     * @severity    ERRORLOG::ERRL_SEV_INFORMATIONAL
     * @moduleid    MOD_SBE_EXTRACT_RC_HANDLER
     * @userdata1   HUID of proc which had SBE timeout
     * @userdata2   SBE MSG Register
     *
     * @devdesc Slave SBE did not get to ready state within
     *          allotted time
     *
     * @custdesc A processor in the system has failed to initialize
     */
    errlHndl_t l_log = new ERRORLOG::ErrlEntry(
                                ERRORLOG::ERRL_SEV_INFORMATIONAL,
                                MOD_SBE_EXTRACT_RC_HANDLER,
                                RC_SBE_SLAVE_TIMEOUT,
                                TARGETING::get_huid(i_target),
                                i_sbeReg.reg);

    l_log->collectTrace( "ISTEPS_TRACE", KILOBYTE/4);

    // The timeout itself is not terminating: the system can still at
    // least boot with the master proc, so commit the log and carry on.
    errlCommit(l_log,ISTEP_COMP_ID);

    // Ask the HWP which recovery action applies to this SBE failure
    const fapi2::Target<fapi2::TARGET_TYPE_PROC_CHIP> l_procChip(i_target);
    P9_EXTRACT_SBE_RC::RETURN_ACTION l_action =
            P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM;
    FAPI_INVOKE_HWP(l_log, p9_extract_sbe_rc,
                    l_procChip, l_action);

    if(l_log)
    {
        // The HWP itself failed: report that and stop here
        TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
                  "ERROR : sbe_boot_fail_handler : "
                  "p9_extract_sbe_rc HWP returning errorlog "
                  "PLID=0x%x",l_log->plid());

        // Record which target the failure belongs to
        ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_log );

        // Cross reference into the istep error when the caller gave one
        if(io_stepError)
        {
            io_stepError->addErrorDetails( l_log );
        }

        errlCommit( l_log, HWPF_COMP_ID );
        return;
    }

    if(l_action == P9_EXTRACT_SBE_RC::ERROR_RECOVERED)
    {
        // Nothing further to do for a recovered error
        return;
    }

    if(INITSERVICE::spBaseServicesEnabled())
    {
        // When we are on an FSP machine, we want to fail out of
        // hostboot and give control back to the FSP. They have
        // better diagnostics for this type of error.
        INITSERVICE::doShutdownWithError(RC_HWSV_COLLECT_SBE_RC,
                                         TARGETING::get_huid(i_target));
    }

    // Remember the action returned for this failure on the target
    i_target->setAttr<TARGETING::ATTR_PREVIOUS_SBE_ERROR>(l_action);

#ifdef CONFIG_BMC_IPMI
    // This could potentially take awhile, reset watchdog
    l_log = IPMIWATCHDOG::resetWatchDogTimer();
    if(l_log)
    {
        TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace,
                  "sbe_boot_fail_handler "
                  "Resetting watchdog before sbe_handler");
        l_log->collectTrace("ISTEPS_TRACE",KILOBYTE/4);
        errlCommit(l_log,ISTEP_COMP_ID);
    }
#endif

    proc_extract_sbe_handler( i_target,
                              l_action);
}

errlHndl_t switch_sbe_sides(TARGETING::Target * i_target)
{
errlHndl_t l_errl = NULL;
Expand Down
16 changes: 16 additions & 0 deletions src/usr/isteps/istep08/sbe_extract_rc_handler.H
Expand Up @@ -25,6 +25,8 @@
#ifndef __SBE_EXTRACT_RC_HANDLER_H
#define __SBE_EXTRACT_RC_HANDLER_H

#include <isteps/hwpisteperror.H>

enum SBE_REG_RETURN
{
FUNCTION_ERROR = 0, // Error returned from HWP
Expand Down Expand Up @@ -195,6 +197,20 @@ P9_EXTRACT_SBE_RC::RETURN_ACTION action_for_ffdc_rc( uint32_t i_rc);

bool sbe_get_ffdc_handler(TARGETING::Target * i_target);

/**
 * @brief This function handles the SBE failed to boot error.
 *
 * @param[in]     i_target     - current proc target
 * @param[in]     i_sbeReg     - sbe register
 * @param[in,out] io_stepError - istep error (optional, defaults to nullptr)
 *
 * @return void
 */

void sbe_boot_fail_handler(TARGETING::Target * i_target,
sbeMsgReg_t i_sbeReg,
ISTEP_ERROR::IStepError *io_stepError = nullptr);

/**
* @brief This function deals with the mask needed to switch
* boot side on the SBE for a given proc
Expand Down
5 changes: 5 additions & 0 deletions src/usr/targeting/common/genHwsvMrwXml.pl
Expand Up @@ -3639,6 +3639,11 @@ sub generate_proc
my $UseXscom = $haveFSPs ? 0 : 1;
my $UseFsiScom = $haveFSPs ? 0 : 1;
my $UseSbeScom = $haveFSPs ? 1 : 0;
if($proc ne 0)
{
$UseFsiScom = 1;
$UseSbeScom = 0;
}

my $fapi_name = sprintf("pu:k0:n%d:s0:p%02d", $node, $proc);
print "
Expand Down
10 changes: 8 additions & 2 deletions src/usr/targeting/common/processMrw.pl
Expand Up @@ -929,20 +929,26 @@ sub processProcessor
$targetObj->setAttribute($target, "FSI_MASTER_TYPE", "NO_MASTER");
$targetObj->setAttribute($target, "FSI_SLAVE_CASCADE", "0");
$targetObj->setAttribute($target, "PROC_MASTER_TYPE", "ACTING_MASTER");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom",
"1");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom",
"0");
}
else
{
$targetObj->setAttribute($target, "PROC_MASTER_TYPE",
"NOT_MASTER");
$targetObj->setAttribute($target, "ALTFSI_MASTER_CHIP", "physical:sys-0");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom",
"0");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom",
"1");
}
## Update bus speeds
processI2cSpeeds($targetObj,$target);
## these are hardcoded because code sets them properly
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "reserved", "0");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom", "1");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom", "0");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useInbandScom",
"0");
$targetObj->setAttributeField($target, "SCOM_SWITCHES", "useXscom", "0");
Expand Down

0 comments on commit daa077f

Please sign in to comment.