From daa077f45d06c001ccee098c13acaf8df6d4bdce Mon Sep 17 00:00:00 2001 From: Marty Gloff Date: Wed, 23 Aug 2017 15:00:39 -0500 Subject: [PATCH] Fix SBE error handling for slave sbe start fails Set slave processors to use FSI SCOM until SBE is running, then switch to using SBE SCOM. Re-enable calls that were blocked because they would attempt an SBE SCOM before the slave SBE was running, and so, would fail. Change-Id: I4d3e93ac6e53a6073584043c6adc7ecf32a4714b RTC: 177921 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45063 Reviewed-by: Christian R. Geddes Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Tested-by: FSP CI Jenkins Tested-by: Jenkins OP HW Reviewed-by: Elizabeth K. Liner Reviewed-by: Nicholas E. Bofferding Reviewed-by: Daniel M. Crowell --- ...ll_proc_check_slave_sbe_seeprom_complete.C | 118 +++------------ .../isteps/istep08/sbe_extract_rc_handler.C | 134 +++++++++++++++++- .../isteps/istep08/sbe_extract_rc_handler.H | 16 +++ src/usr/targeting/common/genHwsvMrwXml.pl | 5 + src/usr/targeting/common/processMrw.pl | 10 +- 5 files changed, 175 insertions(+), 108 deletions(-) diff --git a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C index 9e5155d34de..17fd0b80953 100644 --- a/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C +++ b/src/usr/isteps/istep08/call_proc_check_slave_sbe_seeprom_complete.C @@ -144,105 +144,8 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs ) } } - TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, - "SBE 0x%.8X never started, l_sbeReg=0x%.8X", - TARGETING::get_huid(l_cpu_target),l_sbeReg.reg ); - /*@ - * @errortype - * @reasoncode RC_SBE_SLAVE_TIMEOUT - * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL - * @moduleid MOD_CHECK_SLAVE_SBE_SEEPROM_COMPLETE - * @userdata1 HUID of proc which had SBE timeout - * @userdata2 SBE MSG Register - * - * @devdesc Slave SBE did not get to ready state within - * 
allotted time - * - * @custdesc A processor in the system has failed to initialize - */ - l_errl = new ErrlEntry( - ERRL_SEV_INFORMATIONAL, - MOD_CHECK_SLAVE_SBE_SEEPROM_COMPLETE, - RC_SBE_SLAVE_TIMEOUT, - TARGETING::get_huid(l_cpu_target), - l_sbeReg.reg); - - l_errl->collectTrace( "ISTEPS_TRACE", 256); - - //@fixme - RTC:177921 - // Do not call p9_extract_sbe_rc because it corrupts - // live debug of fails. Need to make some other - // changes before turning this back on. -#if 1 // get rid of this - // Create IStep error log and cross reference to error - l_stepError.addErrorDetails( l_errl ); - - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); -#else - - // Commit error and continue, this is not terminating since - // we can still at least boot with master proc - errlCommit(l_errl,ISTEP_COMP_ID); - - // Setup for the HWP - P9_EXTRACT_SBE_RC::RETURN_ACTION l_rcAction = - P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; - FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, - l_fapi2ProcTarget, l_rcAction); - - if(l_errl) - { - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "ERROR : proc_check_slave_sbe_seeprom_complete " - "failed, p9_extract_sbe_rc HWP returning errorlog " - "PLID=0x%x",l_errl->plid()); - - // capture the target data in the elog - ErrlUserDetailsTarget(l_cpu_target).addToLog( l_errl ); - - // Create IStep error log and cross reference to error - l_stepError.addErrorDetails( l_errl ); - - // Commit error log - errlCommit( l_errl, HWPF_COMP_ID ); - - } - else if(l_rcAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED) - { - - if(INITSERVICE::spBaseServicesEnabled()) - { - // When we are on an FSP machine, we want to fail out of - // hostboot and give control back to the FSP. They have - // better diagnostics for this type of error. 
- INITSERVICE::doShutdownWithError(RC_HWSV_COLLECT_SBE_RC, - TARGETING::get_huid(l_cpu_target)); - } - - // Pull out previous rc error for threshold - uint8_t l_prevError = 0; - - // Save the current rc error - (l_cpu_target)->setAttr< - TARGETING::ATTR_PREVIOUS_SBE_ERROR>(l_rcAction); -#ifdef CONFIG_BMC_IPMI - // This could potentially take awhile, reset watchdog - l_errl = IPMIWATCHDOG::resetWatchDogTimer(); - if(l_errl) - { - TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "call_proc_check_slave_sbe_seeprom_complete " - "Resetting watchdog before sbe_handler"); - l_errl->collectTrace("ISTEPS_TRACE",256); - errlCommit(l_errl,ISTEP_COMP_ID); - } -#endif - proc_extract_sbe_handler( l_cpu_target, - l_prevError, l_rcAction); - } -#endif //@fixme - RTC:177921 - + // Handle that SBE failed to boot in the allowed time + sbe_boot_fail_handler(l_cpu_target,l_sbeReg,&l_stepError); } else if (l_errl) { @@ -265,6 +168,23 @@ void* call_proc_check_slave_sbe_seeprom_complete( void *io_pArgs ) // Set attribute indicating that SBE is started l_cpu_target->setAttr(1); + // Switch to using SBE SCOM + ScomSwitches l_switches = + l_cpu_target->getAttr(); + ScomSwitches l_switches_before = l_switches; + + // Turn on SBE SCOM and turn off FSI SCOM. 
+ l_switches.useFsiScom = 0; + l_switches.useSbeScom = 1; + + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "proc_check_slave_sbe_seeprom_complete: changing SCOM " + "switches from 0x%.2X to 0x%.2X for proc 0x%.8X", + l_switches_before, + l_switches, + TARGETING::get_huid(l_cpu_target)); + l_cpu_target->setAttr(l_switches); + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, "SUCCESS : proc_check_slave_sbe_seeprom_complete" " completed ok for proc 0x%.8X", diff --git a/src/usr/isteps/istep08/sbe_extract_rc_handler.C b/src/usr/isteps/istep08/sbe_extract_rc_handler.C index dc43c32e650..0ba22ccf357 100644 --- a/src/usr/isteps/istep08/sbe_extract_rc_handler.C +++ b/src/usr/isteps/istep08/sbe_extract_rc_handler.C @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include using namespace ISTEP; +using namespace ISTEP_ERROR; /* array and enum must be in sync */ @@ -532,9 +534,7 @@ SBE_REG_RETURN check_sbe_reg(TARGETING::Target * i_target) } // Handle that SBE failed to boot in the allowed time - TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, - "SBE 0x%.8X never started, l_sbeReg=0x%.8X", - TARGETING::get_huid(i_target),l_sbeReg.reg ); + sbe_boot_fail_handler(i_target,l_sbeReg); } else if (l_errl) { @@ -740,16 +740,20 @@ errlHndl_t sbe_timeout_handler(sbeMsgReg_t * o_sbeReg, if (l_errl) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "ERROR : call p9_get_sbe_msg_register, PLID=0x%x", - l_errl->plid() ); + "ERROR : call p9_get_sbe_msg_register, PLID=0x%x, " + "on loop %d", + l_errl->plid(), + l_loops ); (*o_returnAction) = SBE_REG_RETURN::FUNCTION_ERROR; break; } else if ((*o_sbeReg).currState == SBE_STATE_RUNTIME) { TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, - "SBE 0x%.8X booted and at runtime, o_sbeReg=0x%.8X", - TARGETING::get_huid(i_target), (*o_sbeReg).reg); + "SBE 0x%.8X booted and at runtime, o_sbeReg=0x%.8X, " + "on loop %d", + TARGETING::get_huid(i_target), (*o_sbeReg).reg, + l_loops); (*o_returnAction) = 
SBE_REG_RETURN::SBE_AT_RUNTIME; break; } @@ -775,6 +779,27 @@ errlHndl_t sbe_timeout_handler(sbeMsgReg_t * o_sbeReg, nanosleep(0,SBE_WAIT_SLEEP); } } + + if ((*o_sbeReg).currState != SBE_STATE_RUNTIME) + { + // Switch to using FSI SCOM + TARGETING::ScomSwitches l_switches = + i_target->getAttr(); + TARGETING::ScomSwitches l_switches_before = l_switches; + + // Turn off SBE SCOM and turn on FSI SCOM. + l_switches.useFsiScom = 1; + l_switches.useSbeScom = 0; + + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "sbe_timeout_handler: changing SCOM switches from 0x%.2X " + "to 0x%.2X for proc 0x%.8X", + l_switches_before, + l_switches, + TARGETING::get_huid(i_target)); + i_target->setAttr(l_switches); + } + return l_errl; } @@ -935,6 +960,101 @@ bool sbe_get_ffdc_handler(TARGETING::Target * i_target) return l_flowCtrl; } +void sbe_boot_fail_handler(TARGETING::Target * i_target, + sbeMsgReg_t i_sbeReg, + IStepError *io_stepError) +{ + errlHndl_t l_errl = nullptr; + + TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace, + "SBE 0x%.8X never started, sbeReg=0x%.8X", + TARGETING::get_huid(i_target),i_sbeReg.reg ); + /*@ + * @errortype + * @reasoncode RC_SBE_SLAVE_TIMEOUT + * @severity ERRORLOG::ERRL_SEV_INFORMATIONAL + * @moduleid MOD_SBE_EXTRACT_RC_HANDLER + * @userdata1 HUID of proc which had SBE timeout + * @userdata2 SBE MSG Register + * + * @devdesc Slave SBE did not get to ready state within + * allotted time + * + * @custdesc A processor in the system has failed to initialize + */ + l_errl = new ERRORLOG::ErrlEntry(ERRORLOG::ERRL_SEV_INFORMATIONAL, + MOD_SBE_EXTRACT_RC_HANDLER, + RC_SBE_SLAVE_TIMEOUT, + TARGETING::get_huid(i_target), + i_sbeReg.reg); + + l_errl->collectTrace( "ISTEPS_TRACE", KILOBYTE/4); + + // Commit error and continue, this is not terminating since + // we can still at least boot with master proc + errlCommit(l_errl,ISTEP_COMP_ID); + + // Setup for the HWP + const fapi2::Target l_fapi2ProcTarget( + const_cast (i_target)); + P9_EXTRACT_SBE_RC::RETURN_ACTION 
l_rcAction = + P9_EXTRACT_SBE_RC::REIPL_UPD_SEEPROM; + FAPI_INVOKE_HWP(l_errl, p9_extract_sbe_rc, + l_fapi2ProcTarget, l_rcAction); + + if(l_errl) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "ERROR : sbe_boot_fail_handler : " + "p9_extract_sbe_rc HWP returning errorlog " + "PLID=0x%x",l_errl->plid()); + + // capture the target data in the elog + ERRORLOG::ErrlUserDetailsTarget(i_target).addToLog( l_errl ); + + // Create IStep error log and cross reference to error + if(io_stepError) + { + io_stepError->addErrorDetails( l_errl ); + } + + // Commit error log + errlCommit( l_errl, HWPF_COMP_ID ); + + } + else if(l_rcAction != P9_EXTRACT_SBE_RC::ERROR_RECOVERED) + { + + if(INITSERVICE::spBaseServicesEnabled()) + { + // When we are on an FSP machine, we want to fail out of + // hostboot and give control back to the FSP. They have + // better diagnostics for this type of error. + INITSERVICE::doShutdownWithError(RC_HWSV_COLLECT_SBE_RC, + TARGETING::get_huid(i_target)); + } + + // Save the current rc error + (i_target)->setAttr(l_rcAction); +#ifdef CONFIG_BMC_IPMI + // This could potentially take awhile, reset watchdog + l_errl = IPMIWATCHDOG::resetWatchDogTimer(); + if(l_errl) + { + TRACFCOMP(ISTEPS_TRACE::g_trac_isteps_trace, + "sbe_boot_fail_handler " + "Resetting watchdog before sbe_handler"); + l_errl->collectTrace("ISTEPS_TRACE",KILOBYTE/4); + errlCommit(l_errl,ISTEP_COMP_ID); + } +#endif + proc_extract_sbe_handler( i_target, + l_rcAction); + } + + return; +} + errlHndl_t switch_sbe_sides(TARGETING::Target * i_target) { errlHndl_t l_errl = NULL; diff --git a/src/usr/isteps/istep08/sbe_extract_rc_handler.H b/src/usr/isteps/istep08/sbe_extract_rc_handler.H index eba029539db..580a17a52b2 100644 --- a/src/usr/isteps/istep08/sbe_extract_rc_handler.H +++ b/src/usr/isteps/istep08/sbe_extract_rc_handler.H @@ -25,6 +25,8 @@ #ifndef __SBE_EXTRACT_RC_HANDLER_H #define __SBE_EXTRACT_RC_HANDLER_H +#include + enum SBE_REG_RETURN { FUNCTION_ERROR = 0, // Error returned 
from HWP @@ -195,6 +197,20 @@ P9_EXTRACT_SBE_RC::RETURN_ACTION action_for_ffdc_rc( uint32_t i_rc); bool sbe_get_ffdc_handler(TARGETING::Target * i_target); +/** + * @brief This function handles the SBE failed to boot error. + * + * @param[in] i_target - current proc target + * @param[in] i_sbeReg - sbe register + * @param[in/out] io_stepError - istep error (optional, may be nullptr) + * + * @return - none + */ + +void sbe_boot_fail_handler(TARGETING::Target * i_target, + sbeMsgReg_t i_sbeReg, + ISTEP_ERROR::IStepError *io_stepError = nullptr); + /** * @brief This function deals with the mask needed to switch * boot side on the SBE for a given proc diff --git a/src/usr/targeting/common/genHwsvMrwXml.pl b/src/usr/targeting/common/genHwsvMrwXml.pl index 4fca434ae11..742f29da436 100755 --- a/src/usr/targeting/common/genHwsvMrwXml.pl +++ b/src/usr/targeting/common/genHwsvMrwXml.pl @@ -3639,6 +3639,11 @@ sub generate_proc my $UseXscom = $haveFSPs ? 0 : 1; my $UseFsiScom = $haveFSPs ? 0 : 1; my $UseSbeScom = $haveFSPs ? 
1 : 0; + if($proc ne 0) + { + $UseFsiScom = 1; + $UseSbeScom = 0; + } my $fapi_name = sprintf("pu:k0:n%d:s0:p%02d", $node, $proc); print " diff --git a/src/usr/targeting/common/processMrw.pl b/src/usr/targeting/common/processMrw.pl index 8b969de72a1..5013bbc9a8b 100755 --- a/src/usr/targeting/common/processMrw.pl +++ b/src/usr/targeting/common/processMrw.pl @@ -929,20 +929,26 @@ sub processProcessor $targetObj->setAttribute($target, "FSI_MASTER_TYPE", "NO_MASTER"); $targetObj->setAttribute($target, "FSI_SLAVE_CASCADE", "0"); $targetObj->setAttribute($target, "PROC_MASTER_TYPE", "ACTING_MASTER"); + $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom", + "1"); + $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom", + "0"); } else { $targetObj->setAttribute($target, "PROC_MASTER_TYPE", "NOT_MASTER"); $targetObj->setAttribute($target, "ALTFSI_MASTER_CHIP", "physical:sys-0"); + $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom", + "0"); + $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom", + "1"); } ## Update bus speeds processI2cSpeeds($targetObj,$target); ## these are hardcoded because code sets them properly $targetObj->setAttributeField($target, "SCOM_SWITCHES", "reserved", "0"); - $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useSbeScom", "1"); - $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useFsiScom", "0"); $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useInbandScom", "0"); $targetObj->setAttributeField($target, "SCOM_SWITCHES", "useXscom", "0");