Skip to content

Commit

Permalink
Force reboot without visible errors for core wakeup failure
Browse files Browse the repository at this point in the history
The intermittent core wakeup failure continues to plague us, with
no solution in sight. Since the error is extremely rare (less
than 1% of boots), we have decided to force a manual reboot and
not log any visible errors to the customer.

Change-Id: Ic30f6330431bd2c8ce75075befc2c36d278d8152
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/71319
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
dcrowell77 committed Feb 7, 2019
1 parent 8a977a1 commit 39cba20
Showing 1 changed file with 34 additions and 9 deletions.
43 changes: 34 additions & 9 deletions src/usr/isteps/istep16/call_host_activate_slave_cores.C
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
#include <scom/scomif.H>
#include <errl/errludprintk.H>
#include <intr/intr_reasoncodes.H>
#include <initservice/istepdispatcherif.H>

using namespace ERRORLOG;
using namespace TARGETING;
Expand Down Expand Up @@ -84,6 +85,9 @@ void* call_host_activate_slave_cores (void *io_pArgs)
assert( sys != NULL );
uint32_t l_numCores = 0;

// keep track of which cores started
TargetHandleList l_startedCores;

for(TargetHandleList::const_iterator
l_core = l_cores.begin();
l_core != l_cores.end();
Expand Down Expand Up @@ -233,7 +237,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Add interesting ISTEP traces
l_errl->collectTrace(ISTEP_COMP_NAME,256);

l_stepError.addErrorDetails( l_errl );
// Choosing to ignore this intermittent error
l_errl->setSev(ERRORLOG::ERRL_SEV_INFORMATIONAL);
errlCommit( l_errl, HWPF_COMP_ID );

// Remember that we failed so we can gard the core if it
Expand All @@ -242,6 +247,14 @@ void* call_host_activate_slave_cores (void *io_pArgs)
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);

#ifdef CONFIG_BMC_IPMI
// Initiate a graceful power cycle
CONSOLE::displayf(ISTEP_COMP_NAME, "System Rebooting To Retry Recoverable Error");
CONSOLE::flush();
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,"call_host_activate_slave_cores: requesting power cycle");
INITSERVICE::requestReboot();
#endif

break;
}
// Create unrecoverable error log if this is a repeat
Expand Down Expand Up @@ -306,17 +319,29 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Zero out the counter if we passed
else if( l_prevFail > 0 )
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Resetting failure count for core %.8X",
TARGETING::get_huid(*l_core) );
l_prevFail = 0;
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
// Add to the list of passing cores so we can
// clear ATTR_PREVIOUS_WAKEUP_FAIL later
l_startedCores.push_back(*l_core);
}
}
}
// @@@@@ END CUSTOM BLOCK: @@@@@

// Clear out the wakeup_fail indicators only after every core has passed.
// Doing this outside the loop helps mitigate the (unlikely) case where
// a failure bounces between different cores on several consecutive boots.
for(TargetHandleList::const_iterator
l_core = l_startedCores.begin();
l_core != l_startedCores.end();
++l_core)
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Resetting failure count for core %.8X",
TARGETING::get_huid(*l_core) );
ATTR_PREVIOUS_WAKEUP_FAIL_type l_prevFail = 0;
(*l_core)->
setAttr<TARGETING::ATTR_PREVIOUS_WAKEUP_FAIL>(l_prevFail);
}

#if defined(CONFIG_IPLTIME_CHECKSTOP_ANALYSIS) && !defined(__HOSTBOOT_RUNTIME)
if( l_stepError.isNull() )
Expand Down

0 comments on commit 39cba20

Please sign in to comment.