Skip to content

Commit

Permalink
FFDC enhancements for core activation failures
Browse files Browse the repository at this point in the history
Add more traces to the error log captured for core
activation failures.

Change-Id: I30c6985060fcffcb3382b775a52e59c08d2b51b7
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/57907
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Matt Derksen <mderkse1@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
dcrowell77 committed May 19, 2018
1 parent 798ff0e commit a4e02fc
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 6 deletions.
3 changes: 3 additions & 0 deletions src/include/usr/intr/intr_reasoncodes.H
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@

#include <hbotcompid.H>

#define INTR_TRACE_NAME INTR_COMP_NAME


namespace INTR
{
enum IntrModuleID
Expand Down
4 changes: 1 addition & 3 deletions src/usr/intr/intrrp.C
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@
#include <p9n2_misc_scom_addresses_fld.H>
#include <util/utilmbox_scratch.H>

#define INTR_TRACE_NAME INTR_COMP_NAME

using namespace INTR;
using namespace TARGETING;

Expand Down Expand Up @@ -3224,7 +3222,7 @@ void* INTR::IntrRp::handleCpuTimeout(void* _pir)
msg->data[0] = pir;
msg_q_t intr_msgQ = msg_q_resolve(VFS_ROOT_MSG_INTR);

TRACFCOMP( g_trac_intr,"handleCpuTimeout for pir: %lx", pir);
TRACFCOMP( g_trac_intr,"handleCpuTimeout for pir: 0x%lx", pir);

do
{
Expand Down
20 changes: 17 additions & 3 deletions src/usr/isteps/istep16/call_host_activate_slave_cores.C
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
#endif

#include <scom/scomif.H>
#include <errl/errludprintk.H>
#include <intr/intr_reasoncodes.H>

using namespace ERRORLOG;
using namespace TARGETING;
Expand Down Expand Up @@ -121,11 +123,12 @@ void* call_host_activate_slave_cores (void *io_pArgs)
int rc = cpu_start_core(pir, en_threads);

// Handle time out error
uint32_t l_checkidle_eid = 0;
if (-ETIME == rc)
{
TRACFCOMP( ISTEPS_TRACE::g_trac_isteps_trace,
"call_host_activate_slave_cores: "
"Time out rc from kernel %d on core %x",
"Time out rc from kernel %d on core 0x%x",
rc,
pir);

Expand Down Expand Up @@ -157,6 +160,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
// Create IStep error log
l_stepError.addErrorDetails(l_timeout_errl);

l_checkidle_eid = l_timeout_errl->eid();

// Commit error
errlCommit( l_timeout_errl, HWPF_COMP_ID );
}
Expand All @@ -176,7 +181,8 @@ void* call_host_activate_slave_cores (void *io_pArgs)
* @severity ERRORLOG::ERRL_SEV_UNRECOVERABLE
* @moduleid MOD_HOST_ACTIVATE_SLAVE_CORES
* @userdata1 PIR of failing core.
* @userdata2 rc of cpu_start_core().
* @userdata2[00:31] EID from p9_check_idle_stop_done().
* @userdata2[32:63] rc of cpu_start_core().
*
* @devdesc Kernel returned error when trying to activate
* core.
Expand All @@ -186,14 +192,22 @@ void* call_host_activate_slave_cores (void *io_pArgs)
MOD_HOST_ACTIVATE_SLAVE_CORES,
RC_BAD_RC,
pir,
rc );
TWO_UINT32_TO_UINT64(
l_checkidle_eid,
rc) );

// Callout core that failed to wake up.
l_errl->addHwCallout(*l_core,
HWAS::SRCI_PRIORITY_MED,
HWAS::DECONFIG,
HWAS::GARD_Predictive);

// Could be an interrupt issue
l_errl->collectTrace(INTR_TRACE_NAME,256);

// Throw printk in there too in case it is a kernel issue
ERRORLOG::ErrlUserDetailsPrintk().addToLog(l_errl);

l_stepError.addErrorDetails( l_errl );
errlCommit( l_errl, HWPF_COMP_ID );
break;
Expand Down

0 comments on commit a4e02fc

Please sign in to comment.