Skip to content

Commit

Permalink
HTMGT: Change OCC logs to info while recovery is still being attempted
Browse files Browse the repository at this point in the history
Change-Id: I0a46cacbc7e473dedd38ce9656ab25f5452c77c1
CQ: SW456777
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/72062
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Reviewed-by: Sheldon Bailey <baileysh@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
cjcain authored and dcrowell77 committed Feb 19, 2019
1 parent 4c0019e commit b616250
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 63 deletions.
22 changes: 14 additions & 8 deletions src/usr/htmgt/htmgt_occ.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -346,13 +346,19 @@ namespace HTMGT
/**
* @brief Determine what actions are required for elog
*
* @param[in] i_actions Action requested by OCC
* @param[out] o_occReset returns true if OCC reset is needed
* @param[out] o_errlSeverity severity to use for elog commit
*/
void elogProcessActions(const uint8_t i_actions,
bool & o_occReset,
ERRORLOG::errlSeverity_t & o_errlSeverity);
* @param[in] i_actions Action flags requested by OCC
* @param[in] i_src SRC being reported by OCC
* @param[in] i_data Additional data used when
* processing actions
* @param[in,out] io_errlSeverity Severity to use for elog
* @param[out] o_call_home True if info error should be
* reported to BMC
*/
void elogProcessActions(const uint8_t i_actions,
const uint32_t i_src,
const uint32_t i_data,
ERRORLOG::errlSeverity_t & io_errlSeverity,
bool & o_call_home);


/**
Expand Down
148 changes: 93 additions & 55 deletions src/usr/htmgt/occError.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -109,7 +109,6 @@ namespace HTMGT

TMGT_BIN("OCC ELOG", l_occElog, 256);


// Get user details section
const occErrlUsrDtls_t *l_usrDtls_ptr = (occErrlUsrDtls_t *)
((uint8_t*)l_occElog + sizeof(occErrlEntry_t));
Expand All @@ -118,6 +117,13 @@ namespace HTMGT
ERRORLOG::errlSeverity_t severity =
ERRORLOG::ERRL_SEV_INFORMATIONAL;

if (l_occSrc == 0x2A01)
{
// 2A01 is Periodic OCC Telemetry / Call Home data
TMGT_INF("OCC is reporting Periodic Telemetry Data (0x2A01)"
" - NOT AN ERROR");
}

// Translate Severity
const uint8_t l_occSeverity = l_occElog->severity;
if (l_occSeverity < OCC_SEV_ACTION_XLATE_SIZE)
Expand All @@ -132,42 +138,12 @@ namespace HTMGT
}

// Process Actions
bool l_occReset = false;
elogProcessActions(l_occElog->actions, l_occReset, severity);



// Need to add WOF reason code to OCC object regardless of
// whether WOF resets are disabled.
if( l_occElog->actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED )
{
iv_wofResetReasons |= l_usrDtls_ptr->userData1;
TMGT_ERR("WOF Reset Reasons for OCC%d = 0x%08x",
iv_instance,
iv_wofResetReasons);

}

// Check if we need a WOF requested reset
if(iv_needsWofReset == true)
{
TMGT_ERR("WOF Reset detected! SRC = 0x%X",
l_occSrc);

// We compare against one less than the threshold because
// the WOF reset count doesn't get incremented until resetPrep
if( iv_wofResetCount < (WOF_RESET_COUNT_THRESHOLD-1) )
{
// Not at WOF reset threshold yet. Set sev to INFO
severity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
}
}

if (l_occReset == true)
{
iv_needsReset = true;
OccManager::updateSafeModeReason(l_occSrc, iv_instance);
}
bool l_call_home_event = false;
elogProcessActions(l_occElog->actions,
l_occSrc,
l_usrDtls_ptr->userData1,
severity,
l_call_home_event);

// Create OCC error log
// NOTE: word 4 (used by extended reason code) to save off OCC
Expand All @@ -185,6 +161,13 @@ namespace HTMGT
l_occElog->extendedRC, // extended reason code
severity);

if (l_call_home_event)
{
// Force info log to the BMC.
// No HW Callouts (SELs) will be created for this error
l_errlHndl->setEselCallhomeInfoEvent(true);
}

// Add callout information
const uint8_t l_max_callouts = l_occElog->maxCallouts;
bool l_bad_fru_data = false;
Expand Down Expand Up @@ -295,16 +278,11 @@ namespace HTMGT
"HALT_ON_SRC is set. Resets will be disabled",
iv_instance, l_occSrc);
set_int_flags(get_int_flags() | FLAG_RESET_DISABLED);
// Force unrecoverable elog
l_errlHndl->setSev(ERRORLOG::ERRL_SEV_UNRECOVERABLE);
}
}

// Process force error log to be sent to BMC.
if( (l_occElog->actions & TMGT_ERRL_ACTIONS_FORCE_ERROR_POSTED ) ||
(l_occSrc == (OCCC_COMP_ID | 0x01 ) ) ) //GEN_CALLHOME_LOG
{
l_errlHndl->setEselCallhomeInfoEvent(true);
}

#ifdef CONFIG_CONSOLE_OUTPUT_OCC_COMM
char header[64];
sprintf(header, "OCC%d ELOG: (0x%04X bytes)", iv_instance, i_length);
Expand Down Expand Up @@ -426,57 +404,117 @@ namespace HTMGT

} // end Occ::elogAddCallout()

void Occ::elogProcessActions(const uint8_t i_actions,
bool & o_occReset,
ERRORLOG::errlSeverity_t & o_errlSeverity)

void Occ::elogProcessActions(const uint8_t i_actions,
const uint32_t i_src,
uint32_t i_data,
ERRORLOG::errlSeverity_t & io_errlSeverity,
bool & o_call_home)
{
bool l_occReset = false;
o_call_home = false;

if (i_actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED)
{
iv_failed = false;
iv_resetReason = OCC_RESET_REASON_WOF_REQUEST;
// Check if WOF resets are disabled
if(int_flags_set(FLAG_WOF_RESET_DISABLED) == true)
{
o_occReset = false;
iv_needsWofReset = false;
TMGT_INF("elogProcessActions: OCC%d requested a WOF reset "
"but WOF resets are DISABLED",
iv_instance);
}
else // WOF resets are enabled
{
o_occReset = true;
l_occReset = true;
iv_needsWofReset = true;
TMGT_INF("elogProcessActions: OCC%d requested a WOF reset",
TMGT_ERR("elogProcessActions: OCC%d requested a WOF reset",
iv_instance);

// We compare against one less than the threshold because the
// WOF reset count doesn't get incremented until the resetPrep
if( iv_wofResetCount < (WOF_RESET_COUNT_THRESHOLD-1) )
{
// Not at WOF reset threshold yet. Set sev to INFO
io_errlSeverity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
}
}

// Need to add WOF reason code to OCC object regardless of
// whether WOF resets are disabled.
iv_wofResetReasons |= i_data;
TMGT_ERR("elogProcessActions: WOF Reset Reasons for OCC%d = 0x%08x",
iv_instance, iv_wofResetReasons);
}
else
{
if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED)
{
o_occReset = true;
l_occReset = true;
iv_failed = true;
iv_resetReason = OCC_RESET_REASON_OCC_REQUEST;

TMGT_INF("elogProcessActions: OCC%d requested reset",
iv_instance);
iv_instance);

// If reset will force safe mode, then make error unrecoverable
if (OCC_RESET_COUNT_THRESHOLD == iv_resetCount)
{
if (io_errlSeverity != ERRORLOG::ERRL_SEV_UNRECOVERABLE)
{
// update severity to UNRECOVERABLE
TMGT_ERR("elogProcessActions: changing severity to "
"UNRECOVERABLE (was sev=0x%02X)",
io_errlSeverity);
io_errlSeverity = ERRORLOG::ERRL_SEV_UNRECOVERABLE;
}
}
else if (io_errlSeverity != ERRORLOG::ERRL_SEV_INFORMATIONAL)
{
// update severity to INFO
TMGT_INF("elogProcessActions: changing severity to "
"INFORMATIONAL (was sev=0x%02X)",
io_errlSeverity);
io_errlSeverity = ERRORLOG::ERRL_SEV_INFORMATIONAL;
// log will be sent to BMC with NO SEL (hardware callouts)
o_call_home = true;
}
}

if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED)
{
o_occReset = true;
l_occReset = true;
iv_failed = true;
iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE;
iv_resetCount = OCC_RESET_COUNT_THRESHOLD;

TMGT_INF("elogProcessActions: OCC%d requested safe mode",
iv_instance);
TMGT_CONSOLE("OCC%d requested system enter safe mode",
iv_instance);
iv_instance);
}
}

// Check if error needs to be forced to the BMC:
// 1. 2A01 = OCC call home/telemetry data, OR
// 2. OCC requested force, but error was changed to info by HTMGT
// (log will be sent to the BMC with NO SEL (hardware callouts))
if ( (i_src == (OCCC_COMP_ID | 0x01 )) || // GEN_CALLHOME_LOG
( (i_actions & TMGT_ERRL_ACTIONS_FORCE_ERROR_POSTED) &&
(io_errlSeverity == ERRORLOG::ERRL_SEV_INFORMATIONAL) ) )
{
o_call_home = true;
}

// If reset required, save the SRC in case it leads to safe mode
if (l_occReset == true)
{
iv_needsReset = true;
OccManager::updateSafeModeReason(i_src, iv_instance);
}

} // end Occ::elogProcessActions()

} // end namespace

0 comments on commit b616250

Please sign in to comment.