Skip to content

Commit

Permalink
HTMGT: Clear OCC reset counts after an hour
Browse files Browse the repository at this point in the history
- add HTMGT/OCC data to elogs
- parse HTMGT/OCC data in elogs
- add reset count per OCC since last boot
- remove unused legacy pstate attributes/code

Change-Id: I69f9fe504af13eae86ec423a329a7bc46286f906
RTC: 202016
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/69717
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Sheldon Bailey <baileysh@us.ibm.com>
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Daniel M. Crowell <dcrowell@us.ibm.com>
  • Loading branch information
cjcain authored and dcrowell77 committed Jan 14, 2019
1 parent 9228e4c commit 2d59cd7
Show file tree
Hide file tree
Showing 13 changed files with 707 additions and 282 deletions.
3 changes: 1 addition & 2 deletions src/include/usr/htmgt/htmgt_reasoncodes.H
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -87,7 +87,6 @@ namespace HTMGT
HTMGT_RC_ECMD_DBUF_COPY_FAIL = HTMGT_COMP_ID | 0x53,
HTMGT_RC_TARGET_NOT_FUNCTIONAL = HTMGT_COMP_ID | 0x54,
HTMGT_RC_OCC_MASTER_NOT_FOUND = HTMGT_COMP_ID | 0x55,
HTMGT_RC_OCC_RESET_THREHOLD = HTMGT_COMP_ID | 0x56,
HTMGT_RC_INVALID_OCC_ELOG = HTMGT_COMP_ID | 0x63,
HTMGT_RC_BAD_FRU_CALLOUTS = HTMGT_COMP_ID | 0x7D,
HTMGT_RC_MISMATCHING_SEVERITY = HTMGT_COMP_ID | 0x7F,
Expand Down
256 changes: 187 additions & 69 deletions src/usr/htmgt/htmgt.C

Large diffs are not rendered by default.

150 changes: 57 additions & 93 deletions src/usr/htmgt/htmgt_occ.C
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* Contributors Listed Below - COPYRIGHT 2014,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -417,8 +417,7 @@ namespace HTMGT
:iv_occMaster(nullptr),
iv_state(OCC_STATE_UNKNOWN),
iv_targetState(OCC_STATE_ACTIVE),
iv_sysResetCount(0),
iv_normalPstateTables(true)
iv_sysResetCount(0)
{
}

Expand Down Expand Up @@ -828,6 +827,14 @@ namespace HTMGT

if (nullptr == l_err)
{
// Clear safe mode reason since OCC is at target state
if (cv_safeReturnCode != 0)
{
TMGT_INF("_setOccState: clearing safe mode reason "
"(0x%04X)", cv_safeReturnCode);
cv_safeReturnCode = 0;
cv_safeOccInstance = 0;
}
TMGT_INF("_setOccState: All OCCs have reached state "
"0x%02X", requestedState);
iv_state = requestedState;
Expand Down Expand Up @@ -875,7 +882,8 @@ namespace HTMGT

errlHndl_t OccManager::_resetOccs(TARGETING::Target * i_failedOccTarget,
bool i_skipCountIncrement,
bool i_skipComm)
bool i_skipComm,
enum occResetReason i_reason)
{
errlHndl_t err = nullptr;
bool atThreshold = false;
Expand Down Expand Up @@ -910,6 +918,10 @@ namespace HTMGT

for(const auto & occ : iv_occArray )
{
if (i_reason != OCC_RESET_REASON_NONE)
{
occ->iv_resetReason = i_reason;
}
if(occ->getTarget() == i_failedOccTarget)
{
occ->failed(true);
Expand Down Expand Up @@ -953,8 +965,23 @@ namespace HTMGT
}

uint64_t retryCount = OCC_RESET_COUNT_THRESHOLD;
TARGETING::Target* sys = nullptr;
TARGETING::targetService().getTopLevelTarget(sys);
while(retryCount)
{
if (sys)
{
// Increment cumulative reset count since boot
uint8_t count = sys->getAttr<TARGETING::
ATTR_CUMULATIVE_PMCOMPLEX_RESET_COUNT>();
if (count < 0xFF)
{
++count;
sys->setAttr<TARGETING::
ATTR_CUMULATIVE_PMCOMPLEX_RESET_COUNT>(count);
}
}

// Reset all OCCs
TMGT_INF("_resetOccs: Calling HBPM::resetPMAll");
err = HBPM::resetPMAll();
Expand Down Expand Up @@ -991,7 +1018,7 @@ namespace HTMGT
{
for( const auto & occ : iv_occArray )
{
// After OCC has been reset, clear flag
// After OCC has been reset, clear internal flags
occ->postResetClear();
}

Expand Down Expand Up @@ -1033,11 +1060,12 @@ namespace HTMGT
0, cv_safeReturnCode, 0, cv_safeOccInstance,
ERRORLOG::ERRL_SEV_UNRECOVERABLE);

TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)",
cv_safeReturnCode, cv_safeOccInstance);

// Check if OCC already logged reason for safe mode
// (add proc callout if non-OCC safe mode reason or
// the OCC hit an exception)
TMGT_ERR("_resetOccs: Safe Mode (RC: 0x%04X OCC%d)",
cv_safeReturnCode, cv_safeOccInstance);
if (((cv_safeReturnCode & OCCC_COMP_ID) != OCCC_COMP_ID) ||
((cv_safeReturnCode & 0xE0) == 0xE0))
{
Expand Down Expand Up @@ -1159,7 +1187,7 @@ namespace HTMGT
TARGETING::getParentChip(occ->getTarget() );

// Read SRAM response buffer to check for OCC checkpoint
errlHndl_t l_err = nullptr;
errlHndl_t l_err = nullptr;
const uint16_t l_length = 8; //Note: number of bytes
uint8_t l_sram_data[l_length] = { 0x0 };
l_err = HBOCC::readSRAM(procTarget,
Expand Down Expand Up @@ -1325,44 +1353,29 @@ namespace HTMGT


// Collect HTMGT Status Information for debug
// NOTE: o_data is pointer to 4096 byte buffer
void OccManager::_getOccData(uint16_t & o_length, uint8_t *o_data)
// NOTE: o_data is pointer to OCC_MAX_DATA_LENGTH byte buffer
void OccManager::_getHtmgtData(uint16_t & o_length, uint8_t *o_data)
{
uint16_t index = 0;

// If the system is in safemode then can't talk to OCCs (no build/poll)
TARGETING::Target* sys = nullptr;
TARGETING::targetService().getTopLevelTarget(sys);
uint8_t safeMode = 0;
if (sys &&
sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode) &&
(0 == safeMode))
uint8_t resets_since_boot = 0;
if (sys)
{
// Make sure OCCs were built first (so data is valid)
errlHndl_t err = _buildOccs(); // if not already built.
if (err)
{
TMGT_ERR("_getOccData: failed to build OCC structures "
"rc=0x%04X", err->reasonCode());
ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
}
// Send poll to confirm comm, update states and flush errors
err = _sendOccPoll(true, nullptr);
if (err)
{
TMGT_ERR("_getOccData: Poll OCCs failed.");
ERRORLOG::errlCommit(err, HTMGT_COMP_ID);
}
sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
sys->tryGetAttr<TARGETING::ATTR_CUMULATIVE_PMCOMPLEX_RESET_COUNT>
(resets_since_boot);
}

// First add HTMGT specific data
o_data[index++] = _getNumOccs();
o_data[index++] =
(nullptr!=iv_occMaster)?iv_occMaster->getInstance():0xFF;
o_data[index++] = iv_state;
o_data[index++] = iv_targetState;
o_data[index++] = iv_sysResetCount;
o_data[index++] = iv_normalPstateTables ? 0 : 1;
o_data[index++] = resets_since_boot;
o_data[index++] = 0x00; // STATUS VERSION (for future expansion)
o_data[index++] = safeMode;
UINT32_PUT(&o_data[index], cv_safeReturnCode);
Expand Down Expand Up @@ -1428,23 +1441,6 @@ namespace HTMGT
}



// Select the default pstate table type, then reset all OCCs so that they
// pick up the newly selected tables.
//
// i_normalPstates: passed straight through to _setPstateTable()
// Returns the error handle produced by _resetOccs() (nullptr when no
// error was returned)
errlHndl_t OccManager::_loadPstates(bool i_normalPstates)
{
    // Record which pstate table type is now the default
    _setPstateTable(i_normalPstates);

    // New tables are only picked up across a reset; the second argument
    // (true) asks _resetOccs to skip incrementing the reset count
    TMGT_INF("_loadPstates: Resetting OCCs");
    return _resetOccs(nullptr, true);
}


// Consolidate all OCC states
void OccManager::_syncOccStates()
{
Expand Down Expand Up @@ -1479,18 +1475,12 @@ namespace HTMGT


// Clear all OCC reset counts
// Should not be called if the system is in safe mode.
void OccManager::_clearResetCounts()
{
TARGETING::Target* sys = nullptr;
TARGETING::targetService().getTopLevelTarget(sys);
uint8_t safeMode = 0;
if (sys)
{
sys->tryGetAttr<TARGETING::ATTR_HTMGT_SAFEMODE>(safeMode);
}
for( const auto & occ : iv_occArray )
{
if ( occ->iv_resetCount != 0 )
if (occ->iv_resetCount != 0)
{
TMGT_INF("_clearResetCounts: Clearing OCC%d reset count "
"(was %d)",
Expand All @@ -1499,28 +1489,15 @@ namespace HTMGT
occ->iv_resetCount = 0;
}

if( occ->iv_wofResetCount != 0 )
if (occ->iv_wofResetCount != 0)
{
occ->iv_wofResetCount = 0;
TMGT_INF("_clearResetCounts: Clearing OCC%d WOF reset count "
"( was %d)",
occ->getInstance(),
occ->iv_wofResetCount);
}

if( occ->iv_wofResetReasons != 0 )
{
TMGT_INF("_clearResetCounts: Clearing OCC%d WOF reset reasons "
"( was 0x%08x)",
"(was %d) reason(s): 0x%08X",
occ->getInstance(),
occ->iv_wofResetReasons );
}
if( safeMode )
{
// Clear OCC flags (failed, commEstablished, etc)
occ->postResetClear();
occ->iv_wofResetCount,
occ->iv_wofResetReasons);
occ->iv_wofResetCount = 0;
}

}

if (iv_sysResetCount != 0)
Expand Down Expand Up @@ -1571,12 +1548,14 @@ namespace HTMGT

errlHndl_t OccManager::resetOccs(TARGETING::Target * i_failedOccTarget,
bool i_skipCountIncrement,
bool i_skipComm)
bool i_skipComm,
enum occResetReason i_reason)
{
return
Singleton<OccManager>::instance()._resetOccs(i_failedOccTarget,
i_skipCountIncrement,
i_skipComm);
i_skipComm,
i_reason);
}


Expand Down Expand Up @@ -1614,9 +1593,9 @@ namespace HTMGT
return Singleton<OccManager>::instance()._occFailed();
}

void OccManager::getOccData(uint16_t & o_length, uint8_t *o_data)
void OccManager::getHtmgtData(uint16_t & o_length, uint8_t *o_data)
{
Singleton<OccManager>::instance()._getOccData(o_length, o_data);
Singleton<OccManager>::instance()._getHtmgtData(o_length, o_data);
}

void OccManager::getWOFResetReasons(uint16_t & o_length, uint8_t * o_data)
Expand All @@ -1625,21 +1604,6 @@ namespace HTMGT
o_data);
}

// Forward the request to the OccManager singleton's _loadPstates() and
// hand its error handle back to the caller unchanged.
errlHndl_t OccManager::loadPstates(bool i_normalPstates)
{
    auto & l_manager = Singleton<OccManager>::instance();
    return l_manager._loadPstates(i_normalPstates);
}

// Report the result of _isNormalPstate() on the OccManager singleton.
bool OccManager::isNormalPstate()
{
    auto & l_manager = Singleton<OccManager>::instance();
    return l_manager._isNormalPstate();
}

// Forward the pstate table selection to the OccManager singleton's
// _setPstateTable().
void OccManager::setPstateTable(bool i_useNormal)
{
    auto & l_manager = Singleton<OccManager>::instance();
    l_manager._setPstateTable(i_useNormal);
}

void OccManager::clearResetCounts()
{
Singleton<OccManager>::instance()._clearResetCounts();
Expand Down

0 comments on commit 2d59cd7

Please sign in to comment.