From c17f8e44c48c17c7ea9f049011e3151eede8d9aa Mon Sep 17 00:00:00 2001 From: Andres Lugo-Reyes Date: Mon, 28 Aug 2017 09:50:55 -0500 Subject: [PATCH] Support for detecting a WOF requested reset Change-Id: I385b20538230b152828075f695e8352f969d5cf2 RTC:174543 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45986 Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Reviewed-by: Martha Broyles Tested-by: Jenkins OP HW Tested-by: FSP CI Jenkins Reviewed-by: Prachi Gupta Reviewed-by: Christopher J. Cain Reviewed-by: Daniel M. Crowell --- src/usr/htmgt/htmgt.C | 6 ++- src/usr/htmgt/htmgt_cfgdata.C | 50 ++++++++++++++++++----- src/usr/htmgt/htmgt_cfgdata.H | 12 +++++- src/usr/htmgt/htmgt_occ.C | 53 ++++++++++++++++++------ src/usr/htmgt/htmgt_occ.H | 36 ++++++++++++++--- src/usr/htmgt/htmgt_poll.C | 22 +++++----- src/usr/htmgt/occError.C | 67 ++++++++++++++++++++----------- src/usr/htmgt/occError.H | 1 + src/usr/htmgt/test/htmgtcfgtest.H | 3 +- 9 files changed, 185 insertions(+), 65 deletions(-) diff --git a/src/usr/htmgt/htmgt.C b/src/usr/htmgt/htmgt.C index d7675bb56aa..cf2190d07a7 100644 --- a/src/usr/htmgt/htmgt.C +++ b/src/usr/htmgt/htmgt.C @@ -107,8 +107,8 @@ namespace HTMGT if (OccManager::occNeedsReset()) { // No need to continue if reset is required - TMGT_ERR("sendOccConfigData(): OCCs need " - "to be reset"); + TMGT_ERR("processOccStartStatus(): " + "OCCs need to be reset"); break; } else @@ -305,6 +305,8 @@ namespace HTMGT errlHndl_t err = OccManager::resetOccs(nullptr); if(err) { + TMGT_ERR("processOccError(): Error when attempting" + " to reset OCCs"); ERRORLOG::errlCommit(err, HTMGT_COMP_ID); } } diff --git a/src/usr/htmgt/htmgt_cfgdata.C b/src/usr/htmgt/htmgt_cfgdata.C index de636d153f7..85b8029f301 100644 --- a/src/usr/htmgt/htmgt_cfgdata.C +++ b/src/usr/htmgt/htmgt_cfgdata.C @@ -130,7 +130,8 @@ namespace HTMGT { case OCC_CFGDATA_FREQ_POINT: getFrequencyPointMessageData(cmdData, - cmdDataLen); + cmdDataLen, + occ->wofResetCount()); break; case 
OCC_CFGDATA_OCC_ROLE: @@ -1153,7 +1154,8 @@ void getGPUConfigMessageData(const TargetHandle_t i_occ, void getFrequencyPointMessageData(uint8_t* o_data, - uint64_t & o_size) + uint64_t & o_size, + uint8_t i_wofResetCount ) { uint64_t index = 0; uint16_t min = 0; @@ -1177,6 +1179,7 @@ void getFrequencyPointMessageData(uint8_t* o_data, uint8_t turboAllowed = sys->getAttr(); + if (turboAllowed) { turbo = sys->getAttr(); @@ -1185,31 +1188,61 @@ void getFrequencyPointMessageData(uint8_t* o_data, ATTR_SYSTEM_WOF_DISABLE_type wofSupported; if (!sys->tryGetAttr(wofSupported)) { + ultra = WOF_SYSTEM_DISABLED; G_wofSupported = false; } else { - if( wofSupported != SYSTEM_WOF_DISABLE_ON ) + uint16_t tempUt = sys->getAttr(); + if( wofSupported == SYSTEM_WOF_DISABLE_ON ) { - ultra = sys->getAttr(); + TMGT_INF("System does not support WOF"); + G_wofSupported = false; + ultra = WOF_SYSTEM_DISABLED; } - else + else if( tempUt == 0 ) + { + TMGT_INF("Missing Ultra Turbo VPD point. WOF disabled."); + G_wofSupported = false; + ultra = WOF_MISSING_ULTRA_TURBO; + } + else if( i_wofResetCount >= WOF_RESET_COUNT_THRESHOLD ) + { + TMGT_INF("WOF reset count reached. WOF disabled."); + G_wofSupported = false; + ultra = WOF_RESET_COUNT_REACHED; + } + else if( turbo <= nominal ) { + TMGT_INF("Turbo is less than nominal. WOF disabled."); G_wofSupported = false; + ultra = WOF_UNSUPPORTED_FREQ; + } + else if( tempUt <= turbo ) + { + TMGT_INF("Ultra Turbo is less than Turbo. WOF disabled."); + G_wofSupported = false; + ultra = WOF_UNSUPPORTED_FREQ; + } + else + { + ultra = tempUt; } + } if( !G_wofSupported ) { - TMGT_INF("getFrequencyPoint: WOF not enabled"); + TMGT_INF("getFrequencyPoint: WOF not enabled! 
RC = %x", ultra); } } else { // If turbo not supported, send nominal for turbo - // and 0 for ultra-turbo (no WOF support) + // and reason code for ultra-turbo (no WOF support) TMGT_INF("getFrequencyPoint: Turbo/WOF not supported"); turbo = nominal; + ultra = WOF_UNSUPPORTED_FREQ; G_wofSupported = false; } @@ -1342,5 +1375,4 @@ void getApssMessageData(uint8_t* o_data, } - -} +}// namespace HTMGT diff --git a/src/usr/htmgt/htmgt_cfgdata.H b/src/usr/htmgt/htmgt_cfgdata.H index 9d632889da7..82eb05a35d9 100644 --- a/src/usr/htmgt/htmgt_cfgdata.H +++ b/src/usr/htmgt/htmgt_cfgdata.H @@ -80,6 +80,13 @@ namespace HTMGT CFDATA_DVFS_NOT_DEFINED = 0xFF, }; + enum // WOF disabled reasons + { + WOF_MISSING_ULTRA_TURBO = 0x0000, + WOF_SYSTEM_DISABLED = 0x0001, + WOF_RESET_COUNT_REACHED = 0x0002, + WOF_UNSUPPORTED_FREQ = 0x0003, + }; enum cfgTargets { @@ -249,10 +256,13 @@ namespace HTMGT * * @param[out] o_data - preallocated buffer to fill in * @param[out] o_size - set to the message size + * @param[in] i_wofResetCount - Number of times OCC requested a reset + * due to WOF * @pre o_data is large enough. */ void getFrequencyPointMessageData(uint8_t* o_data, - uint64_t & o_size); + uint64_t & o_size, + uint8_t i_wofResetCount ); /** * Generate the APSS configuration message diff --git a/src/usr/htmgt/htmgt_occ.C b/src/usr/htmgt/htmgt_occ.C index 4559b65a0e5..760799e50b9 100644 --- a/src/usr/htmgt/htmgt_occ.C +++ b/src/usr/htmgt/htmgt_occ.C @@ -57,6 +57,8 @@ namespace HTMGT iv_state(OCC_STATE_UNKNOWN), iv_commEstablished(false), iv_needsReset(false), + iv_needsWofReset(false), + iv_wofResetCount(0), iv_failed(false), iv_seqNumber(0), iv_homer(i_homer), @@ -202,9 +204,15 @@ namespace HTMGT atThreshold = true; } } + else if( iv_needsWofReset ) //If WOF reset, increment count + { + iv_wofResetCount++; + TMGT_INF("resetPrep(): WOF reset requested. 
Reset Count = %d", + iv_wofResetCount ); + } else { - cmdData[1] = OCC_RESET_FAIL_OTHER_OCC; + cmdData[1] = OCC_RESET_FAIL_THIS_OCC; } if (iv_commEstablished) @@ -239,6 +247,7 @@ namespace HTMGT iv_state = OCC_STATE_UNKNOWN; iv_commEstablished = false; iv_needsReset = false; + iv_needsWofReset = false; iv_failed = false; iv_lastPollValid = false; iv_resetReason = OCC_RESET_REASON_NONE; @@ -401,7 +410,7 @@ namespace HTMGT :iv_occMaster(nullptr), iv_state(OCC_STATE_UNKNOWN), iv_targetState(OCC_STATE_ACTIVE), - iv_resetCount(0), + iv_sysResetCount(0), iv_normalPstateTables(true) { } @@ -903,17 +912,23 @@ namespace HTMGT atThreshold = true; } } + // If we need a WOF reset, skip system count increment + if( occ->needsWofReset() ) + { + i_skipCountIncrement = true; + } + } if ((false == i_skipCountIncrement) && (false == _occFailed())) { // No OCC has been marked failed, increment sys reset count - ++iv_resetCount; + ++iv_sysResetCount; TMGT_INF("_resetOCCs: Incrementing system OCC reset count" - " to %d", iv_resetCount); + " to %d", iv_sysResetCount); - if(iv_resetCount > OCC_RESET_COUNT_THRESHOLD) + if(iv_sysResetCount > OCC_RESET_COUNT_THRESHOLD) { atThreshold = true; } @@ -1225,7 +1240,7 @@ namespace HTMGT for( const auto & occ : iv_occArray ) { - if (occ->needsReset()) + if (occ->needsReset() || occ->needsWofReset()) { needsReset = true; break; @@ -1235,7 +1250,6 @@ namespace HTMGT return needsReset; } - // Return true if any OCC has been marked as failed bool OccManager::_occFailed() { @@ -1290,7 +1304,7 @@ namespace HTMGT o_data[index++] = (nullptr!=iv_occMaster)?iv_occMaster->getInstance():0xFF; o_data[index++] = iv_state; o_data[index++] = iv_targetState; - o_data[index++] = iv_resetCount; + o_data[index++] = iv_sysResetCount; o_data[index++] = iv_normalPstateTables ? 
0 : 1; index += 1; // reserved for expansion o_data[index++] = safeMode; @@ -1393,7 +1407,8 @@ namespace HTMGT { TMGT_INF("_clearResetCounts: Clearing OCC%d reset count " "(was %d)", - occ->getInstance(), occ->iv_resetCount); + occ->getInstance(), + occ->iv_resetCount); occ->iv_resetCount = 0; if (safeMode) { @@ -1401,13 +1416,27 @@ namespace HTMGT occ->postResetClear(); } } + + if(occ->iv_wofResetCount != 0) + { + occ->iv_wofResetCount = 0; + TMGT_INF("_clearResetCounts: Clearing OCC%d WOF reset count " + "( was %d)", + occ->getInstance(), + occ->iv_wofResetCount); + if(safeMode) + { + // Clear OCC flags + occ->postResetClear(); + } + } } - if (iv_resetCount != 0) + if (iv_sysResetCount != 0) { TMGT_INF("_clearResetCounts: Clearing system reset count " - "(was %d)", iv_resetCount); - iv_resetCount = 0; + "(was %d)", iv_sysResetCount); + iv_sysResetCount = 0; } } diff --git a/src/usr/htmgt/htmgt_occ.H b/src/usr/htmgt/htmgt_occ.H index 80df52ab76d..fb439672d72 100644 --- a/src/usr/htmgt/htmgt_occ.H +++ b/src/usr/htmgt/htmgt_occ.H @@ -72,6 +72,7 @@ namespace HTMGT enum { OCC_RESET_COUNT_THRESHOLD = 3, + WOF_RESET_COUNT_THRESHOLD = 3, }; enum occResetReason @@ -83,6 +84,7 @@ namespace HTMGT OCC_RESET_REASON_POWER_FAULT = 0x04, OCC_RESET_REASON_DIFF_OCC = 0x05, OCC_RESET_REASON_OCC_REQUEST = 0x06, + OCC_RESET_REASON_WOF_REQUEST = 0x07, }; // OCC Callout Structure @@ -215,12 +217,14 @@ namespace HTMGT */ occStateId getState() { return iv_state; }; + /** * @brief Prepare this OCC for reset * @return return true if at threshold otherwise false */ bool resetPrep(); + /** * @brief Set IPMI OCC sensor state * @param i_activate: true - set active @@ -255,6 +259,22 @@ namespace HTMGT bool needsReset() { return iv_needsReset; } + /** + * @brief Determine if OCC needs to be reset due to WOF + * + * @return true if this OCC needs to be reset + */ + bool needsWofReset() { return iv_needsWofReset; } + + + /** + * @brief Returns the number of times a WOF reset has occurred + * + * 
@return Number of WOF resets for this OCC + */ + uint8_t wofResetCount() { return iv_wofResetCount; } + + /** * @brief Return OCCs present bits * @@ -282,6 +302,7 @@ namespace HTMGT */ void collectCheckpointScomData(errlHndl_t i_err); + /** * @brief Add OCC trace buffers to given error log (ERR, IMP, INF) * @@ -290,8 +311,6 @@ namespace HTMGT void addOccTrace( errlHndl_t & io_errl ); - - private: // functions /** @@ -303,6 +322,7 @@ namespace HTMGT void pollRspHandler(const uint8_t * i_pollResponse, const uint16_t i_pollResponseSize); + /** * @brief Collect, Commit and Clear error log from the OCC * @@ -314,6 +334,7 @@ namespace HTMGT const uint32_t i_address, const uint16_t i_length); + /** * @brief Determine what actions are required for elog * @@ -325,6 +346,7 @@ namespace HTMGT bool & o_occReset, ERRORLOG::errlSeverity_t & o_errlSeverity); + /** * @brief Add specified callout to the error log * @@ -339,11 +361,13 @@ namespace HTMGT const occErrlCallout_t i_callout, uint8_t & io_callout_num); + /** * @brief Update the GPU presence sensors in the system */ void updateGpuPresence(); + protected: // Instance number of this OCC: 0 = first physical OCC uint8_t iv_instance; @@ -357,6 +381,10 @@ namespace HTMGT bool iv_commEstablished; // true if OCC needs to be reset bool iv_needsReset; + // true if OCC needs to be reset due to WOF + bool iv_needsWofReset; + // WOF reset count + uint8_t iv_wofResetCount; // true if OCC failed bool iv_failed; // Sequence number of last/current OCC command @@ -390,7 +418,6 @@ namespace HTMGT uint8_t iv_resetCount; // Version of data stored (0 = not written) uint8_t iv_version; - }; @@ -565,7 +592,6 @@ namespace HTMGT */ static bool occNeedsReset(); - /** * @brief Collect FFDC debug data for HTMGT and OCCs * @@ -645,7 +671,7 @@ namespace HTMGT occList_t iv_occArray; occStateId iv_state; occStateId iv_targetState; - uint8_t iv_resetCount; + uint8_t iv_sysResetCount; bool iv_normalPstateTables; diff --git a/src/usr/htmgt/htmgt_poll.C 
b/src/usr/htmgt/htmgt_poll.C index 3316f94254f..af863b1e53a 100644 --- a/src/usr/htmgt/htmgt_poll.C +++ b/src/usr/htmgt/htmgt_poll.C @@ -53,7 +53,7 @@ namespace HTMGT for( const auto & l_occ : iv_occArray ) { - if(NULL == i_occTarget || l_occ->iv_target == i_occTarget) + if(nullptr == i_occTarget || l_occ->iv_target == i_occTarget) { if ((l_occ->iv_commEstablished) || (onlyIfEstablished == false)) @@ -96,13 +96,13 @@ namespace HTMGT errlHndl_t Occ::pollForErrors(const bool i_flushAllErrors) { - errlHndl_t err = NULL; - uint8_t * poll_rsp = NULL; + errlHndl_t err = nullptr; + uint8_t * poll_rsp = nullptr; // Only send poll if OCC has not logged an exception if (0 == iv_exceptionLogged) { - TMGT_INF("sendOccPoll: Polling OCC%d", iv_instance); + TMGT_INF("pollForErrors: Polling OCC%d", iv_instance); bool continuePolling = false; size_t elogCount = 10; @@ -117,10 +117,10 @@ namespace HTMGT l_cmdData); err = cmd.sendOccCmd(); - if (err != NULL) + if (err != nullptr) { // Poll failed - TMGT_ERR("sendOccPoll: OCC%d poll failed with rc=0x%04X", + TMGT_ERR("pollForErrors: OCC%d poll failed with rc=0x%04X", iv_instance, err->reasonCode()); @@ -147,7 +147,7 @@ namespace HTMGT { // Limit number of elogs retrieved so // we do not get stuck in loop - TMGT_INF("sendOccPoll: OCC%d still has" + TMGT_INF("pollForErrors: OCC%d still has" "more errors to report.", iv_instance); continuePolling = false; @@ -162,7 +162,7 @@ namespace HTMGT } else { - TMGT_ERR("sendOccPoll: OCC%d poll command response " + TMGT_ERR("pollForErrors: OCC%d poll command response " "failed with invalid data length %d", iv_instance, poll_rsp_size); /*@ @@ -268,8 +268,8 @@ namespace HTMGT if (iv_state != pollRsp->state) { iv_state = (occStateId)pollRsp->state; - TMGT_INF("pollRspHandler: updating OCC%d state" - " to %s", + TMGT_INF("pollRspHandler: Need reset. 
" + "updating OCC%d state to %s", iv_instance, state_string(iv_state)); } break; @@ -281,7 +281,7 @@ namespace HTMGT (OCC_STATE_OBSERVATION == pollRsp->state) || (OCC_STATE_CHARACTERIZATION == pollRsp->state)) { - errlHndl_t l_err = NULL; + errlHndl_t l_err = nullptr; // Check role status if (((OCC_ROLE_SLAVE == iv_role) && diff --git a/src/usr/htmgt/occError.C b/src/usr/htmgt/occError.C index 70abd1c71db..47b0eb39536 100644 --- a/src/usr/htmgt/occError.C +++ b/src/usr/htmgt/occError.C @@ -89,7 +89,7 @@ namespace HTMGT const uint32_t i_address, const uint16_t i_length) { - errlHndl_t l_errlHndl = NULL; + errlHndl_t l_errlHndl = nullptr; // Read data from SRAM (length must be multiple of 8 bytes) const uint16_t l_length = (i_length) & 0xFFF8; @@ -101,7 +101,7 @@ namespace HTMGT reinterpret_cast(l_buffer.pointer()), l_length ); #endif - if (NULL == l_errlHndl) + if (nullptr == l_errlHndl) { const occErrlEntry_t * l_occElog= reinterpret_cast @@ -129,6 +129,17 @@ namespace HTMGT // Process Actions bool l_occReset = false; elogProcessActions(l_occElog->actions, l_occReset, severity); + + // Check if we need a WOF requested reset + if(iv_needsWofReset == true) + { + if( iv_wofResetCount < WOF_RESET_COUNT_THRESHOLD ) + { + // Not at WOF reset threshold yet. Set sev to INFO + severity = ERRORLOG::ERRL_SEV_INFORMATIONAL; + } + } + if (l_occReset == true) { iv_needsReset = true; @@ -209,7 +220,7 @@ namespace HTMGT } // Any bad fru data found ? 
- errlHndl_t err2 = NULL; + errlHndl_t err2 = nullptr; if (l_bad_fru_data == true) { TMGT_BIN("Callout Data", &l_occElog->callout[0], @@ -286,7 +297,7 @@ namespace HTMGT OccCmd l_cmd(this, OCC_CMD_CLEAR_ERROR_LOG, sizeof(l_cmdData), l_cmdData); l_errlHndl = l_cmd.sendOccCmd(); - if (l_errlHndl != NULL) + if (l_errlHndl != nullptr) { TMGT_ERR("occProcessElog: Failed to clear elog id %d to" " OCC%d (rc=0x%04X)", @@ -321,7 +332,7 @@ namespace HTMGT const uint32_t sensor = (uint32_t)i_callout.calloutValue; TARGETING::Target * target = TARGETING::UTIL::getSensorTarget(sensor); - if (NULL != target) + if (nullptr != target) { io_errlHndl->addHwCallout(target, i_priority, HWAS::NO_DECONFIG, @@ -392,34 +403,42 @@ namespace HTMGT bool & o_occReset, ERRORLOG::errlSeverity_t & o_errlSeverity) { - if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED) + if (i_actions & TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED) { o_occReset = true; - iv_failed = true; - iv_resetReason = OCC_RESET_REASON_OCC_REQUEST; + iv_failed = false; + iv_needsWofReset = true; + iv_resetReason = OCC_RESET_REASON_WOF_REQUEST; - TMGT_INF("elogProcessActions: OCC%d requested reset", - iv_instance); + TMGT_INF("elogProcessActions: OCC%d requested a WOF reset", + iv_instance); } - - if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED) + else { - o_occReset = true; - iv_failed = true; - iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE; - iv_resetCount = OCC_RESET_COUNT_THRESHOLD; + if (i_actions & TMGT_ERRL_ACTIONS_RESET_REQUIRED) + { + o_occReset = true; + iv_failed = true; + iv_resetReason = OCC_RESET_REASON_OCC_REQUEST; - TMGT_INF("elogProcessActions: OCC%d requested safe mode", - iv_instance); - TMGT_CONSOLE("OCC%d requested system enter safe mode", + TMGT_INF("elogProcessActions: OCC%d requested reset", iv_instance); + } + + if (i_actions & TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED) + { + o_occReset = true; + iv_failed = true; + iv_resetReason = OCC_RESET_REASON_CRIT_FAILURE; + iv_resetCount = OCC_RESET_COUNT_THRESHOLD; + + 
TMGT_INF("elogProcessActions: OCC%d requested safe mode", + iv_instance); + TMGT_CONSOLE("OCC%d requested system enter safe mode", + iv_instance); + } } } // end Occ::elogProcessActions() - - } // end namespace - - - diff --git a/src/usr/htmgt/occError.H b/src/usr/htmgt/occError.H index 0f0a284be1f..1f3e5ad7d55 100644 --- a/src/usr/htmgt/occError.H +++ b/src/usr/htmgt/occError.H @@ -41,6 +41,7 @@ namespace HTMGT // Error Actions enum tmgtErrlActionsType { + TMGT_ERRL_ACTIONS_WOF_RESET_REQUIRED = 0x20, TMGT_ERRL_ACTIONS_SAFE_MODE_REQUIRED = 0x40, TMGT_ERRL_ACTIONS_RESET_REQUIRED = 0x80, }; diff --git a/src/usr/htmgt/test/htmgtcfgtest.H b/src/usr/htmgt/test/htmgtcfgtest.H index ab71ec03487..9bdffd1dbda 100644 --- a/src/usr/htmgt/test/htmgtcfgtest.H +++ b/src/usr/htmgt/test/htmgtcfgtest.H @@ -432,11 +432,12 @@ public: { uint8_t data[4*KILOBYTE]; uint64_t size = 0; + uint8_t wofResetCount = 0; TS_TRACE(ENTER_MRK"HTMGT: testThermalControlConfigData"); memset(data, 0, 4*KILOBYTE); - getFrequencyPointMessageData(data, size); + getFrequencyPointMessageData(data, size, wofResetCount); if (data[0] != OCC_CFGDATA_FREQ_POINT) {