From 3d4caf24f7e90f183f93cbcb915470f7205bb7f4 Mon Sep 17 00:00:00 2001 From: Matt Derksen Date: Tue, 15 Aug 2017 16:44:36 -0500 Subject: [PATCH] Added Error log support for new GPU sensors Change-Id: I8a0de390516fd02df07860b960db506899b13f14 RTC:178218 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45116 Tested-by: Jenkins Server Tested-by: Jenkins OP Build CI Reviewed-by: Daniel M. Crowell --- src/include/usr/errl/errlentry.H | 24 ++++++ src/include/usr/errl/errludcallout.H | 18 +++- src/include/usr/hwas/common/hwasCallout.H | 14 +++- src/include/usr/ipmi/ipmiconfiglookup.H | 50 +++++++++++ src/usr/errl/errlentry.C | 49 +++++++++++ src/usr/errl/errlentry_consts.H | 13 +++ src/usr/errl/errlmanager_common.C | 4 + src/usr/errl/errludcallout.C | 27 +++++- src/usr/errl/plugins/errludcallout.H | 24 ++++++ src/usr/errldisplay/errldisplay.C | 19 +++++ src/usr/ipmi/ipmiconfiglookup.C | 83 +++++++++++++++++++ .../common/xmltohb/attribute_types_hb.xml | 12 +++ .../xmltohb/attribute_types_openpower.xml | 50 +++++++++++ .../common/xmltohb/target_types_openpower.xml | 1 + 14 files changed, 384 insertions(+), 4 deletions(-) diff --git a/src/include/usr/errl/errlentry.H b/src/include/usr/errl/errlentry.H index fd30b00bedb..555bcb4c12b 100644 --- a/src/include/usr/errl/errlentry.H +++ b/src/include/usr/errl/errlentry.H @@ -573,6 +573,22 @@ public: void addProcedureCallout(const HWAS::epubProcedureID i_procedure, const HWAS::callOutPriority i_priority); + + /** + * @brief Add a special sensor callout + * Adds the given sensor to the list of callouts for the log + * + * @param[in] i_sensorID Sensor ID + * @param[in] i_sensorType Type of sensor being added + * @param[in] i_priority Priority of the callout + * + * @return void + */ + void addSensorCallout(const uint32_t i_sensorID, + const HWAS::sensorTypeEnum i_sensorType, + const HWAS::callOutPriority i_priority); + + /** * @brief Import flattened error log * @@ -650,6 +666,14 @@ private: */ epubSubSystem_t getSubSystem( 
HWAS::partTypeEnum i_partType ) const; + /** + * @brief maps a sensor type to a subsystem ID + * + * @param[in] i_sensorType Sensor Type. + * + * @return subsystem ID + */ + epubSubSystem_t getSubSystem( HWAS::sensorTypeEnum i_sensorType ) const; /** * @brief The ErrlManager will call here to ask the diff --git a/src/include/usr/errl/errludcallout.H b/src/include/usr/errl/errludcallout.H index d6f2423bddb..66324380989 100644 --- a/src/include/usr/errl/errludcallout.H +++ b/src/include/usr/errl/errludcallout.H @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2015 */ +/* Contributors Listed Below - COPYRIGHT 2012,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -159,6 +159,22 @@ private: const HWAS::callOutPriority i_priority); + /** + * @brief Add a sensor callout + * Adds the given sensor to the list of callouts for the log + * + * @param[in] i_sensorID Sensor ID + * @param[in] i_sensorType Type of sensor being added + * @param[in] i_priority Priority of the callout + * + * @return void + */ + ErrlUserDetailsCallout(const uint32_t i_sensorID, + const HWAS::sensorTypeEnum i_sensorType, + const HWAS::callOutPriority i_priority ); + + + // Disabled ErrlUserDetailsCallout(const ErrlUserDetailsCallout &); ErrlUserDetailsCallout & operator=(const ErrlUserDetailsCallout &); diff --git a/src/include/usr/hwas/common/hwasCallout.H b/src/include/usr/hwas/common/hwasCallout.H index 9a729a5ed68..8f5126e6e9a 100644 --- a/src/include/usr/hwas/common/hwasCallout.H +++ b/src/include/usr/hwas/common/hwasCallout.H @@ -145,8 +145,13 @@ enum partTypeEnum PCI_REF_CLOCK = 11, }; - - +enum sensorTypeEnum +{ + UNKNOWN_SENSOR = 0, + GPU_FUNC_SENSOR = 1, + GPU_TEMPERATURE_SENSOR = 2, + GPU_MEMORY_TEMP_SENSOR = 3, +}; // const uint8_t HW_CALLOUT = 0x01; @@ -154,6 +159,7 @@ const uint8_t PROCEDURE_CALLOUT = 0x02; const uint8_t BUS_CALLOUT = 0x03; const uint8_t CLOCK_CALLOUT = 0x04; const uint8_t PART_CALLOUT = 
0x05; +const uint8_t SENSOR_CALLOUT = 0x06; const uint8_t TARGET_IS_SENTINEL = 0xF0; @@ -188,6 +194,10 @@ typedef struct callout_ud GARD_ErrorType partGardErrorType; // uint32_t // one Target will follow }; + struct { // type == SENSOR_CALLOUT + uint32_t sensorId; + sensorTypeEnum sensorType; + }; }; // union } callout_ud_t; diff --git a/src/include/usr/ipmi/ipmiconfiglookup.H b/src/include/usr/ipmi/ipmiconfiglookup.H index bc3290096d3..6f287d9c98d 100644 --- a/src/include/usr/ipmi/ipmiconfiglookup.H +++ b/src/include/usr/ipmi/ipmiconfiglookup.H @@ -210,6 +210,28 @@ class IpmiConfigLookup return i_tgt->tryGetAttr(l_ipmiArray); } +/** + * @brief Determines whether the passed in target has the GPU_SENSORS + * attribute. + * + * @param[in] i_tgt. The target to test. + * + * @reurn True if the target has the GPU_SENSORS attribute, false otherwise. + * + */ + inline static bool doesTargetHaveGPUSensorsAttr(TARGETING::Target * i_tgt) + { + if(!i_tgt) + { + return false; + } + + TARGETING::AttributeTraits::Type + l_gpuArray; + return i_tgt->tryGetAttr(l_gpuArray); + } + + /** * @brief Given a passed in target, looks up IPMI_SENSOR data based upon * the passed in sensor number. @@ -237,6 +259,34 @@ class IpmiConfigLookup uint8_t& o_entityId, TARGETING::SENSOR_NAME& o_sensorName ); + +/** + * @brief Given a passed in target, looks up GPU_SENSOR data based upon + * the passed in sensor number. + * + * @param[in] i_target. The target whose GPU_SENSORS attribute will be + * searched for information based upon the passed + * in sensor number. + * + * @param[in] i_sensorNumber. The GPU sensor whose information we wish + * to gather. + * + * @param[out] o_sensorType. The sensor type as read from the + * from the GPU_SENSOR_ARRAY. + * + * @param[out] o_entityId. The entity id associated with + * the sensor as read from the GPU_SENSOR_ARRAY. + * + * @param[out] o_sensorName. The sensor name as read from + * the GPU_SENSOR_ARRAY. 
+ * + */ + static bool lookupGPUSensorInfo(TARGETING::Target * i_target, + uint32_t i_sensorNumber, + uint8_t& o_sensorType, + uint8_t& o_entityId, + TARGETING::SENSOR_NAME& o_sensorName + ); }; } diff --git a/src/usr/errl/errlentry.C b/src/usr/errl/errlentry.C index 251b24b6ca4..1ebbb2671bd 100644 --- a/src/usr/errl/errlentry.C +++ b/src/usr/errl/errlentry.C @@ -320,6 +320,17 @@ void ErrlEntry::addClockCallout(const TARGETING::Target *i_target, } // addClockCallout + +void ErrlEntry::addSensorCallout(const uint32_t i_sensorID, + const HWAS::sensorTypeEnum i_sensorType, + const HWAS::callOutPriority i_priority) +{ + TRACFCOMP(g_trac_errl, ENTER_MRK"addSensorCallout(0x%X, %d, 0x%x)", + i_sensorID, i_sensorType, i_priority); + + ErrlUserDetailsCallout(i_sensorID, i_sensorType, i_priority).addToLog(this); +} + //////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////// void ErrlEntry::addPartCallout(const TARGETING::Target *i_target, @@ -947,6 +958,13 @@ void ErrlEntry::setSubSystemIdBasedOnCallouts() "callout to determine SSID", pData->partType); iv_User.setSubSys(getSubSystem(pData->partType)); } + else if ( pData->type == HWAS::SENSOR_CALLOUT ) + { + TRACFCOMP(g_trac_errl, INFO_MRK + "mapping highest priority sensor type 0x%x " + "callout to determine SSID", pData->sensorType); + iv_User.setSubSys(getSubSystem(pData->sensorType)); + } else { TRACFCOMP(g_trac_errl, ERR_MRK @@ -1153,6 +1171,37 @@ epubSubSystem_t ErrlEntry::getSubSystem( HWAS::clockTypeEnum i_clockType ) const return subsystem; } +/////////////////////////////////////////////////////////////////////////////// +// Map a Sensor type to a subsystem ID +epubSubSystem_t ErrlEntry::getSubSystem(HWAS::sensorTypeEnum i_sensorType) const +{ + TRACDCOMP(g_trac_errl, ENTER_MRK"getSubSystem() from sensor type 0x%x", + i_sensorType); + + epubSubSystem_t subsystem = EPUB_MISC_UNKNOWN; + + const uint32_t 
SENSOR_TO_SUBSYS_TABLE_ENTRIES = + sizeof(SENSOR_TO_SUBSYS_TABLE)/sizeof(SENSOR_TO_SUBSYS_TABLE[0]); + + for (uint32_t i = 0; i < SENSOR_TO_SUBSYS_TABLE_ENTRIES; i++) + { + if (SENSOR_TO_SUBSYS_TABLE[i].xType == i_sensorType) + { + subsystem = SENSOR_TO_SUBSYS_TABLE[i].xSubSys; + break; + } + } + + if(subsystem == EPUB_MISC_UNKNOWN) + { + TRACFCOMP(g_trac_errl,"WRN>> Failed to find subsystem ID for sensor type 0x%x", + i_sensorType); + } + + TRACDCOMP(g_trac_errl, EXIT_MRK"getSubSystem() ssid 0x%x", subsystem); + return subsystem; +} + /////////////////////////////////////////////////////////////////////////////// // Map a Part type to a subsystem ID epubSubSystem_t ErrlEntry::getSubSystem( HWAS::partTypeEnum i_partType ) const diff --git a/src/usr/errl/errlentry_consts.H b/src/usr/errl/errlentry_consts.H index 3f57621958d..edd16505769 100644 --- a/src/usr/errl/errlentry_consts.H +++ b/src/usr/errl/errlentry_consts.H @@ -149,6 +149,19 @@ const epubPartTypeToSub_t PART_TO_SUBSYS_TABLE[] = { HWAS::SPIVID_SLAVE_PART_TYPE , EPUB_POWER_SUBSYS }, }; +struct epubSensorTypeToSub_t +{ + HWAS::sensorTypeEnum xType; + epubSubSystem_t xSubSys; +}; + +struct epubSensorTypeToSub_t SENSOR_TO_SUBSYS_TABLE[] = +{ + { HWAS::GPU_FUNC_SENSOR , EPUB_IO_SUBSYS }, + { HWAS::GPU_TEMPERATURE_SENSOR , EPUB_IO_SUBSYS }, + { HWAS::GPU_MEMORY_TEMP_SENSOR , EPUB_IO_SUBSYS }, +}; + } //end namespace #endif //#ifndef ERRLENTRY_CONSTS_H diff --git a/src/usr/errl/errlmanager_common.C b/src/usr/errl/errlmanager_common.C index a0a54da5e69..e69036195bc 100644 --- a/src/usr/errl/errlmanager_common.C +++ b/src/usr/errl/errlmanager_common.C @@ -842,6 +842,10 @@ uint8_t getSensorInfo(HWAS::callout_ud_t *i_ud, { *o_sensorNumber = SENSOR::getBackPlaneFaultSensor(); } + else if (i_ud->type == HWAS::SENSOR_CALLOUT ) + { + *o_sensorNumber = static_cast(i_ud->sensorId); + } else { // for all other types there will be at least diff --git a/src/usr/errl/errludcallout.C b/src/usr/errl/errludcallout.C index 
2aa851d648c..334d3c9a29f 100644 --- a/src/usr/errl/errludcallout.C +++ b/src/usr/errl/errludcallout.C @@ -5,7 +5,7 @@ /* */ /* OpenPOWER HostBoot Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2012,2014 */ +/* Contributors Listed Below - COPYRIGHT 2012,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -202,4 +202,29 @@ ErrlUserDetailsCallout::ErrlUserDetailsCallout( } // Procedure callout +//------------------------------------------------------------------------------ +// Sensor callout +ErrlUserDetailsCallout::ErrlUserDetailsCallout(const uint32_t i_sensorID, + const HWAS::sensorTypeEnum i_sensorType, + const HWAS::callOutPriority i_priority) +{ + TRACDCOMP(g_trac_errl, "Sensor Callout"); + + // Set up ErrlUserDetails instance variables + iv_CompId = ERRL_COMP_ID; + iv_Version = 1; + iv_SubSection = ERRL_UDT_CALLOUT; + + HWAS::callout_ud_t *pData; + pData = reinterpret_cast + (reallocUsrBuf(sizeof(HWAS::callout_ud_t))); + + pData->type = HWAS::SENSOR_CALLOUT; + pData->priority = i_priority; + pData->sensorId = i_sensorID; + pData->sensorType = i_sensorType; + + TRACDCOMP(g_trac_errl, "Sensor Callout exit"); +} // Sensor callout + } diff --git a/src/usr/errl/plugins/errludcallout.H b/src/usr/errl/plugins/errludcallout.H index 95e147083ea..abce1fb7496 100644 --- a/src/usr/errl/plugins/errludcallout.H +++ b/src/usr/errl/plugins/errludcallout.H @@ -297,6 +297,30 @@ case HWAS::_type: i_parser.PrintString( "Procedure", #_type); break; #undef case_PROCEDURE break; // PROCEDURE_CALLOUT } + + case HWAS::SENSOR_CALLOUT: + { + i_parser.PrintString( "Callout type", "Sensor Callout"); + + i_parser.PrintNumber( "Sensor ID", "0x%X", + ntohl(pData->sensorId)); + + switch (ntohl(pData->sensorType)) + { +#define case_SENSOR_TYPE(_type) \ +case HWAS::_type: i_parser.PrintString( "Sensor Type", #_type); break; + case_SENSOR_TYPE(GPU_FUNC_SENSOR) + case_SENSOR_TYPE(GPU_TEMPERATURE_SENSOR) + case_SENSOR_TYPE(GPU_MEMORY_TEMP_SENSOR) + 
case_SENSOR_TYPE(UNKNOWN_SENSOR) + default: + i_parser.PrintNumber( "Sensor Type", "UNKNOWN: 0x%X", + ntohl(pData->sensorType) ); + break; + } // switch sensorType +#undef case_SENSOR_TYPE + break; + } default: i_parser.PrintNumber( "Callout type", "UNKNOWN: 0x%X", ntohl(pData->type) ); diff --git a/src/usr/errldisplay/errldisplay.C b/src/usr/errldisplay/errldisplay.C index 590b20077ad..ea496c426bd 100644 --- a/src/usr/errldisplay/errldisplay.C +++ b/src/usr/errldisplay/errldisplay.C @@ -380,6 +380,25 @@ case HWAS::_type: CONSOLE::displayf(NULL, " Procedure : %s", #_t } // switch procedure #undef case_PROCEDURE break; + + case HWAS::SENSOR_CALLOUT: + CONSOLE::displayf(NULL, " Sensor ID : 0x%x", callout->sensorId); + + switch (callout->sensorType) + { +#define case_SENSOR_TYPE(_type) \ +case HWAS::_type: CONSOLE::displayf(NULL, " Sensor Type : %s", #_type); break; + case_SENSOR_TYPE(UNKNOWN_SENSOR) + case_SENSOR_TYPE(GPU_FUNC_SENSOR) + case_SENSOR_TYPE(GPU_TEMPERATURE_SENSOR) + case_SENSOR_TYPE(GPU_MEMORY_TEMP_SENSOR) + default: + CONSOLE::displayf(NULL, " Sensor Type : UNKNOWN 0x%X", + callout->sensorType); + } // switch sensorType +#undef case_SENSOR_TYPE + break; + default: CONSOLE::displayf(NULL, " Callout type : UNKNOWN: 0x%X", callout->type); diff --git a/src/usr/ipmi/ipmiconfiglookup.C b/src/usr/ipmi/ipmiconfiglookup.C index 0cea60a5527..ee2eb85eb5d 100644 --- a/src/usr/ipmi/ipmiconfiglookup.C +++ b/src/usr/ipmi/ipmiconfiglookup.C @@ -92,6 +92,76 @@ bool IpmiConfigLookup::lookupIPMISensorInfo(TARGETING::Target * i_target, } +//----------------------------------------------------------------------------- +// Private method used to lookup sensor information from the GPU_SENSORS +// array attribute of the i_target parameter. +// +// Returns true if the sensor was found, false otherwise. 
+//-----------------------------------------------------------------------------
+bool IpmiConfigLookup::lookupGPUSensorInfo(TARGETING::Target * i_target,
+                                           uint32_t i_sensorNumber,
+                                           uint8_t& o_sensorType,
+                                           uint8_t& o_entityId,
+                                           TARGETING::SENSOR_NAME& o_sensorName
+                                           )
+{
+    // Row layout: (name, id) word pairs for func/temp/mem-temp, then OBUS_CFG
+    using GPU_ARRAY_ELEMENT = uint16_t[7];
+    bool l_result{false};
+
+    assert(nullptr != i_target);
+    assert(TARGETING::UTIL::INVALID_IPMI_SENSOR != i_sensorNumber);
+
+    // Get the GPU_SENSORS array attribute from i_target
+    TARGETING::AttributeTraits<TARGETING::ATTR_GPU_SENSORS>::Type
+                                                                l_sensorArray;
+    if(!i_target->tryGetAttr<TARGETING::ATTR_GPU_SENSORS>(l_sensorArray))
+    {
+        return l_result;
+    }
+
+    // Search for the row in which one of the three sensor id columns matches
+    uint32_t elementCount = (sizeof(l_sensorArray)/sizeof(l_sensorArray[0]));
+    const GPU_ARRAY_ELEMENT * begin = &l_sensorArray[0];
+    const GPU_ARRAY_ELEMENT * end = &l_sensorArray[elementCount];
+    const GPU_ARRAY_ELEMENT * itr = std::find_if(begin, end,
+        [i_sensorNumber] (const GPU_ARRAY_ELEMENT& a)
+        {
+            return (
+        (a[TARGETING::GPU_SENSOR_ARRAY_FUNC_ID_OFFSET] == i_sensorNumber) ||
+        (a[TARGETING::GPU_SENSOR_ARRAY_TEMP_ID_OFFSET] == i_sensorNumber) ||
+        (a[TARGETING::GPU_SENSOR_ARRAY_MEM_TEMP_ID_OFFSET] == i_sensorNumber));
+        });
+
+    if(itr != end)
+    {
+        l_result = true;
+        // Bind the matched row first: (*itr)[n] is column n of that row, while
+        // *itr[n] would read column 0 of the row n places further on.
+        const GPU_ARRAY_ELEMENT& l_row = *itr;
+        uint16_t l_sensorName;
+        if (l_row[TARGETING::GPU_SENSOR_ARRAY_FUNC_ID_OFFSET] == i_sensorNumber)
+        {
+            l_sensorName = l_row[TARGETING::GPU_SENSOR_ARRAY_FUNC_OFFSET];
+        }
+        else if
+          (l_row[TARGETING::GPU_SENSOR_ARRAY_TEMP_ID_OFFSET] == i_sensorNumber)
+        {
+            l_sensorName = l_row[TARGETING::GPU_SENSOR_ARRAY_TEMP_OFFSET];
+        }
+        else
+        {
+            l_sensorName = l_row[TARGETING::GPU_SENSOR_ARRAY_MEM_TEMP_OFFSET];
+        }
+        // Name word: sensor type in the high byte, entity id in the low byte
+        o_sensorName = static_cast<TARGETING::SENSOR_NAME>(l_sensorName);
+        o_sensorType = static_cast<uint8_t>((l_sensorName >> 8) & 0x00FF);
+        o_entityId = static_cast<uint8_t>(l_sensorName & 0x00FF);
+    }
+
+    return l_result;
+}
+
 //--------------------------------------------------------------------------
 //Given a sensor number, lookup and parse
SENSOR_NAME into SENSOR_TYPE //and ENTITY_ID values. @@ -150,6 +220,19 @@ bool IpmiConfigLookup::getIPMISensorInfo(uint32_t i_sensorNumber, break; } } + else if (doesTargetHaveGPUSensorsAttr(*itr)) + { + l_result = lookupGPUSensorInfo((*itr), + i_sensorNumber, + l_sensorType, + l_entityId, + l_sensorName); + if (l_result) + { + break; + } + + } } } diff --git a/src/usr/targeting/common/xmltohb/attribute_types_hb.xml b/src/usr/targeting/common/xmltohb/attribute_types_hb.xml index e660dcf77e1..b8780cbd3ba 100755 --- a/src/usr/targeting/common/xmltohb/attribute_types_hb.xml +++ b/src/usr/targeting/common/xmltohb/attribute_types_hb.xml @@ -460,6 +460,18 @@ MEMBUF_TEMP 0x01D1 + + GPU_TEMP + 0x01D8 + + + GPU_MEM_TEMP + 0x01D9 + + + GPU_STATE + 0x17D8 + PROC_STATE 0x0703 diff --git a/src/usr/targeting/common/xmltohb/attribute_types_openpower.xml b/src/usr/targeting/common/xmltohb/attribute_types_openpower.xml index 44d48c70df5..32072ee5b55 100644 --- a/src/usr/targeting/common/xmltohb/attribute_types_openpower.xml +++ b/src/usr/targeting/common/xmltohb/attribute_types_openpower.xml @@ -317,6 +317,56 @@ + + GPU_SENSOR_ARRAY + + Enumeration defining the offsets into the GPU_SENSORS array. + + + FUNC_OFFSET + 0x00 + + + FUNC_ID_OFFSET + 0x01 + + + TEMP_OFFSET + 0x02 + + + TEMP_ID_OFFSET + 0x03 + + + MEM_TEMP_OFFSET + 0x04 + + + MEM_TEMP_ID_OFFSET + 0x05 + + + OBUS_CFG_OFFSET + 0x06 + + + + + GPU_SENSORS + Attribute to hold 3 possible GPU sensors. + Includes sensor types, ids, and OBUS_CFG bits + + + + 3,7 + + non-volatile + + + + + OP_TRACE_LITE diff --git a/src/usr/targeting/common/xmltohb/target_types_openpower.xml b/src/usr/targeting/common/xmltohb/target_types_openpower.xml index 1bc9bb654c6..e713a56a27d 100644 --- a/src/usr/targeting/common/xmltohb/target_types_openpower.xml +++ b/src/usr/targeting/common/xmltohb/target_types_openpower.xml @@ -46,6 +46,7 @@ IPMI_INSTANCE 0xFF + GPU_SENSORS