diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 12c348db..fcc9698f 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2016 */ +/* Contributors Listed Below - COPYRIGHT 2011,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -1010,7 +1010,7 @@ void amec_health_check_proc_timeout() // exceeded the error temperature sent in data format 0x13. // // End Function Specification -void amec_health_check_vrm_vdd_temp() +void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor) { /*------------------------------------------------------------------------*/ /* Local Variables */ @@ -1018,7 +1018,6 @@ void amec_health_check_vrm_vdd_temp() uint16_t l_ot_error; static uint32_t L_error_count = 0; static BOOLEAN L_ot_error_logged = FALSE; - sensor_t *l_sensor; errlHndl_t l_err = NULL; /*------------------------------------------------------------------------*/ @@ -1026,12 +1025,10 @@ void amec_health_check_vrm_vdd_temp() /*------------------------------------------------------------------------*/ do { - // Get TEMPVDD sensor - l_sensor = getSensorByGsid(TEMPVDD); l_ot_error = g_amec->thermalvdd.ot_error; // Check to see if we exceeded our error temperature - if (l_sensor->sample > l_ot_error) + if (i_sensor->sample > l_ot_error) { // Increment the error counter for this FRU L_error_count++; @@ -1048,7 +1045,7 @@ void amec_health_check_vrm_vdd_temp() L_ot_error_logged = TRUE; TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! temp[%u] ot_error[%u]", - l_sensor->sample, + i_sensor->sample, l_ot_error); // Log an OT error @@ -1069,7 +1066,7 @@ void amec_health_check_vrm_vdd_temp() NULL, DEFAULT_TRACE_SIZE, 0, - l_sensor->sample_max); + i_sensor->sample_max); // Callout the Ambient procedure addCalloutToErrl(l_err, @@ -1093,7 +1090,7 @@ void amec_health_check_vrm_vdd_temp() if (L_error_count >= AMEC_HEALTH_ERROR_TIMER) { TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]", - l_sensor->sample, l_ot_error, L_error_count); + i_sensor->sample, l_ot_error, L_error_count); } // Reset the error counter for this FRU diff --git a/src/occ_405/amec/amec_health.h b/src/occ_405/amec/amec_health.h index 7992f265..9199bb7e 100755 --- a/src/occ_405/amec/amec_health.h +++ b/src/occ_405/amec/amec_health.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2015 */ +/* Contributors Listed Below - COPYRIGHT 2011,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -51,7 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent, uint8_t i_dimm, uint8_t* i_clog_bitmap, uint8_t* i_dlog_bitmap); -void amec_health_check_vrm_vdd_temp(void); +void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor); void amec_health_check_vrm_vdd_temp_timeout(void); #endif diff --git a/src/occ_405/amec/amec_sensors_power.c b/src/occ_405/amec/amec_sensors_power.c index b58a9708..55429b15 100755 --- a/src/occ_405/amec/amec_sensors_power.c +++ b/src/occ_405/amec/amec_sensors_power.c @@ -591,12 +591,12 @@ void amec_update_avsbus_sensors(void) AVSBUS_STATE_INITIATE_READ = 1, AVSBUS_STATE_PROCESS_CURRENT = 2, AVSBUS_STATE_PROCESS_VOLTAGE = 3, - AVSBUS_STATE_PROCESS_STATUS = 4 + AVSBUS_STATE_PROCESS_STATUS = 4, + AVSBUS_STATE_PROCESS_TEMPERATURE= 5 } L_avsbus_state = AVSBUS_STATE_INITIATE_READ; - // Number of Curr/Volt readings between Status readings -#define NUM_VRM_READINGS_PER_STATUS 2 - static unsigned int L_readingCount = 0; + // Flag to select either temperature or status to read on 3rd tick + static bool L_read_temp = true; if (isSafeStateRequested()) { @@ -626,31 +626,45 @@ void amec_update_avsbus_sensors(void) // Process the voltage readings process_avsbus_voltage(); - if ((G_vrm_thermal_monitoring == FALSE) || (++L_readingCount < NUM_VRM_READINGS_PER_STATUS)) + // Initiate read of temperature or error status (OT/OC) + if (L_read_temp) { - // Initiate read of currents - initiate_avsbus_reads(AVSBUS_CURRENT); - L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT; + // Initiate AVS Bus read for Vdd temperature + avsbus_read_start(AVSBUS_VDD, AVSBUS_TEMPERATURE); + L_avsbus_state = AVSBUS_STATE_PROCESS_TEMPERATURE; } else { - // Periodically read status for VR FAN (VRM OT WARNING) initiate_avsbus_read_status(); L_avsbus_state = AVSBUS_STATE_PROCESS_STATUS; - L_readingCount = 0; } + // Toggle between reading temperature and status + L_read_temp = !L_read_temp; break; case AVSBUS_STATE_PROCESS_STATUS: - // Process the status { - // Update sensor with the OT status (0 / 1) + // Process the status uint16_t otStatus = process_avsbus_status(); - sensor_update(AMECSENSOR_PTR(VRMPROCOT), otStatus); + if (G_vrm_thermal_monitoring) + { + // Update sensor with the OT status (0 / 1) + sensor_update(AMECSENSOR_PTR(VRMPROCOT), otStatus); + } + // Back to reading currents + initiate_avsbus_reads(AVSBUS_CURRENT); + L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT; + } + break; + + case AVSBUS_STATE_PROCESS_TEMPERATURE: + { + // Read and process Vdd temperature + avsbus_read(AVSBUS_VDD, AVSBUS_TEMPERATURE); + // Back to reading currents + initiate_avsbus_reads(AVSBUS_CURRENT); + L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT; } - // Back to reading currents - initiate_avsbus_reads(AVSBUS_CURRENT); - L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT; break; case AVSBUS_STATE_DISABLED: diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 28fe8bb0..1f28f7a6 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -53,6 +53,8 @@ extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap; extern bool G_vrm_thermal_monitoring; extern uint32_t G_first_proc_gpu_config; +extern bool G_vrm_vdd_temp_expired; + #include extern gpe_shared_data_t G_shared_gpe_data; @@ -355,6 +357,26 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) } } + if (G_avsbus_vdd_monitoring) + { + // Add Vdd temp + const sensor_t *tempvdd = getSensorByGsid(TEMPVDD); + if (tempvdd != NULL) + { + l_tempSensorList[l_sensorHeader.count].id = AMECSENSOR_PTR(TEMPVDD)->ipmi_sid; + l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM_VDD; + if (G_vrm_vdd_temp_expired) + { + l_tempSensorList[l_sensorHeader.count].value = 0xFF; + } + else + { + l_tempSensorList[l_sensorHeader.count].value = tempvdd->sample & 0xFF; + } + l_sensorHeader.count++; + } + } + // Add GPU temperatures for (k=0; k*/ "op_occ_171012a\0" /**/ ; +volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /**/ "op_occ_171020a\0" /**/ ; #endif diff --git a/src/occ_405/pss/avsbus.c b/src/occ_405/pss/avsbus.c index 7c155f9f..75e23253 100644 --- a/src/occ_405/pss/avsbus.c +++ b/src/occ_405/pss/avsbus.c @@ -39,7 +39,7 @@ //#define AVSDEBUG #ifdef AVSDEBUG -#define DEBUG_TRACE_MAX 1 +#define DEBUG_TRACE_MAX 2 static bool G_trace_scoms = TRUE; #define AVS_DBG(frmt,args...) TRAC_INFO(frmt,##args) #define DEBUG_IN32(reg, result, name) if (G_trace_scoms) { TRAC_INFO(" in32(%08X) returned 0x%08X "name, reg, result); } @@ -66,6 +66,9 @@ extern uint32_t G_nest_frequency_mhz; #define AVSBUS_FREQUENCY_MHZ 10 extern bool G_vrm_thermal_monitoring; +extern bool G_vrm_vdd_temp_expired; +void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor); + // Number of read failures allowed before elog is created and reset requested. // This should be no longer than 4ms (or it will impact WOF calculations) // (readings are taken every 500us => 500us * 8 = 4ms) @@ -299,8 +302,6 @@ uint32_t avs_crc_calculate(const uint32_t i_avs_cmd) } -#define ERRORCOUNT_MAXTYPES 2 -#define ERRORCOUNT_MAXCMDS 2 // Initiate read for specified type (Vdd/Vdn) and cmd (Voltage/Current) void avsbus_read_start(const avsbus_type_e i_type, const avsbus_cmdtype_e i_cmdtype) @@ -333,12 +334,17 @@ void avsbus_read_start(const avsbus_type_e i_type, l_cmd_index = 1; l_trace_cmd = 'C'; } + else if (i_cmdtype == AVSBUS_TEMPERATURE) + { + l_cmd_index = 2; + l_trace_cmd = 'T'; + } if (AVSBUS_VDD != i_type) { l_trace_type = 'n'; } - static uint32_t L_trace_count[ERRORCOUNT_MAXTYPES][ERRORCOUNT_MAXCMDS] = {{0}}; + static uint32_t L_trace_count[AVSBUS_TYPE_MAX][AVSBUS_CMDS_MAX] = {{0}}; uint32_t * l_trace_count = &L_trace_count[i_type][l_cmd_index]; if (*l_trace_count < DEBUG_TRACE_MAX) { @@ -394,8 +400,8 @@ void avsbus_read_start(const avsbus_type_e i_type, } // end avsbus_read_start() -// Read and return the voltage or current for specified rail -// (voltage units are mV, current units are in 10mA) +// Read and return the voltage, current, or temperature for specified rail +// (voltage units are mV, current units are in 10mA, temperature in 0.1 C) uint16_t avsbus_read(const avsbus_type_e i_type, const avsbus_cmdtype_e i_cmdtype) { @@ -415,9 +421,14 @@ uint16_t avsbus_read(const avsbus_type_e i_type, l_cmd_index = 1; l_trace_cmd = 'C'; } + else if (i_cmdtype == AVSBUS_TEMPERATURE) + { + l_cmd_index = 2; + l_trace_cmd = 'T'; + } // Static error counters for each type (Vdd/Vdn) and command (Voltage/Current) - static uint32_t L_error_count[ERRORCOUNT_MAXTYPES][ERRORCOUNT_MAXCMDS] = {{0}}; + static uint32_t L_error_count[AVSBUS_TYPE_MAX][AVSBUS_CMDS_MAX] = {{0}}; uint32_t * l_error_count = &L_error_count[i_type][l_cmd_index]; char l_trace_type = 'd'; @@ -429,7 +440,7 @@ uint16_t avsbus_read(const avsbus_type_e i_type, } #ifdef AVSDEBUG - static uint32_t L_trace_count[ERRORCOUNT_MAXTYPES][ERRORCOUNT_MAXCMDS] = {{0}}; + static uint32_t L_trace_count[AVSBUS_TYPE_MAX][AVSBUS_CMDS_MAX] = {{0}}; uint32_t * l_trace_count = &L_trace_count[i_type][l_cmd_index]; if (*l_trace_count < DEBUG_TRACE_MAX) { @@ -513,11 +524,11 @@ uint16_t avsbus_read(const avsbus_type_e i_type, // Read the response data uint32_t value = in32(o2srd_reg); DEBUG_IN32(o2srd_reg, value, "OCB_O2SRDxB"); - // AVS Bus response (read voltage or current): + // AVS Bus response (read voltage, current, or temperature): // 0:1 SlaveAck (0b00 from slave indicates good CRC and action was taken) // 2 0 // 3:7 StatusResp - // 8:23 CmdData (LSB = 1mV or 10mA) + // 8:23 CmdData (LSB = 1mV or 10mA or 0.1C) // 24:28 Reserved (must be all 1s) // 29:31 CRC // AA0SSSSS VVVVVVVV VVVVVVVV 11111CCC @@ -555,13 +566,30 @@ uint16_t avsbus_read(const avsbus_type_e i_type, TRAC_INFO("avsbus_read: Successfully read Vd%c voltage %dmV [0x%08X]", l_trace_type, o_reading, value); } - else + else if (i_cmdtype == AVSBUS_CURRENT) { TRAC_INFO("avsbus_read: Successfully read Vd%c current %dx10mA [0x%08X]", l_trace_type, o_reading, value); } } #endif + + if (i_cmdtype == AVSBUS_TEMPERATURE) + { +#ifdef AVSDEBUG + if (*l_trace_count < DEBUG_TRACE_MAX) + { + TRAC_INFO("avsbus_read: Successfully read Vd%c temperature %d/10 C [0x%08X]", + l_trace_type, o_reading, value); + } +#endif + // Update sensor (convert to degrees C) and validate it + sensor_t * l_sensor = AMECSENSOR_PTR(TEMPVDD); + sensor_update(l_sensor, (uint16_t)o_reading/10); + G_vrm_vdd_temp_expired = false; + amec_health_check_vrm_vdd_temp(l_sensor); + } + if (*l_error_count) { // Trace and clear the error count @@ -592,10 +620,15 @@ uint16_t avsbus_read(const avsbus_type_e i_type, exrc = ERC_AVSBUS_VDD_CURRENT_FAILURE; INCREMENT_ERR_HISTORY(ERRH_AVSBUS_VDD_CURRENT); } - else + else if (i_cmdtype == AVSBUS_VOLTAGE) { INCREMENT_ERR_HISTORY(ERRH_AVSBUS_VDD_VOLTAGE); } + else if (i_cmdtype == AVSBUS_TEMPERATURE) + { + exrc = ERC_AVSBUS_VDD_TEMPERATURE_FAILURE; + INCREMENT_ERR_HISTORY(ERRH_AVSBUS_VDD_TEMPERATURE); + } } else { @@ -604,7 +637,7 @@ uint16_t avsbus_read(const avsbus_type_e i_type, exrc = ERC_AVSBUS_VDN_CURRENT_FAILURE; INCREMENT_ERR_HISTORY(ERRH_AVSBUS_VDN_CURRENT); } - else + else if (i_cmdtype == AVSBUS_VOLTAGE) { exrc = ERC_AVSBUS_VDN_VOLTAGE_FAILURE; INCREMENT_ERR_HISTORY(ERRH_AVSBUS_VDN_VOLTAGE); @@ -664,10 +697,10 @@ void initiate_avsbus_reads(avsbus_cmdtype_e i_cmdType) } // end initiate_avsbus_reads() -// Initiate read for vr fan +// Initiate read for error status bits (over-temperature, over-current) void initiate_avsbus_read_status() { - if (isSafeStateRequested() || (G_vrm_thermal_monitoring == FALSE)) + if (isSafeStateRequested()) { // No need to attempt read if OCC will be reset return; @@ -678,7 +711,7 @@ void initiate_avsbus_read_status() #endif unsigned int index; - for (index = 0; index <= 1; ++index) + for (index = 0; index < AVSBUS_TYPE_MAX; ++index) { // Determine busses that are being monitored uint8_t bus = 0xFF; @@ -771,7 +804,7 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) const uint8_t max_read_attempts = G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM_OT_STATUS].max_read_timeout; // Static error counters for each type (Vdd/Vdn) - static uint32_t L_error_count[ERRORCOUNT_MAXTYPES] = {0}; + static uint32_t L_error_count[AVSBUS_TYPE_MAX] = {0}; uint32_t * l_error_count = &L_error_count[i_type]; char l_trace_type = 'd'; @@ -1158,7 +1191,7 @@ uint8_t process_avsbus_status() static bool loggedOT = FALSE; static bool loggedOC = FALSE; errlHndl_t l_err; - if ((foundOT == 1) && !loggedOT) + if ((foundOT == 1) && !loggedOT && G_vrm_thermal_monitoring) { loggedOT = TRUE; TRAC_ERR("process_avsbus_status: AVSBUS Over Temperature Warning (Vdd: 0x%08X, Vdn: 0x%08X)", diff --git a/src/occ_405/pss/avsbus.h b/src/occ_405/pss/avsbus.h index 2b40aaa2..071ba0ca 100755 --- a/src/occ_405/pss/avsbus.h +++ b/src/occ_405/pss/avsbus.h @@ -5,7 +5,7 @@ /* */ /* OpenPOWER OnChipController Project */ /* */ -/* Contributors Listed Below - COPYRIGHT 2011,2015 */ +/* Contributors Listed Below - COPYRIGHT 2011,2017 */ /* [+] International Business Machines Corp. */ /* */ /* */ @@ -43,6 +43,7 @@ typedef enum { AVSBUS_VDD = 0x00, AVSBUS_VDN = 0x01, + AVSBUS_TYPE_MAX = 2 // Number of bus types } avsbus_type_e; typedef enum @@ -50,7 +51,9 @@ typedef enum // This enum contains the AVS Bus CmdDataType that can be read AVSBUS_VOLTAGE = 0x00, AVSBUS_CURRENT = 0x02, - AVSBUS_STATUS = 0x0E + AVSBUS_TEMPERATURE = 0x03, + AVSBUS_STATUS = 0x0E, + AVSBUS_CMDS_MAX = 4 // Number of supported AVS bus commands } avsbus_cmdtype_e; // Setup the AVS Bus for reading @@ -60,6 +63,10 @@ void avsbus_init(); // (results can then be read on the next tick) void initiate_avsbus_reads(avsbus_cmdtype_e i_cmdType); +// Initiate read for specified type (Vdd/Vdn) and cmd (Voltage/Current/Temperature) +void avsbus_read_start(const avsbus_type_e i_type, + const avsbus_cmdtype_e i_cmdtype); + // Process AVS Bus read results (or errors) for specified bus/cmdtype. // Returns the data requested (voltage units are mV, current units are in 10mA) // Predictive error will be logged after MAX_READ_ATTEMPTS failures on the specific diff --git a/src/occ_405/sensor/sensor_main_memory.c b/src/occ_405/sensor/sensor_main_memory.c index 2cc4952c..d0fa3385 100644 --- a/src/occ_405/sensor/sensor_main_memory.c +++ b/src/occ_405/sensor/sensor_main_memory.c @@ -184,6 +184,7 @@ main_mem_sensor_t G_main_mem_sensors[] = // AMEC_SENSOR_TYPE_TEMP: gsid smf_mode master_only MAIN_MEM_SENSOR (TEMPNEST, false, false), + MAIN_MEM_SENSOR (TEMPVDD, false, false), MAIN_MEM_CORE_SENSORS (TEMPPROCTHRMC, false, false), MAIN_MEM_DIMM_SENSORS (TEMPDIMM, false, false), MAIN_MEM_SENSOR (TEMPGPU0, false, false),