From 051cc0a10cb61b410252098d13fb7dd8727a8e52 Mon Sep 17 00:00:00 2001 From: mbroyles Date: Fri, 6 Oct 2017 11:19:10 -0500 Subject: [PATCH] VRM Vdd Interfaces Change-Id: I8e2b597773c940ebc79972974a95fb323ea26660 Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48065 Tested-by: FSP CI Jenkins Reviewed-by: William A. Bryan Reviewed-by: Andres A. Lugo-Reyes Reviewed-by: Martha Broyles --- src/occ_405/amec/amec_data.c | 31 +++- src/occ_405/amec/amec_health.c | 197 +++++++++++++++++++++- src/occ_405/amec/amec_health.h | 2 + src/occ_405/amec/amec_service_codes.h | 40 ++--- src/occ_405/amec/amec_sys.h | 3 + src/occ_405/cmdh/cmdh_fsp_cmds.c | 2 +- src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c | 59 ++++--- src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h | 13 +- src/occ_405/main.c | 3 +- src/occ_405/occ_service_codes.h | 10 +- src/occ_405/occ_sys_config.h | 3 + src/occ_405/pss/avsbus.c | 2 +- src/occ_405/sensor/sensor_enum.h | 1 + src/occ_405/sensor/sensor_info.c | 35 ++-- src/occ_405/sensor/sensor_table.c | 2 + 15 files changed, 332 insertions(+), 71 deletions(-) diff --git a/src/occ_405/amec/amec_data.c b/src/occ_405/amec/amec_data.c index 4c553d6e..9294c1da 100755 --- a/src/occ_405/amec/amec_data.c +++ b/src/occ_405/amec/amec_data.c @@ -261,11 +261,40 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode) TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for DIMM", l_dvfs_temp); - g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM].error_count; + g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM_OT_STATUS].error_count; TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRHOT", g_amec->vrhotproc.setpoint); + // Store the VRM Vdd thermal data + if(i_mode == OCC_MODE_NOMINAL) + { + l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].dvfs; + l_error = l_frudata[DATA_FRU_VRM_VDD].error; + } + else + { + l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].pm_dvfs; + if(i_mode == OCC_MODE_TURBO) + { + //Need to log an error if we dvfs in 
static turbo mode (for mfg) + l_error = l_dvfs_temp; + } + else + { + l_error = l_frudata[DATA_FRU_VRM_VDD].pm_error; + } + } + // Store the DVFS thermal setpoint in 0.1 degrees C + g_amec->thermalvdd.setpoint = l_dvfs_temp * 10; + // Store the error temperature for OT detection + g_amec->thermalvdd.ot_error = l_error; + // Store the temperature timeout value + g_amec->thermalvdd.temp_timeout = l_frudata[DATA_FRU_VRM_VDD].max_read_timeout; + + TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRM Vdd", + l_dvfs_temp); + } while(0); return l_err; diff --git a/src/occ_405/amec/amec_health.c b/src/occ_405/amec/amec_health.c index 1d026d25..12c348db 100755 --- a/src/occ_405/amec/amec_health.c +++ b/src/occ_405/amec/amec_health.c @@ -67,6 +67,9 @@ uint8_t G_cent_temp_expired_bitmap = 0; // Array to store the update tag of each core's temperature sensor uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0}; +// True when reading the VRM Vdd temperature has timed out +bool G_vrm_vdd_temp_expired = false; + //*************************************************************************/ // Function Declarations //*************************************************************************/ @@ -398,13 +401,13 @@ void amec_health_check_dimm_timeout() * @reasoncode FRU_TEMP_TIMEOUT * @userdata1 timeout value in seconds * @userdata2 0 - * @userdata4 OCC_NO_EXTENDED_RC + * @userdata4 ERC_AMEC_DIMM_TEMP_TIMEOUT * @devdesc Failed to read a memory DIMM temperature * */ l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId FRU_TEMP_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code + ERC_AMEC_DIMM_TEMP_TIMEOUT, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size @@ -706,14 +709,14 @@ void amec_health_check_cent_timeout() * @reasoncode FRU_TEMP_TIMEOUT * @userdata1 timeout value in seconds * @userdata2 0 - * @userdata4 OCC_NO_EXTENDED_RC + * @userdata4 ERC_AMEC_CENT_TEMP_TIMEOUT * @devdesc Failed to read a
centaur memory controller * temperature * */ l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TIMEOUT, //modId FRU_TEMP_TIMEOUT, //reasoncode - OCC_NO_EXTENDED_RC, //Extended reason code + ERC_AMEC_CENT_TEMP_TIMEOUT, //Extended reason code ERRL_SEV_PREDICTIVE, //Severity NULL, //Trace Buf DEFAULT_TRACE_SIZE, //Trace Size @@ -999,6 +1002,192 @@ void amec_health_check_proc_timeout() }while(0); } +// Function Specification +// +// Name: amec_health_check_vrm_vdd_temp +// +// Description: This function checks if the VRM Vdd temperature has +// exceeded the error temperature sent in data format 0x13. +// +// End Function Specification +void amec_health_check_vrm_vdd_temp() +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + uint16_t l_ot_error; + static uint32_t L_error_count = 0; + static BOOLEAN L_ot_error_logged = FALSE; + sensor_t *l_sensor; + errlHndl_t l_err = NULL; + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + do + { + // Get TEMPVDD sensor + l_sensor = getSensorByGsid(TEMPVDD); + l_ot_error = g_amec->thermalvdd.ot_error; + + // Check to see if we exceeded our error temperature + if (l_sensor->sample > l_ot_error) + { + // Increment the error counter for this FRU + L_error_count++; + + // Trace and log error the first time this occurs + if (L_error_count == AMEC_HEALTH_ERROR_TIMER) + { + // Have we logged an OT error for this FRU already? + if (L_ot_error_logged == TRUE) + { + break; + } + + L_ot_error_logged = TRUE; + + TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! 
temp[%u] ot_error[%u]", + l_sensor->sample, + l_ot_error); + + // Log an OT error + /* @ + * @errortype + * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TEMP + * @reasoncode VRM_VDD_ERROR_TEMP + * @userdata1 0 + * @userdata2 Fru peak temperature sensor + * @devdesc VRM Vdd has reached error temperature + * threshold and is called out in this error log. + * + */ + l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TEMP, + VRM_VDD_ERROR_TEMP, + ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE, + ERRL_SEV_PREDICTIVE, + NULL, + DEFAULT_TRACE_SIZE, + 0, + l_sensor->sample_max); + + // Callout the Ambient procedure + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_COMPONENT_ID, + ERRL_COMPONENT_ID_OVER_TEMPERATURE, + ERRL_CALLOUT_PRIORITY_HIGH); + + // Callout VRM Vdd + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.vrm_vdd_huid, + ERRL_CALLOUT_PRIORITY_MED); + + // Commit Error + commitErrl(&l_err); + } + } + else + { + // Trace that we have now dropped below the error threshold + if (L_error_count >= AMEC_HEALTH_ERROR_TIMER) + { + TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]", + l_sensor->sample, l_ot_error, L_error_count); + } + + // Reset the error counter for this FRU + L_error_count = 0; + } + }while (0); + +} + +// Function Specification +// +// Name: amec_health_check_vrm_vdd_temp_timeout +// +// Description: This function checks if OCC has failed to read the VRM Vdd +// temperature and if it has exceeded the maximum allowed number of retries. 
+// +// End Function Specification +void amec_health_check_vrm_vdd_temp_timeout() +{ + /*------------------------------------------------------------------------*/ + /* Local Variables */ + /*------------------------------------------------------------------------*/ + errlHndl_t l_err = NULL; + uint32_t l_update_tag = 0; + static uint32_t L_read_fail_cnt = 0; + static BOOLEAN L_error_logged = FALSE; + static uint32_t L_vdd_temp_update_tag = 0; + + /*------------------------------------------------------------------------*/ + /* Code */ + /*------------------------------------------------------------------------*/ + + // Check if VRM Vdd temperature sensor has been updated by checking the sensor update tag + // If the update tag is not changing, then temperature sensor is not being updated. + l_update_tag = AMECSENSOR_PTR(TEMPVDD)->update_tag; + if (l_update_tag != L_vdd_temp_update_tag) + { + // We were able to read VRM Vdd temperature + L_read_fail_cnt = 0; + G_vrm_vdd_temp_expired = false; + L_vdd_temp_update_tag = l_update_tag; + } + else + { + // Failed to read VRM Vdd temperature sensor + L_read_fail_cnt++; + + // Check if we have reached the maximum read time allowed + if((L_read_fail_cnt == g_amec->thermalvdd.temp_timeout) && + (g_amec->thermalvdd.temp_timeout != 0xFF)) + { + //temperature has expired. Notify control algorithms + G_vrm_vdd_temp_expired = true; + + // Log error one time + if (L_error_logged == FALSE) + { + L_error_logged = TRUE; + + TRAC_ERR("Timed out reading VRM Vdd temperature for timeout[%u]", + g_amec->thermalvdd.temp_timeout); + + /* @ + * @errortype + * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT + * @reasoncode FRU_TEMP_TIMEOUT + * @userdata1 timeout value in seconds + * @userdata2 0 + * @userdata4 ERC_AMEC_VRM_VDD_TEMP_TIMEOUT + * @devdesc Failed to read VRM Vdd temperature. 
+ * + */ + l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT, //modId + FRU_TEMP_TIMEOUT, //reasoncode + ERC_AMEC_VRM_VDD_TEMP_TIMEOUT, //Extended reason code + ERRL_SEV_PREDICTIVE, //Severity + NULL, //Trace Buf + DEFAULT_TRACE_SIZE, //Trace Size + g_amec->thermalvdd.temp_timeout, //userdata1 + 0); //userdata2 + + // Callout the VRM + addCalloutToErrl(l_err, + ERRL_CALLOUT_TYPE_HUID, + G_sysConfigData.vrm_vdd_huid, + ERRL_CALLOUT_PRIORITY_MED); + + // Commit error log and request reset + REQUEST_RESET(l_err); + } + } // if reached timeout + } // else failed to read temp +} + /*----------------------------------------------------------------------------*/ /* End */ /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_health.h b/src/occ_405/amec/amec_health.h index 11d8fb0f..7992f265 100755 --- a/src/occ_405/amec/amec_health.h +++ b/src/occ_405/amec/amec_health.h @@ -51,5 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent, uint8_t i_dimm, uint8_t* i_clog_bitmap, uint8_t* i_dlog_bitmap); +void amec_health_check_vrm_vdd_temp(void); +void amec_health_check_vrm_vdd_temp_timeout(void); #endif diff --git a/src/occ_405/amec/amec_service_codes.h b/src/occ_405/amec/amec_service_codes.h index f206daf0..8c87e5f2 100755 --- a/src/occ_405/amec/amec_service_codes.h +++ b/src/occ_405/amec/amec_service_codes.h @@ -48,25 +48,27 @@ /*----------------------------------------------------------------------------*/ enum occAmecModuleId { - AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00, - AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01, - AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02, - AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03, - AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04, - AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05, - AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06, - AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07, - AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08, - AMEC_HEALTH_CHECK_DIMM_TEMP = 
AMEC_COMP_ID | 0x09, - AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10, - AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11, - AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12, - AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13, - AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14, - AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, - AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, - AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, - AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19, + AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00, + AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01, + AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02, + AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03, + AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04, + AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05, + AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06, + AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07, + AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08, + AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09, + AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10, + AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11, + AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12, + AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13, + AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14, + AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16, + AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17, + AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18, + AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19, + AMEC_HEALTH_CHECK_VRM_VDD_TEMP = AMEC_COMP_ID | 0x1A, + AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT = AMEC_COMP_ID | 0x1B, }; /*----------------------------------------------------------------------------*/ diff --git a/src/occ_405/amec/amec_sys.h b/src/occ_405/amec/amec_sys.h index 3f1d3335..e86a0003 100755 --- a/src/occ_405/amec/amec_sys.h +++ b/src/occ_405/amec/amec_sys.h @@ -362,6 +362,7 @@ typedef struct sensor_t vrhot_mem_proc; sensor_t vrfan; + sensor_t tempvdd; // Chip Sensors sensor_t todclock0; @@ -687,6 +688,8 @@ typedef struct 
amec_controller_t thermaldimm; // Thermal Controller based on VRHOT signal from processor VRM amec_controller_t vrhotproc; + // Thermal Controller based on VRM Vdd temperatures + amec_controller_t thermalvdd; // Oversubscription Status oversub_status_t oversub_status; diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds.c b/src/occ_405/cmdh/cmdh_fsp_cmds.c index 422dc381..28fe8bb0 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds.c @@ -349,7 +349,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr) if (vrfan != NULL) { l_tempSensorList[l_sensorHeader.count].id = 0; - l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM; + l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM_OT_STATUS; l_tempSensorList[l_sensorHeader.count].value = vrfan->sample & 0xFF; l_sensorHeader.count++; } diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c index c35f1b75..5982e7fa 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c +++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.c @@ -57,6 +57,7 @@ #define DATA_PCAP_VERSION_20 0x20 #define DATA_SYS_VERSION_20 0x20 +#define DATA_SYS_VERSION_21 0x21 #define DATA_APSS_VERSION20 0x20 @@ -1565,7 +1566,7 @@ errlHndl_t data_store_sys_config(const cmdh_fsp_cmd_t * i_cmd_ptr, errlHndl_t l_err = NULL; // Cast the command to the struct for this format - cmdh_sys_config_v20_t * l_cmd_ptr = (cmdh_sys_config_v20_t *)i_cmd_ptr; + cmdh_sys_config_v21_t * l_cmd_ptr = (cmdh_sys_config_v21_t *)i_cmd_ptr; uint16_t l_data_length = 0; uint32_t l_sys_data_sz = 0; bool l_invalid_input = TRUE; //Assume bad input @@ -1582,6 +1583,14 @@ errlHndl_t data_store_sys_config(const cmdh_fsp_cmd_t * i_cmd_ptr, l_invalid_input = FALSE; } } + else if(l_cmd_ptr->version == DATA_SYS_VERSION_21) + { + l_sys_data_sz = sizeof(cmdh_sys_config_v21_t) - sizeof(cmdh_fsp_cmd_header_t); + if(l_sys_data_sz == l_data_length) + { + l_invalid_input = FALSE; + } + } if(l_invalid_input) { @@ -1613,28 
+1622,32 @@ errlHndl_t data_store_sys_config(const cmdh_fsp_cmd_t * i_cmd_ptr, ERRL_COMPONENT_ID_FIRMWARE, ERRL_CALLOUT_PRIORITY_HIGH); } - else + else // version and length are valid, store the data { - if(l_cmd_ptr->version == DATA_SYS_VERSION_20) + // Copy data that is common to all versions + G_sysConfigData.system_type.byte = l_cmd_ptr->sys_config.system_type; + G_sysConfigData.backplane_huid = l_cmd_ptr->sys_config.backplane_sid; + G_sysConfigData.apss_huid = l_cmd_ptr->sys_config.apss_sid; + G_sysConfigData.proc_huid = l_cmd_ptr->sys_config.proc_sid; + CNFG_DBG("data_store_sys_config: SystemType[0x%02X] BPSID[0x%08X] APSSSID[0x%08X] ProcSID[0x%08X]", + G_sysConfigData.system_type.byte, G_sysConfigData.backplane_huid, G_sysConfigData.apss_huid, + G_sysConfigData.proc_huid); + + //Write core temp and freq sensor ids + //Core Temp and Freq sensors are always in sequence in the table + for (l_coreIndex = 0; l_coreIndex < MAX_CORES; l_coreIndex++) { - // Copy data - G_sysConfigData.system_type.byte = l_cmd_ptr->sys_config.system_type; - G_sysConfigData.backplane_huid = l_cmd_ptr->sys_config.backplane_sid; - G_sysConfigData.apss_huid = l_cmd_ptr->sys_config.apss_sid; - G_sysConfigData.proc_huid = l_cmd_ptr->sys_config.proc_sid; - CNFG_DBG("data_store_sys_config: SystemType[0x%02X] BPSID[0x%08X] APSSSID[0x%08X] ProcSID[0x%08X]", - G_sysConfigData.system_type.byte, G_sysConfigData.backplane_huid, G_sysConfigData.apss_huid, - G_sysConfigData.proc_huid); - - //Write core temp and freq sensor ids - //Core Temp and Freq sensors are always in sequence in the table - for (l_coreIndex = 0; l_coreIndex < MAX_CORES; l_coreIndex++) - { - AMECSENSOR_PTR(TEMPPROCTHRMC0 + l_coreIndex)->ipmi_sid = l_cmd_ptr->sys_config.core_sid[(l_coreIndex * 2)]; - AMECSENSOR_PTR(FREQAC0 + l_coreIndex)->ipmi_sid = l_cmd_ptr->sys_config.core_sid[(l_coreIndex * 2) + 1]; - CNFG_DBG("data_store_sys_config: Core[%d] TempSID[0x%08X] FreqSID[0x%08X]", l_coreIndex, - AMECSENSOR_PTR(TEMPPROCTHRMC0 +
l_coreIndex)->ipmi_sid, AMECSENSOR_PTR(FREQAC0 + l_coreIndex)->ipmi_sid); - } + AMECSENSOR_PTR(TEMPPROCTHRMC0 + l_coreIndex)->ipmi_sid = l_cmd_ptr->sys_config.core_sid[(l_coreIndex * 2)]; + AMECSENSOR_PTR(FREQAC0 + l_coreIndex)->ipmi_sid = l_cmd_ptr->sys_config.core_sid[(l_coreIndex * 2) + 1]; + CNFG_DBG("data_store_sys_config: Core[%d] TempSID[0x%08X] FreqSID[0x%08X]", l_coreIndex, + AMECSENSOR_PTR(TEMPPROCTHRMC0 + l_coreIndex)->ipmi_sid, AMECSENSOR_PTR(FREQAC0 + l_coreIndex)->ipmi_sid); + } + + if(l_cmd_ptr->version == DATA_SYS_VERSION_21) + { + // Copy the additional data for version 21 + G_sysConfigData.vrm_vdd_huid = l_cmd_ptr->vrm_vdd_sid; + AMECSENSOR_PTR(TEMPVDD)->ipmi_sid = l_cmd_ptr->vrm_vdd_temp_sid; } // Change Data Request Mask to indicate we got this data @@ -1723,7 +1736,7 @@ errlHndl_t data_store_thrm_thresholds(const cmdh_fsp_cmd_t * i_cmd_ptr, l_cmd_ptr->data[i].max_read_timeout; // Set a local flag if we get data for VRM FRU type - if(l_frutype == DATA_FRU_VRM) + if(l_frutype == DATA_FRU_VRM_OT_STATUS) { l_vrm_frutype = TRUE; } @@ -1759,7 +1772,7 @@ errlHndl_t data_store_thrm_thresholds(const cmdh_fsp_cmd_t * i_cmd_ptr, // Also, make the error count very high so that the health // monitor doesn't complain about VRHOT being asserted. G_vrm_thermal_monitoring = FALSE; - G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM].error_count = 0xFF; + G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM_OT_STATUS].error_count = 0xFF; CMDH_TRAC_IMP("data_store_thrm_thresholds: No VRM limits received. 
OCC will not monitor AVS bus status"); } diff --git a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h index fcb48939..dbeb768f 100755 --- a/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h +++ b/src/occ_405/cmdh/cmdh_fsp_cmds_datacnfg.h @@ -76,9 +76,10 @@ typedef enum DATA_FRU_PROC = 0x00, DATA_FRU_CENTAUR = 0x01, DATA_FRU_DIMM = 0x02, - DATA_FRU_VRM = 0x03, + DATA_FRU_VRM_OT_STATUS = 0x03, // this is just a bit indicating OT or not DATA_FRU_GPU = 0x04, DATA_FRU_GPU_MEM = 0x05, + DATA_FRU_VRM_VDD = 0x06, // this is an actual temperature reading for VRM Vdd DATA_FRU_MAX, } eConfigDataFruType; @@ -218,6 +219,16 @@ typedef struct __attribute__ ((packed)) cmdh_sys_config_data_v20_t sys_config; }cmdh_sys_config_v20_t; +typedef struct __attribute__ ((packed)) +{ + struct cmdh_fsp_cmd_header; + uint8_t format; + uint8_t version; + cmdh_sys_config_data_v20_t sys_config; + uint32_t vrm_vdd_sid; // VRM Vdd Sensor ID for hw callout + uint32_t vrm_vdd_temp_sid; // VRM Vdd Temperature sensor ID +}cmdh_sys_config_v21_t; + // Used by TMGT to send OCC the IPS config data. typedef struct __attribute__ ((packed)) { diff --git a/src/occ_405/main.c b/src/occ_405/main.c index ad09280f..f80486e8 100755 --- a/src/occ_405/main.c +++ b/src/occ_405/main.c @@ -1343,10 +1343,11 @@ void hmon_routine() } //if we are in observation, characterization, or activate state, then monitor the processor - //temperature for timeout conditions and the processor VRHOT signal. 
+ //and VRM Vdd temperatures for timeout conditions if (IS_OCC_STATE_OBSERVATION() || IS_OCC_STATE_ACTIVE() || IS_OCC_STATE_CHARACTERIZATION()) { amec_health_check_proc_timeout(); +// enable with VRM Vdd read support amec_health_check_vrm_vdd_temp_timeout(); } //if we are in observation, characterization, or active state with memory temperature data diff --git a/src/occ_405/occ_service_codes.h b/src/occ_405/occ_service_codes.h index c740c059..fd0fbecb 100644 --- a/src/occ_405/occ_service_codes.h +++ b/src/occ_405/occ_service_codes.h @@ -61,8 +61,8 @@ enum occReasonCode EXTERNAL_INTERFACE_FAILURE = 0x18, /// Incorrect number of active quads reported INVALID_ACTIVE_QUAD_COUNT = 0x19, - /// VRM reached error threshold (VR_HOT asserted) - VRM_ERROR_TEMP = 0x20, + /// VRM Vdd reached error temperature threshold + VRM_VDD_ERROR_TEMP = 0x20, /// VR_FAN - AVS Bus over-temperature reported VRM_VRFAN_WARNING = 0x22, /// GPIO_VR_HOT_MEM_PROC signal from APSS asserted @@ -210,9 +210,9 @@ enum occExtReasonCode ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE = 0x002F, - ERC_APLT_INIT_FAILURE = 0x0030, - ERC_APLT_START_VERSION_MISMATCH = 0x0031, - ERC_APLT_START_CHECKSUM_MISMATCH = 0x0032, + ERC_AMEC_VRM_VDD_TEMP_TIMEOUT = 0x0030, + ERC_AMEC_DIMM_TEMP_TIMEOUT = 0x0031, + ERC_AMEC_CENT_TEMP_TIMEOUT = 0x0032, ERC_CMDH_MBOX_REQST_FAILURE = 0x0040, ERC_CMDH_INTERNAL_FAILURE = 0x0041, diff --git a/src/occ_405/occ_sys_config.h b/src/occ_405/occ_sys_config.h index 9e5afcdd..e398a6cc 100755 --- a/src/occ_405/occ_sys_config.h +++ b/src/occ_405/occ_sys_config.h @@ -340,6 +340,9 @@ typedef struct // DPSS HUID - Used by OCC for DPSS error call out uint32_t dpss_huid; + // VRM Vdd HUID - Used by OCC for VRM Vdd error call out + uint32_t vrm_vdd_huid; + // Contains how many OCCs & how many proc modules are present. 
uint8_t sys_num_proc_present; diff --git a/src/occ_405/pss/avsbus.c b/src/occ_405/pss/avsbus.c index d6252c52..7c155f9f 100644 --- a/src/occ_405/pss/avsbus.c +++ b/src/occ_405/pss/avsbus.c @@ -768,7 +768,7 @@ uint16_t avsbus_read_status(const avsbus_type_e i_type) uint16_t o_reading = 0; bool l_failure = FALSE; - const uint8_t max_read_attempts = G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM].max_read_timeout; + const uint8_t max_read_attempts = G_data_cnfg->thrm_thresh.data[DATA_FRU_VRM_OT_STATUS].max_read_timeout; // Static error counters for each type (Vdd/Vdn) static uint32_t L_error_count[ERRORCOUNT_MAXTYPES] = {0}; diff --git a/src/occ_405/sensor/sensor_enum.h b/src/occ_405/sensor/sensor_enum.h index f3fe7436..433530e1 100755 --- a/src/occ_405/sensor/sensor_enum.h +++ b/src/occ_405/sensor/sensor_enum.h @@ -145,6 +145,7 @@ enum e_gsid CURVDD, CURVDN, VRMPROCOT, + TEMPVDD, // ------------------------------------------------------ // Core Sensors diff --git a/src/occ_405/sensor/sensor_info.c b/src/occ_405/sensor/sensor_info.c index 23bda41c..06f3ff1e 100755 --- a/src/occ_405/sensor/sensor_info.c +++ b/src/occ_405/sensor/sensor_info.c @@ -29,7 +29,9 @@ #define AMEEFP_250US_IN_HZ AMEFP(4,3) // 4000 Hz #define AMEEFP_500US_IN_HZ AMEFP(2,3) // 2000 Hz #define AMEEFP_1MS_IN_HZ AMEFP(1,3) // 1000 Hz +#define AMEEFP_1500US_IN_HZ AMEFP(75,1) // 750 Hz #define AMEEFP_2MS_IN_HZ AMEFP(5,2) // 500 Hz +#define AMEEFP_3MS_IN_HZ AMEFP(375,0) // 375 Hz #define AMEEFP_4MS_IN_HZ AMEFP(25,1) // 250 Hz #define AMEEFP_8MS_IN_HZ AMEFP(125,0) // 125 Hz #define AMEEFP_16MS_IN_HZ AMEFP(625,-1) // 62.5 Hz @@ -40,14 +42,16 @@ #define AMEFP_SCALE_0_16384 AMEFP(610352,-8) // scalar so that digital 16384=100% // constants to allow fewer changes if tick time changes -#define AMEEFP_EVERY_TICK_HZ AMEEFP_500US_IN_HZ // tick time 500us -#define AMEEFP_EVERY_2ND_TICK_HZ AMEEFP_1MS_IN_HZ // 1ms -#define AMEEFP_EVERY_4TH_TICK_HZ AMEEFP_2MS_IN_HZ // 2ms -#define AMEEFP_EVERY_8TH_TICK_HZ 
AMEEFP_4MS_IN_HZ // 4ms -#define AMEEFP_EVERY_16TH_TICK_HZ AMEEFP_8MS_IN_HZ // 8ms -#define AMEEFP_EVERY_32ND_TICK_HZ AMEEFP_16MS_IN_HZ // 16ms -#define AMEEFP_EVERY_64TH_TICK_HZ AMEEFP_32MS_IN_HZ // 32ms -#define AMEEFP_EVERY_128TH_TICK_HZ AMEEFP_64MS_IN_HZ // 64ms +#define AMEEFP_EVERY_TICK_HZ AMEEFP_500US_IN_HZ // tick time 500us +#define AMEEFP_EVERY_2ND_TICK_HZ AMEEFP_1MS_IN_HZ // 1ms +#define AMEEFP_EVERY_3RD_TICK_HZ AMEEFP_1500US_IN_HZ // 1.5ms +#define AMEEFP_EVERY_4TH_TICK_HZ AMEEFP_2MS_IN_HZ // 2ms +#define AMEEFP_EVERY_6TH_TICK_HZ AMEEFP_3MS_IN_HZ // 3ms +#define AMEEFP_EVERY_8TH_TICK_HZ AMEEFP_4MS_IN_HZ // 4ms +#define AMEEFP_EVERY_16TH_TICK_HZ AMEEFP_8MS_IN_HZ // 8ms +#define AMEEFP_EVERY_32ND_TICK_HZ AMEEFP_16MS_IN_HZ // 16ms +#define AMEEFP_EVERY_64TH_TICK_HZ AMEEFP_32MS_IN_HZ // 32ms +#define AMEEFP_EVERY_128TH_TICK_HZ AMEEFP_64MS_IN_HZ // 64ms // This will get the string when given the GSID #define SENSOR_GSID_TO_STRING(gsid) G_sensor_list[gsid].name; @@ -320,8 +324,8 @@ const sensor_info_t G_sensor_info[] = SENSOR_INFO_T_ENTRY( TEMPPROCTHRM, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_16TH_TICK_HZ, AMEFP( 1, 0) ), SENSOR_INFO_T_ENTRY( UTIL, "%\0", AMEC_SENSOR_TYPE_UTIL, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_16TH_TICK_HZ, AMEFP( 1,-2) ), SENSOR_INFO_T_ENTRY( TEMPNEST, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_16TH_TICK_HZ, AMEFP( 1, 0) ), - SENSOR_INFO_T_ENTRY( VOLTVDDSENSE, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, -1) ), - SENSOR_INFO_T_ENTRY( VOLTVDNSENSE, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, -1) ), + SENSOR_INFO_T_ENTRY( VOLTVDDSENSE, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, AMEFP( 1, -1) ), + SENSOR_INFO_T_ENTRY( VOLTVDNSENSE, "mV\0", 
AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, AMEFP( 1, -1) ), SENSOR_INFO_T_ENTRY( PWRVDD, "W\0", AMEC_SENSOR_TYPE_POWER, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, 0) ), SENSOR_INFO_T_ENTRY( PWRVDN, "W\0", AMEC_SENSOR_TYPE_POWER, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, 0) ), SENSOR_INFO_T_ENTRY( PROCPWRTHROT, "#\0", AMEC_SENSOR_TYPE_PERF, AMEC_SENSOR_LOC_PROC, AMEC_SENSOR_NONUM, AMEEFP_EVERY_TICK_HZ, AMEFP( 1, 0) ), @@ -331,11 +335,12 @@ const sensor_info_t G_sensor_info[] = SENS_QUAD_ENTRY_SET( VOLTDROOPCNTQ, "#\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_QUAD, AMEC_SENSOR_NONUM, AMEEFP_EVERY_16TH_TICK_HZ, AMEFP( 1, 0) ), /* ==ReguSensors== NameString Units Type Location Number Freq ScaleFactor */ - SENSOR_INFO_T_ENTRY( VOLTVDD, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, -1) ), - SENSOR_INFO_T_ENTRY( VOLTVDN, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1, -1) ), - SENSOR_INFO_T_ENTRY( CURVDD, "A\0", AMEC_SENSOR_TYPE_CURRENT, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1,-2) ), - SENSOR_INFO_T_ENTRY( CURVDN, "A\0", AMEC_SENSOR_TYPE_CURRENT, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_2ND_TICK_HZ, AMEFP( 1,-2) ), - SENSOR_INFO_T_ENTRY( VRMPROCOT, "#\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_4TH_TICK_HZ, AMEFP( 1, 0) ), + SENSOR_INFO_T_ENTRY( VOLTVDD, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, AMEFP( 1, -1) ), + SENSOR_INFO_T_ENTRY( VOLTVDN, "mV\0", AMEC_SENSOR_TYPE_VOLTAGE, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, AMEFP( 1, -1) ), + SENSOR_INFO_T_ENTRY( CURVDD, "A\0", AMEC_SENSOR_TYPE_CURRENT, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, 
AMEFP( 1,-2) ), + SENSOR_INFO_T_ENTRY( CURVDN, "A\0", AMEC_SENSOR_TYPE_CURRENT, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_3RD_TICK_HZ, AMEFP( 1,-2) ), + SENSOR_INFO_T_ENTRY( VRMPROCOT, "#\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_6TH_TICK_HZ, AMEFP( 1, 0) ), + SENSOR_INFO_T_ENTRY( TEMPVDD, "C\0", AMEC_SENSOR_TYPE_TEMP, AMEC_SENSOR_LOC_VRM, AMEC_SENSOR_NONUM, AMEEFP_EVERY_6TH_TICK_HZ, AMEFP( 1, 0) ), /* ==CoreSensors== NameString Units Type Location Number Freq ScaleFactor */ SENS_CORE_ENTRY_SET( FREQREQC, "MHz\0", AMEC_SENSOR_TYPE_FREQ, AMEC_SENSOR_LOC_CORE, AMEC_SENSOR_NONUM, AMEEFP_EVERY_TICK_HZ, AMEFP( 1, 0) ), diff --git a/src/occ_405/sensor/sensor_table.c b/src/occ_405/sensor/sensor_table.c index c2dabdb2..1b7aef05 100755 --- a/src/occ_405/sensor/sensor_table.c +++ b/src/occ_405/sensor/sensor_table.c @@ -377,6 +377,7 @@ const sensor_ptr_t G_amec_sensor_list[] = SENSOR_PTR( CURVDD, &g_amec_sys.proc[0].curvdd), SENSOR_PTR( CURVDN, &g_amec_sys.proc[0].curvdn), SENSOR_PTR( VRMPROCOT, &g_amec_sys.sys.vrfan), + SENSOR_PTR( TEMPVDD, &g_amec_sys.sys.tempvdd), // ------------------------------------------------------ @@ -558,6 +559,7 @@ const minisensor_ptr_t G_amec_mini_sensor_list[] INIT_SECTION = MINI_SENSOR_PTR( CURVDD, NULL), MINI_SENSOR_PTR( CURVDN, NULL), MINI_SENSOR_PTR( VRMPROCOT, NULL), + MINI_SENSOR_PTR( TEMPVDD, NULL), // ------------------------------------------------------ // Core Sensors (24 of each)