Skip to content

Commit

Permalink
VRM Vdd Interfaces
Browse files Browse the repository at this point in the history
Change-Id: I8e2b597773c940ebc79972974a95fb323ea26660
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48065
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
  • Loading branch information
marthabroyles committed Oct 16, 2017
1 parent d4fb4c3 commit 051cc0a
Show file tree
Hide file tree
Showing 15 changed files with 332 additions and 71 deletions.
31 changes: 30 additions & 1 deletion src/occ_405/amec/amec_data.c
Expand Up @@ -261,11 +261,40 @@ errlHndl_t AMEC_data_write_thrm_thresholds(const OCC_MODE i_mode)
TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for DIMM",
l_dvfs_temp);

g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM].error_count;
g_amec->vrhotproc.setpoint = l_frudata[DATA_FRU_VRM_OT_STATUS].error_count;

TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRHOT",
g_amec->vrhotproc.setpoint);

// Store the VRM Vdd thermal data
if(i_mode == OCC_MODE_NOMINAL)
{
l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].dvfs;
l_error = l_frudata[DATA_FRU_VRM_VDD].error;
}
else
{
l_dvfs_temp = l_frudata[DATA_FRU_VRM_VDD].pm_dvfs;
if(i_mode == OCC_MODE_TURBO)
{
//Need to log an error if we dvfs in static turbo mode (for mfg)
l_error = l_dvfs_temp;
}
else
{
l_error = l_frudata[DATA_FRU_VRM_VDD].pm_error;
}
}
// Store the DVFS thermal setpoint in 0.1 degrees C
g_amec->thermalvdd.setpoint = l_dvfs_temp * 10;
// Store the error temperature for OT detection
g_amec->thermalvdd.ot_error = l_error;
// Store the temperature timeout value
g_amec->thermalvdd.temp_timeout = l_frudata[DATA_FRU_VRM_VDD].max_read_timeout;

TRAC_INFO("AMEC_data_write_thrm_thresholds: Setting %u as DVFS setpoint for VRM Vdd",
l_dvfs_temp);

} while(0);

return l_err;
Expand Down
197 changes: 193 additions & 4 deletions src/occ_405/amec/amec_health.c
Expand Up @@ -67,6 +67,9 @@ uint8_t G_cent_temp_expired_bitmap = 0;
// Array to store the update tag of each core's temperature sensor
uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0};

// Has reading the VRM Vdd temperature timed out?
bool G_vrm_vdd_temp_expired = false;

//*************************************************************************/
// Function Declarations
//*************************************************************************/
Expand Down Expand Up @@ -398,13 +401,13 @@ void amec_health_check_dimm_timeout()
* @reasoncode FRU_TEMP_TIMEOUT
* @userdata1 timeout value in seconds
* @userdata2 0
* @userdata4 OCC_NO_EXTENDED_RC
* @userdata4 ERC_AMEC_DIMM_TEMP_TIMEOUT
* @devdesc Failed to read a memory DIMM temperature
*
*/
l_err = createErrl(AMEC_HEALTH_CHECK_DIMM_TIMEOUT, //modId
FRU_TEMP_TIMEOUT, //reasoncode
OCC_NO_EXTENDED_RC, //Extended reason code
ERC_AMEC_DIMM_TEMP_TIMEOUT, //Extended reason code
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
Expand Down Expand Up @@ -706,14 +709,14 @@ void amec_health_check_cent_timeout()
* @reasoncode FRU_TEMP_TIMEOUT
* @userdata1 timeout value in seconds
* @userdata2 0
* @userdata4 OCC_NO_EXTENDED_RC
* @userdata4 ERC_AMEC_CENT_TEMP_TIMEOUT
* @devdesc Failed to read a centaur memory controller
* temperature
*
*/
l_err = createErrl(AMEC_HEALTH_CHECK_CENT_TIMEOUT, //modId
FRU_TEMP_TIMEOUT, //reasoncode
OCC_NO_EXTENDED_RC, //Extended reason code
ERC_AMEC_CENT_TEMP_TIMEOUT, //Extended reason code
ERRL_SEV_PREDICTIVE, //Severity
NULL, //Trace Buf
DEFAULT_TRACE_SIZE, //Trace Size
Expand Down Expand Up @@ -999,6 +1002,192 @@ void amec_health_check_proc_timeout()
}while(0);
}

// Function Specification
//
// Name: amec_health_check_vrm_vdd_temp
//
// Description: Compares the current TEMPVDD sensor sample against the VRM Vdd
//              error temperature (g_amec->thermalvdd.ot_error, the error
//              temperature sent in data format 0x13). When the sample has been
//              above the error temperature for AMEC_HEALTH_ERROR_TIMER
//              consecutive calls, a predictive error is created and committed.
//              The error is only logged once.
//
// End Function Specification
void amec_health_check_vrm_vdd_temp()
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    // Number of consecutive calls in which the sample was above the error temp
    static uint32_t L_over_temp_cnt = 0;
    // Set once the OT error has been logged so it is never logged again
    static BOOLEAN  L_already_logged = FALSE;
    sensor_t       *l_vdd_sensor = getSensorByGsid(TEMPVDD);
    uint16_t        l_threshold = g_amec->thermalvdd.ot_error;
    errlHndl_t      l_err = NULL;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/

    // Not above the error temperature: trace the recovery if the error timer
    // had been reached, then restart the count
    if (l_vdd_sensor->sample <= l_threshold)
    {
        if (L_over_temp_cnt >= AMEC_HEALTH_ERROR_TIMER)
        {
            TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]",
                      l_vdd_sensor->sample, l_threshold, L_over_temp_cnt);
        }

        L_over_temp_cnt = 0;
        return;
    }

    // Above the error temperature: count this occurrence
    L_over_temp_cnt++;

    // Only act on the call where the error timer is reached, and only if
    // the OT error has not been logged before
    if ((L_over_temp_cnt != AMEC_HEALTH_ERROR_TIMER) ||
        (L_already_logged == TRUE))
    {
        return;
    }

    L_already_logged = TRUE;

    TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! temp[%u] ot_error[%u]",
             l_vdd_sensor->sample,
             l_threshold);

    // Log an OT error
    /* @
     * @errortype
     * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TEMP
     * @reasoncode VRM_VDD_ERROR_TEMP
     * @userdata1 0
     * @userdata2 Fru peak temperature sensor
     * @devdesc VRM Vdd has reached error temperature
     * threshold and is called out in this error log.
     *
     */
    l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TEMP,        //modId
                       VRM_VDD_ERROR_TEMP,                    //reasoncode
                       ERC_AMEC_PROC_ERROR_OVER_TEMPERATURE,  //Extended reason code
                       ERRL_SEV_PREDICTIVE,                   //Severity
                       NULL,                                  //Trace Buf
                       DEFAULT_TRACE_SIZE,                    //Trace Size
                       0,                                     //userdata1
                       l_vdd_sensor->sample_max);             //userdata2

    // High priority callout: over temperature component ID
    addCalloutToErrl(l_err,
                     ERRL_CALLOUT_TYPE_COMPONENT_ID,
                     ERRL_COMPONENT_ID_OVER_TEMPERATURE,
                     ERRL_CALLOUT_PRIORITY_HIGH);

    // Medium priority callout: the VRM Vdd itself
    addCalloutToErrl(l_err,
                     ERRL_CALLOUT_TYPE_HUID,
                     G_sysConfigData.vrm_vdd_huid,
                     ERRL_CALLOUT_PRIORITY_MED);

    // Commit Error
    commitErrl(&l_err);
}

// Function Specification
//
// Name: amec_health_check_vrm_vdd_temp_timeout
//
// Description: Detects a stale VRM Vdd temperature by watching the TEMPVDD
//              sensor update tag. Each call in which the tag has not changed
//              counts as a failed read. When the number of consecutive failed
//              reads equals g_amec->thermalvdd.temp_timeout (and that timeout
//              is not 0xFF), G_vrm_vdd_temp_expired is set and, the first time
//              only, an error is logged and a reset is requested.
//
// End Function Specification
void amec_health_check_vrm_vdd_temp_timeout()
{
    /*------------------------------------------------------------------------*/
    /*  Local Variables                                                       */
    /*------------------------------------------------------------------------*/
    // Consecutive calls in which the sensor update tag did not change
    static uint32_t L_missed_reads = 0;
    // Set once the timeout error has been logged so it is never logged again
    static BOOLEAN  L_timeout_logged = FALSE;
    // Update tag seen on the last call in which the sensor had been updated
    static uint32_t L_last_update_tag = 0;
    errlHndl_t      l_err = NULL;
    uint32_t        l_current_tag = AMECSENSOR_PTR(TEMPVDD)->update_tag;

    /*------------------------------------------------------------------------*/
    /*  Code                                                                  */
    /*------------------------------------------------------------------------*/

    // A changed update tag means the temperature sensor was updated since the
    // last call: clear the failure count and the expired indication
    if (l_current_tag != L_last_update_tag)
    {
        L_missed_reads = 0;
        G_vrm_vdd_temp_expired = false;
        L_last_update_tag = l_current_tag;
        return;
    }

    // Update tag is unchanged, the temperature sensor is not being updated
    L_missed_reads++;

    // Nothing more to do unless this call reaches the maximum read time
    // allowed; a timeout value of 0xFF never triggers the timeout handling
    if ((g_amec->thermalvdd.temp_timeout == 0xFF) ||
        (L_missed_reads != g_amec->thermalvdd.temp_timeout))
    {
        return;
    }

    // Temperature has expired. Notify control algorithms
    G_vrm_vdd_temp_expired = true;

    // The error is only logged one time
    if (L_timeout_logged != FALSE)
    {
        return;
    }

    L_timeout_logged = TRUE;

    TRAC_ERR("Timed out reading VRM Vdd temperature for timeout[%u]",
             g_amec->thermalvdd.temp_timeout);

    /* @
     * @errortype
     * @moduleid AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT
     * @reasoncode FRU_TEMP_TIMEOUT
     * @userdata1 timeout value in seconds
     * @userdata2 0
     * @userdata4 ERC_AMEC_VRM_VDD_TEMP_TIMEOUT
     * @devdesc Failed to read VRM Vdd temperature.
     *
     */
    l_err = createErrl(AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT,  //modId
                       FRU_TEMP_TIMEOUT,                   //reasoncode
                       ERC_AMEC_VRM_VDD_TEMP_TIMEOUT,      //Extended reason code
                       ERRL_SEV_PREDICTIVE,                //Severity
                       NULL,                               //Trace Buf
                       DEFAULT_TRACE_SIZE,                 //Trace Size
                       g_amec->thermalvdd.temp_timeout,    //userdata1
                       0);                                 //userdata2

    // Callout the VRM
    addCalloutToErrl(l_err,
                     ERRL_CALLOUT_TYPE_HUID,
                     G_sysConfigData.vrm_vdd_huid,
                     ERRL_CALLOUT_PRIORITY_MED);

    // Commit error log and request reset
    REQUEST_RESET(l_err);
}

/*----------------------------------------------------------------------------*/
/* End */
/*----------------------------------------------------------------------------*/
2 changes: 2 additions & 0 deletions src/occ_405/amec/amec_health.h
Expand Up @@ -51,5 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent,
uint8_t i_dimm,
uint8_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap);
// Checks if the VRM Vdd temperature has exceeded its error temperature
void amec_health_check_vrm_vdd_temp(void);
// Checks if OCC has failed to read the VRM Vdd temperature for longer than
// the maximum allowed
void amec_health_check_vrm_vdd_temp_timeout(void);

#endif
40 changes: 21 additions & 19 deletions src/occ_405/amec/amec_service_codes.h
Expand Up @@ -48,25 +48,27 @@
/*----------------------------------------------------------------------------*/
enum occAmecModuleId
{
AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00,
AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01,
AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02,
AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03,
AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04,
AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05,
AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06,
AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07,
AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08,
AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09,
AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10,
AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11,
AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12,
AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13,
AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14,
AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16,
AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17,
AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18,
AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19,
AMEC_INITIALIZE_FW_SENSORS = AMEC_COMP_ID | 0x00,
AMEC_UPDATE_FW_SENSORS = AMEC_COMP_ID | 0x01,
AMEC_VECTORIZE_FW_SENSORS = AMEC_COMP_ID | 0x02,
AMEC_AMESTER_INTERFACE = AMEC_COMP_ID | 0x03,
AMEC_PCAP_CONN_OC_CONTROLLER = AMEC_COMP_ID | 0x04,
AMEC_MST_CHECK_PCAPS_MATCH = AMEC_COMP_ID | 0x05,
AMEC_MST_CHECK_UNDER_PCAP = AMEC_COMP_ID | 0x06,
AMEC_SLAVE_CHECK_PERFORMANCE = AMEC_COMP_ID | 0x07,
AMEC_HEALTH_CHECK_PROC_TEMP = AMEC_COMP_ID | 0x08,
AMEC_HEALTH_CHECK_DIMM_TEMP = AMEC_COMP_ID | 0x09,
AMEC_HEALTH_CHECK_CENT_TEMP = AMEC_COMP_ID | 0x10,
AMEC_HEALTH_CHECK_DIMM_TIMEOUT = AMEC_COMP_ID | 0x11,
AMEC_HEALTH_CHECK_CENT_TIMEOUT = AMEC_COMP_ID | 0x12,
AMEC_HEALTH_CHECK_VRFAN_TIMEOUT = AMEC_COMP_ID | 0x13,
AMEC_HEALTH_CHECK_PROC_TIMEOUT = AMEC_COMP_ID | 0x14,
AMEC_CALC_DTS_SENSORS = AMEC_COMP_ID | 0x16,
AMEC_SET_FREQ_RANGE = AMEC_COMP_ID | 0x17,
AMEC_UPDATE_APSS_GPIO = AMEC_COMP_ID | 0x18,
AMEC_GPU_PCAP_MID = AMEC_COMP_ID | 0x19,
AMEC_HEALTH_CHECK_VRM_VDD_TEMP = AMEC_COMP_ID | 0x1A,
AMEC_HEALTH_CHECK_VRM_VDD_TIMEOUT = AMEC_COMP_ID | 0x1B,
};

/*----------------------------------------------------------------------------*/
Expand Down
3 changes: 3 additions & 0 deletions src/occ_405/amec/amec_sys.h
Expand Up @@ -362,6 +362,7 @@ typedef struct
sensor_t vrhot_mem_proc;

sensor_t vrfan;
sensor_t tempvdd;

// Chip Sensors
sensor_t todclock0;
Expand Down Expand Up @@ -687,6 +688,8 @@ typedef struct
amec_controller_t thermaldimm;
// Thermal Controller based on VRHOT signal from processor VRM
amec_controller_t vrhotproc;
// Thermal Controller based on VRM Vdd temperatures
amec_controller_t thermalvdd;

// Oversubscription Status
oversub_status_t oversub_status;
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/cmdh/cmdh_fsp_cmds.c
Expand Up @@ -349,7 +349,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
if (vrfan != NULL)
{
l_tempSensorList[l_sensorHeader.count].id = 0;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM_OT_STATUS;
l_tempSensorList[l_sensorHeader.count].value = vrfan->sample & 0xFF;
l_sensorHeader.count++;
}
Expand Down

0 comments on commit 051cc0a

Please sign in to comment.