Skip to content

Commit

Permalink
Read VRM Vdd Temperatures
Browse files: browse the repository at this point in the history
Change-Id: I428417a8e94e2b3f8892998ef280e454c46405ed
RTC: 180433
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/48131
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
  • Loading branch information
cjcain authored and wilbryan committed Oct 19, 2017
1 parent e00c5e2 commit c07a720
Show file tree
Hide file tree
Showing 11 changed files with 129 additions and 52 deletions.
15 changes: 6 additions & 9 deletions src/occ_405/amec/amec_health.c
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2016 */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -1010,28 +1010,25 @@ void amec_health_check_proc_timeout()
// exceeded the error temperature sent in data format 0x13.
//
// End Function Specification
void amec_health_check_vrm_vdd_temp()
void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor)
{
/*------------------------------------------------------------------------*/
/* Local Variables */
/*------------------------------------------------------------------------*/
uint16_t l_ot_error;
static uint32_t L_error_count = 0;
static BOOLEAN L_ot_error_logged = FALSE;
sensor_t *l_sensor;
errlHndl_t l_err = NULL;

/*------------------------------------------------------------------------*/
/* Code */
/*------------------------------------------------------------------------*/
do
{
// Get TEMPVDD sensor
l_sensor = getSensorByGsid(TEMPVDD);
l_ot_error = g_amec->thermalvdd.ot_error;

// Check to see if we exceeded our error temperature
if (l_sensor->sample > l_ot_error)
if (i_sensor->sample > l_ot_error)
{
// Increment the error counter for this FRU
L_error_count++;
Expand All @@ -1048,7 +1045,7 @@ void amec_health_check_vrm_vdd_temp()
L_ot_error_logged = TRUE;

TRAC_ERR("amec_health_check_vrm_vdd_temp: VRM vdd has exceeded OT error! temp[%u] ot_error[%u]",
l_sensor->sample,
i_sensor->sample,
l_ot_error);

// Log an OT error
Expand All @@ -1069,7 +1066,7 @@ void amec_health_check_vrm_vdd_temp()
NULL,
DEFAULT_TRACE_SIZE,
0,
l_sensor->sample_max);
i_sensor->sample_max);

// Callout the Ambient procedure
addCalloutToErrl(l_err,
Expand All @@ -1093,7 +1090,7 @@ void amec_health_check_vrm_vdd_temp()
if (L_error_count >= AMEC_HEALTH_ERROR_TIMER)
{
TRAC_INFO("amec_health_check_vrm_vdd_temp: VRM Vdd temp [%u] now below error temp [%u] after error_count [%u]",
l_sensor->sample, l_ot_error, L_error_count);
i_sensor->sample, l_ot_error, L_error_count);
}

// Reset the error counter for this FRU
Expand Down
4 changes: 2 additions & 2 deletions src/occ_405/amec/amec_health.h
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2015 */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -51,7 +51,7 @@ void amec_mem_mark_logged(uint8_t i_cent,
uint8_t i_dimm,
uint8_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap);
void amec_health_check_vrm_vdd_temp(void);
void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor);
void amec_health_check_vrm_vdd_temp_timeout(void);

#endif
46 changes: 30 additions & 16 deletions src/occ_405/amec/amec_sensors_power.c
Expand Up @@ -591,12 +591,12 @@ void amec_update_avsbus_sensors(void)
AVSBUS_STATE_INITIATE_READ = 1,
AVSBUS_STATE_PROCESS_CURRENT = 2,
AVSBUS_STATE_PROCESS_VOLTAGE = 3,
AVSBUS_STATE_PROCESS_STATUS = 4
AVSBUS_STATE_PROCESS_STATUS = 4,
AVSBUS_STATE_PROCESS_TEMPERATURE= 5
} L_avsbus_state = AVSBUS_STATE_INITIATE_READ;

// Number of Curr/Volt readings between Status readings
#define NUM_VRM_READINGS_PER_STATUS 2
static unsigned int L_readingCount = 0;
// Flag to select either temperature or status to read on 3rd tick
static bool L_read_temp = true;

if (isSafeStateRequested())
{
Expand Down Expand Up @@ -626,31 +626,45 @@ void amec_update_avsbus_sensors(void)
// Process the voltage readings
process_avsbus_voltage();

if ((G_vrm_thermal_monitoring == FALSE) || (++L_readingCount < NUM_VRM_READINGS_PER_STATUS))
// Initiate read of temperature or error status (OT/OC)
if (L_read_temp)
{
// Initiate read of currents
initiate_avsbus_reads(AVSBUS_CURRENT);
L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT;
// Initiate AVS Bus read for Vdd temperature
avsbus_read_start(AVSBUS_VDD, AVSBUS_TEMPERATURE);
L_avsbus_state = AVSBUS_STATE_PROCESS_TEMPERATURE;
}
else
{
// Periodically read status for VR FAN (VRM OT WARNING)
initiate_avsbus_read_status();
L_avsbus_state = AVSBUS_STATE_PROCESS_STATUS;
L_readingCount = 0;
}
// Toggle between reading temperature and status
L_read_temp = !L_read_temp;
break;

case AVSBUS_STATE_PROCESS_STATUS:
// Process the status
{
// Update sensor with the OT status (0 / 1)
// Process the status
uint16_t otStatus = process_avsbus_status();
sensor_update(AMECSENSOR_PTR(VRMPROCOT), otStatus);
if (G_vrm_thermal_monitoring)
{
// Update sensor with the OT status (0 / 1)
sensor_update(AMECSENSOR_PTR(VRMPROCOT), otStatus);
}
// Back to reading currents
initiate_avsbus_reads(AVSBUS_CURRENT);
L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT;
}
break;

case AVSBUS_STATE_PROCESS_TEMPERATURE:
{
// Read and process Vdd temperature
avsbus_read(AVSBUS_VDD, AVSBUS_TEMPERATURE);
// Back to reading currents
initiate_avsbus_reads(AVSBUS_CURRENT);
L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT;
}
// Back to reading currents
initiate_avsbus_reads(AVSBUS_CURRENT);
L_avsbus_state = AVSBUS_STATE_PROCESS_CURRENT;
break;

case AVSBUS_STATE_DISABLED:
Expand Down
22 changes: 22 additions & 0 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Expand Up @@ -53,6 +53,8 @@
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;
extern uint32_t G_first_proc_gpu_config;
extern bool G_vrm_vdd_temp_expired;


#include <gpe_export.h>
extern gpe_shared_data_t G_shared_gpe_data;
Expand Down Expand Up @@ -355,6 +357,26 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
}
}

if (G_avsbus_vdd_monitoring)
{
// Add Vdd temp
const sensor_t *tempvdd = getSensorByGsid(TEMPVDD);
if (tempvdd != NULL)
{
l_tempSensorList[l_sensorHeader.count].id = AMECSENSOR_PTR(TEMPVDD)->ipmi_sid;
l_tempSensorList[l_sensorHeader.count].fru_type = DATA_FRU_VRM_VDD;
if (G_vrm_vdd_temp_expired)
{
l_tempSensorList[l_sensorHeader.count].value = 0xFF;
}
else
{
l_tempSensorList[l_sensorHeader.count].value = tempvdd->sample & 0xFF;
}
l_sensorHeader.count++;
}
}

// Add GPU temperatures
for (k=0; k<MAX_NUM_GPU_PER_DOMAIN; k++)
{
Expand Down
8 changes: 5 additions & 3 deletions src/occ_405/errl/errl.h
Expand Up @@ -268,9 +268,6 @@ extern uint8_t G_occErrIdCounter;

extern errlHndl_t G_occErrSlots[ERRL_MAX_SLOTS];

// Array of error counters that are only cleared on OCC reset
#define ERR_HISTORY_SIZE 32
extern uint8_t G_error_history[ERR_HISTORY_SIZE];
typedef enum {
ERRH_AVSBUS_VDD_CURRENT = 0x01,
ERRH_AVSBUS_VDD_VOLTAGE = 0x02,
Expand All @@ -297,7 +294,12 @@ typedef enum {
ERRH_GPE1_NOT_IDLE = 0x17,
ERRH_24X7_DISABLED = 0x18,
ERRH_CEFF_RATIO_VDD_EXCURSION = 0x19,
ERRH_AVSBUS_VDD_TEMPERATURE = 0x1A,
ERR_HISTORY_SIZE = 0x20
} ERR_HISTORY_INDEX;

// Array of error counters that are only cleared on OCC reset
extern uint8_t G_error_history[ERR_HISTORY_SIZE];
#define INCREMENT_ERR_HISTORY(errorIndex) { \
if ((errorIndex < ERR_HISTORY_SIZE) && (G_error_history[errorIndex] < 255)) { \
++G_error_history[errorIndex]; \
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/main.c
Expand Up @@ -1414,7 +1414,7 @@ void hmon_routine()
if (IS_OCC_STATE_OBSERVATION() || IS_OCC_STATE_ACTIVE() || IS_OCC_STATE_CHARACTERIZATION())
{
amec_health_check_proc_timeout();
// enable with VRM Vdd read support amec_health_check_vrm_vdd_temp_timeout();
amec_health_check_vrm_vdd_temp_timeout();
}

//if we are in observation, characterization, or active state with memory temperature data
Expand Down
1 change: 1 addition & 0 deletions src/occ_405/occ_service_codes.h
Expand Up @@ -259,6 +259,7 @@ enum occExtReasonCode
ERC_AVSBUS_VDN_VOLTAGE_FAILURE = 0x00AC,
ERC_AVSBUS_VDN_CURRENT_FAILURE = 0x00AD,
ERC_AVSBUS_STATUS_FAILURE = 0x00AE,
ERC_AVSBUS_VDD_TEMPERATURE_FAILURE = 0x00AF,

ERC_PGPE_BEACON_TIMEOUT = 0x00B0,
ERC_PGPE_CLIP_NOT_IDLE = 0x00B1,
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/occbuildname.c
Expand Up @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =

#else

volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_171012a\0" /*</BuildName>*/ ;
volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_171020a\0" /*</BuildName>*/ ;

#endif

0 comments on commit c07a720

Please sign in to comment.