Skip to content

Commit

Permalink
Fix incorrect hw callout in Centaur DIMM OT errors
Browse files (browse the repository at this point in the history)
Change-Id: I2a7076f1a328daf18b3eff35cd75895c472a8962
CQ: SW470683
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/80639
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R. Gilbert <dgilbert@us.ibm.com>
Reviewed-by: William A Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
  • Loading branch information
marthabroyles committed Jul 19, 2019
1 parent bae814c commit d467852
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 19 deletions.
31 changes: 19 additions & 12 deletions src/occ_405/amec/amec_health.c
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -142,11 +142,12 @@ void amec_mem_mark_logged(uint8_t i_cent,
*/
void amec_health_check_dimm_temp()
{
uint16_t l_ot_error, l_cur_temp, l_max_temp;
uint16_t l_ot_error, l_max_temp;
sensor_t *l_sensor;
uint8_t l_dimm;
uint8_t l_port;
uint8_t l_max_port; // #ports in nimbus/#centaurs in cumulus
uint8_t l_max_port; // #ports in nimbus/#mem buf in cumulus/OCM
uint8_t l_max_dimm_per_port; // per port in nimbus/per mem buf in cumulus/OCM
uint32_t l_callouts_count = 0;
uint8_t l_new_callouts;
uint64_t l_huid;
Expand All @@ -155,10 +156,12 @@ void amec_health_check_dimm_temp()
if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
{
l_max_port = NUM_DIMM_PORTS;
l_max_dimm_per_port = NUM_DIMMS_PER_I2CPORT;
}
else // MEM_TYPE_CUMULUS
{
l_max_port = MAX_NUM_CENTAURS;
l_max_dimm_per_port = NUM_DIMMS_PER_CENTAUR;
}

// Check to see if any dimms have reached the error temperature that
Expand All @@ -170,7 +173,6 @@ void amec_health_check_dimm_temp()

l_ot_error = g_amec->thermaldimm.ot_error;
l_sensor = getSensorByGsid(TEMPDIMMTHRM);
l_cur_temp = l_sensor->sample;
l_max_temp = l_sensor->sample_max;

//iterate over all dimms
Expand All @@ -186,14 +188,15 @@ void amec_health_check_dimm_temp()
continue;
}

TRAC_ERR("amec_health_check_dimm_temp: DIMM reached error temp[%d]. current[%d], hist_max[%d], port[%d]",
l_ot_error,
l_cur_temp,
l_max_temp,
l_port);
// if the previous port had errors commit it so this port gets new error log
if(l_err)
{
commitErrl(&l_err);
l_callouts_count = 0;
}

//find the dimm(s) that need to be called out for this port
for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++)
{
if (!(l_new_callouts & (DIMM_SENSOR0 >> l_dimm)))
{
Expand All @@ -206,15 +209,19 @@ void amec_health_check_dimm_temp()
l_dimm,
&G_cent_overtemp_logged_bitmap,
&G_dimm_overtemp_logged_bitmap.bytes[l_port]);
TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X overtemp - %dC",
TRAC_ERR("amec_health_check_dimm_temp: DIMM%04X being called out for overtemp - %dC",
(l_port<<8)|l_dimm, l_fru->cur_temp);

// Create single elog with up to MAX_CALLOUTS
// Create single elog with up to MAX_CALLOUTS for this port
if(l_callouts_count < ERRL_MAX_CALLOUTS)
{
//If we don't have an error log for the callout, create one
if(!l_err)
{
TRAC_ERR("amec_health_check_dimm_temp: Creating log for port[%d] OT bitmap[0x%02X] logged bitmap[0x%02X]",
l_port,
G_dimm_overtemp_bitmap.bytes[l_port],
G_dimm_overtemp_logged_bitmap.bytes[l_port]);
/* @
* @errortype
* @moduleid AMEC_HEALTH_CHECK_DIMM_TEMP
Expand Down
15 changes: 13 additions & 2 deletions src/occ_405/amec/amec_sensors_centaur.c
Expand Up @@ -71,7 +71,7 @@ void amec_perfcount_getmc( CentaurMemData * i_sensor_cache, uint8_t i_centaur);

// Function Specification
//
// Name: amec_update_dimm_dts_sensors
// Name: amec_update_centaur_sensors
//
// Description: Updates sensors that have data grabbed by the fast core data
// task.
Expand Down Expand Up @@ -116,6 +116,7 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen
uint32_t l_sens_status;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_CENTAURS] = {FALSE};
static bool L_ot_traced[MAX_NUM_CENTAURS][NUM_DIMMS_PER_CENTAUR] = {{false}};

// Harvest thermal data for all dimms
for(k=0; k < NUM_DIMMS_PER_CENTAUR; k++)
Expand Down Expand Up @@ -236,7 +237,17 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen
if(l_dts[k] >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
G_dimm_overtemp_bitmap.bytes[i_centaur] |= 1 << k;
G_dimm_overtemp_bitmap.bytes[i_centaur] |= (DIMM_SENSOR0 >> k);
// trace first time OT per DIMM
if( !L_ot_traced[i_centaur][k] )
{
TRAC_ERR("amec_update_dimm_dts_sensors: centaur[%d] DIMM[%d] reached error temp[%d]. current[%d]",
i_centaur,
k,
g_amec->thermaldimm.ot_error,
l_dts[k]);
L_ot_traced[i_centaur][k] = true;
}
}
}

Expand Down
15 changes: 13 additions & 2 deletions src/occ_405/amec/amec_sensors_ocmb.c
Expand Up @@ -71,7 +71,7 @@ void amec_perfcount_ocmb_getmc( OcmbMemData * i_sensor_cache, uint8_t i_membuf);

// Function Specification
//
// Name: amec_update_ocmb_dimm_dts_sensors
// Name: amec_update_ocmb_sensors
//
// Description: Updates sensors that have data grabbed by the fast core data
// task.
Expand Down Expand Up @@ -119,6 +119,7 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
uint32_t l_hottest_dimm_loc = NUM_DIMMS_PER_OCMB;
int32_t l_dimm_temp, l_prev_temp;
static uint8_t L_ran_once[MAX_NUM_OCMBS] = {FALSE};
static bool L_ot_traced[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB] = {{false}};

// Harvest thermal data for all dimms
for(k=0; k < NUM_DIMMS_PER_OCMB; k++)
Expand Down Expand Up @@ -253,7 +254,17 @@ void amec_update_ocmb_dimm_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_m
if(l_dts[k] >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
G_dimm_overtemp_bitmap.bytes[i_membuf] |= 1 << k;
G_dimm_overtemp_bitmap.bytes[i_membuf] |= (DIMM_SENSOR0 >> k);
// trace first time OT per DIMM
if( !L_ot_traced[i_membuf][k] )
{
TRAC_ERR("amec_update_ocmb_dimm_dts_sensors: Mem Buf[%d] DIMM[%d] reached error temp[%d]. current[%d]",
i_membuf,
k,
g_amec->thermaldimm.ot_error,
l_dts[k]);
L_ot_traced[i_membuf][k] = true;
}
}
}

Expand Down
16 changes: 14 additions & 2 deletions src/occ_405/dimm/dimm.c
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2018 */
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -623,6 +623,7 @@ void process_dimm_temp()
const uint8_t port = G_dimm_sm_args.i2cPort;
const uint8_t dimm = G_dimm_sm_args.dimm;
uint8_t l_dimm_temp = G_dimm_sm_args.temp;
static bool L_ot_traced[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{false}};

#define MIN_VALID_DIMM_TEMP 1
#define MAX_VALID_DIMM_TEMP 125 //according to Mike Pardiek
Expand Down Expand Up @@ -687,7 +688,18 @@ void process_dimm_temp()
if (l_dimm_temp >= g_amec->thermaldimm.ot_error)
{
//Set a bit so that this dimm can be called out by the thermal thread
G_dimm_overtemp_bitmap.bytes[port] |= DIMM_SENSOR0 >> dimm;
G_dimm_overtemp_bitmap.bytes[port] |= (DIMM_SENSOR0 >> dimm);

// trace first time OT per DIMM
if( !L_ot_traced[port][dimm] )
{
TRAC_ERR("process_dimm_temp: port[%d] DIMM[%d] reached error temp[%d]. current[%d]",
port,
dimm,
g_amec->thermaldimm.ot_error,
l_dimm_temp);
L_ot_traced[port][dimm] = true;
}
}

l_fru->cur_temp = l_dimm_temp;
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/occbuildname.c
Expand Up @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =

#else

volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190712a\0" /*</BuildName>*/ ;
volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_190719a\0" /*</BuildName>*/ ;

#endif

0 comments on commit d467852

Please sign in to comment.