Skip to content

Commit

Permalink
Support for 16 OCMBs
Browse files: browse the repository at this point in the history
Change-Id: I906c797176f0ea2a683e6f05c7e6094cdf11693a
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/80364
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Christopher J Cain <cjcain@us.ibm.com>
Reviewed-by: Douglas R Gilbert <dgilbert@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
  • Loading branch information
marthabroyles committed Jul 22, 2019
1 parent d467852 commit 7888141
Show file tree
Hide file tree
Showing 23 changed files with 322 additions and 151 deletions.
4 changes: 2 additions & 2 deletions src/occ_405/amec/amec_amester.c
Expand Up @@ -278,9 +278,9 @@ static uint8_t amester_get_sensor_info( uint8_t* o_resp, uint16_t* io_resp_lengt

if( (MEM_TYPE_CUMULUS != G_sysConfigData.mem_type) &&
( ((i_sensor >= MRDM0) &&
(i_sensor <= MRDM7)) ||
(i_sensor <= MRDM11)) ||
((i_sensor >= MWRM0) &&
(i_sensor <= MWRM7)) ) )
(i_sensor <= MWRM11)) ) )
{
*((uint32_t *)o_resp) = AMEFP(64, -5);
}
Expand Down
5 changes: 3 additions & 2 deletions src/occ_405/amec/amec_controller.c
Expand Up @@ -35,7 +35,7 @@
// Externs
//*************************************************************************
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern uint8_t G_cent_temp_expired_bitmap;
extern uint16_t G_cent_temp_expired_bitmap;
//*************************************************************************
// Macros
//*************************************************************************
Expand Down Expand Up @@ -271,7 +271,8 @@ void amec_controller_dimm_thermal()
// Get TEMPDIMMTHRM sensor value
l_sensor = getSensorByGsid(TEMPDIMMTHRM);

if(G_dimm_temp_expired_bitmap.bigword)
if(G_dimm_temp_expired_bitmap.dw[0] ||
G_dimm_temp_expired_bitmap.dw[1])
{
//we were not able to read one or more dimm temperatures.
//Assume temperature is at the setpoint plus 1 degree C.
Expand Down
4 changes: 2 additions & 2 deletions src/occ_405/amec/amec_freq.c
Expand Up @@ -796,8 +796,8 @@ void amec_slv_mem_voting_box(void)
l_reason,
l_vote,
G_cent_temp_expired_bitmap,
G_dimm_temp_expired_bitmap.words[0],
G_dimm_temp_expired_bitmap.words[1]);
(uint32_t)(G_dimm_temp_expired_bitmap.dw[0] >> 32),
(uint32_t)G_dimm_temp_expired_bitmap.dw[0]);
}
}
else
Expand Down
84 changes: 56 additions & 28 deletions src/occ_405/amec/amec_health.c
Expand Up @@ -48,22 +48,22 @@ extern bool G_log_gpe1_error;
//*************************************************************************/

// Have we already called out the dimm for overtemp (bitmap of dimms)?
dimm_sensor_flags_t G_dimm_overtemp_logged_bitmap = {0};
dimm_sensor_flags_t G_dimm_overtemp_logged_bitmap = {{0}};

// Have we already called out the dimm for timeout (bitmap of dimms)?
dimm_sensor_flags_t G_dimm_timeout_logged_bitmap = {0};
dimm_sensor_flags_t G_dimm_timeout_logged_bitmap = {{0}};

// Are any dimms currently in the timedout state (bitmap of dimms)?
dimm_sensor_flags_t G_dimm_temp_expired_bitmap = {0};
dimm_sensor_flags_t G_dimm_temp_expired_bitmap = {{0}};

// Have we already called out the centaur for timeout (bitmap of centaurs)?
uint8_t G_cent_timeout_logged_bitmap = 0;
uint16_t G_cent_timeout_logged_bitmap = 0;

// Have we already called out the centaur for overtemp (bitmap of centaurs)?
uint8_t G_cent_overtemp_logged_bitmap = 0;
uint16_t G_cent_overtemp_logged_bitmap = 0;

// Are any dimms currently in the timedout state (bitmap of centaurs)?
uint8_t G_cent_temp_expired_bitmap = 0;
// Are any mem controllers currently in the timedout state (bitmap of centaurs)?
uint16_t G_cent_temp_expired_bitmap = 0;

// Array to store the update tag of each core's temperature sensor
uint32_t G_core_temp_update_tag[MAX_NUM_CORES] = {0};
Expand Down Expand Up @@ -113,8 +113,8 @@ uint64_t amec_mem_get_huid(uint8_t i_cent, uint8_t i_dimm)
//that the dimm should be marked.
void amec_mem_mark_logged(uint8_t i_cent,
uint8_t i_dimm,
uint8_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap)
uint16_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap)
{
if(i_dimm == 0xff)
{
Expand Down Expand Up @@ -158,6 +158,11 @@ void amec_health_check_dimm_temp()
l_max_port = NUM_DIMM_PORTS;
l_max_dimm_per_port = NUM_DIMMS_PER_I2CPORT;
}
else if(G_sysConfigData.mem_type == MEM_TYPE_OCM)
{
l_max_port = MAX_NUM_OCMBS;
l_max_dimm_per_port = NUM_DIMMS_PER_OCMB;
}
else // MEM_TYPE_CUMULUS
{
l_max_port = MAX_NUM_CENTAURS;
Expand All @@ -166,7 +171,8 @@ void amec_health_check_dimm_temp()

// Check to see if any dimms have reached the error temperature that
// haven't been called out already
if(G_dimm_overtemp_bitmap.bigword == G_dimm_overtemp_logged_bitmap.bigword)
if( (G_dimm_overtemp_bitmap.dw[0] == G_dimm_overtemp_logged_bitmap.dw[0]) &&
(G_dimm_overtemp_bitmap.dw[1] == G_dimm_overtemp_logged_bitmap.dw[1]) )
{
return;
}
Expand Down Expand Up @@ -282,7 +288,7 @@ void amec_health_check_dimm_temp()
*/
void amec_health_check_dimm_timeout()
{
static dimm_sensor_flags_t L_temp_update_bitmap_prev = {0};
static dimm_sensor_flags_t L_temp_update_bitmap_prev = {{0}};
dimm_sensor_flags_t l_need_inc, l_need_clr, l_temp_update_bitmap;
uint8_t l_dimm, l_port;
fru_temp_t* l_fru;
Expand All @@ -300,33 +306,47 @@ void amec_health_check_dimm_timeout()
//3) sensor is enabled and updated and was updated on previous check (do nothing)

//Grab snapshot of G_dimm_temp_updated_bitmap and clear it
l_temp_update_bitmap.bigword = G_dimm_temp_updated_bitmap.bigword;
G_dimm_temp_updated_bitmap.bigword = 0;
l_temp_update_bitmap.dw[0] = G_dimm_temp_updated_bitmap.dw[0];
l_temp_update_bitmap.dw[1] = G_dimm_temp_updated_bitmap.dw[1];
G_dimm_temp_updated_bitmap.dw[0] = 0;
G_dimm_temp_updated_bitmap.dw[1] = 0;

//check if we need to increment any timers (haven't been updated in the last second)
l_need_inc.bigword = G_dimm_enabled_sensors.bigword & ~l_temp_update_bitmap.bigword;
l_need_inc.dw[0] = G_dimm_enabled_sensors.dw[0] & ~l_temp_update_bitmap.dw[0];
l_need_inc.dw[1] = G_dimm_enabled_sensors.dw[1] & ~l_temp_update_bitmap.dw[1];

//check if we need to clear any timers (updated now but not updated previously)
l_need_clr.bigword = l_temp_update_bitmap.bigword & ~L_temp_update_bitmap_prev.bigword;
l_need_clr.dw[0] = l_temp_update_bitmap.dw[0] & ~L_temp_update_bitmap_prev.dw[0];
l_need_clr.dw[1] = l_temp_update_bitmap.dw[1] & ~L_temp_update_bitmap_prev.dw[1];

//save off the previous bitmap of updated sensors for next time
L_temp_update_bitmap_prev.bigword = l_temp_update_bitmap.bigword;
L_temp_update_bitmap_prev.dw[0] = l_temp_update_bitmap.dw[0];
L_temp_update_bitmap_prev.dw[1] = l_temp_update_bitmap.dw[1];

//only go further if we actually have work to do here.
if(!l_need_inc.bigword && !l_need_clr.bigword)
if(!l_need_inc.dw[0] && !l_need_inc.dw[1] &&
!l_need_clr.dw[0] && !l_need_clr.dw[1])
{
//nothing to do
break;
}

uint8_t l_max_port; // #ports in nimbus/#centaurs in cumulus
uint8_t l_max_port; // #ports in nimbus/#mem buffs in cumulus/OCM
uint8_t l_max_dimm_per_port; // per port in nimbus/per mem buff in cumulus/OCM
if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
{
l_max_port = NUM_DIMM_PORTS;
l_max_dimm_per_port = NUM_DIMMS_PER_I2CPORT;
}
else if(G_sysConfigData.mem_type == MEM_TYPE_OCM)
{
l_max_port = MAX_NUM_OCMBS;
l_max_dimm_per_port = NUM_DIMMS_PER_OCMB;
}
else // MEM_TYPE_CUMULUS
{
l_max_port = MAX_NUM_CENTAURS;
l_max_dimm_per_port = NUM_DIMMS_PER_CENTAUR;
}
//iterate across all ports incrementing dimm sensor timers as needed
for(l_port = 0; l_port < l_max_port; l_port++)
Expand All @@ -345,7 +365,7 @@ void amec_health_check_dimm_timeout()
}

//There's at least one dimm requiring an increment, find the dimm
for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++)
{
//not this one, check if we need to clear the dimm timeout and go to the next one
if(!(l_need_inc.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm)))
Expand Down Expand Up @@ -467,13 +487,13 @@ void amec_health_check_dimm_timeout()
}

//skip clearing if no dimms need it
if(!l_need_clr.bigword)
if( (!l_need_clr.dw[0]) && (!l_need_clr.dw[1]) )
{
break;
}

//iterate across all centaurs/ports clearing dimm sensor timers as needed
for(l_port = 0; l_port < MAX_NUM_CENTAURS; l_port++)
for(l_port = 0; l_port < l_max_port; l_port++)
{

if(!l_need_clr.bytes[l_port])
Expand All @@ -482,7 +502,7 @@ void amec_health_check_dimm_timeout()
}

//iterate over all dimms
for(l_dimm = 0; l_dimm < NUM_DIMMS_PER_CENTAUR; l_dimm++)
for(l_dimm = 0; l_dimm < l_max_dimm_per_port; l_dimm++)
{
//not this one, go to next one
if(!(l_need_clr.bytes[l_port] & (DIMM_SENSOR0 >> l_dimm)))
Expand Down Expand Up @@ -531,9 +551,9 @@ void amec_health_check_cent_temp()
/*------------------------------------------------------------------------*/
uint16_t l_ot_error, l_cur_temp, l_max_temp;
sensor_t *l_sensor;
uint32_t l_cent;
uint32_t l_cent, l_max_mem_buf;
uint32_t l_callouts_count = 0;
uint8_t l_new_callouts;
uint16_t l_new_callouts;
uint64_t l_huid;
errlHndl_t l_err = NULL;

Expand All @@ -560,7 +580,15 @@ void amec_health_check_cent_temp()
l_new_callouts);

//find the centaur(s) that need to be called out
for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
if(G_sysConfigData.mem_type == MEM_TYPE_OCM)
{
l_max_mem_buf = MAX_NUM_OCMBS;
}
else // MEM_TYPE_CUMULUS
{
l_max_mem_buf = MAX_NUM_CENTAURS;
}
for(l_cent = 0; l_cent < l_max_mem_buf; l_cent++)
{
if(!(l_new_callouts & (CENTAUR0_PRESENT_MASK >> l_cent)))
{
Expand Down Expand Up @@ -640,9 +668,9 @@ void amec_health_check_cent_temp()
*/
void amec_health_check_cent_timeout()
{
static uint8_t L_temp_update_bitmap_prev = 0;
uint8_t l_need_inc, l_need_clr, l_temp_update_bitmap;
uint8_t l_cent;
static uint16_t L_temp_update_bitmap_prev = 0;
uint16_t l_need_inc, l_need_clr, l_temp_update_bitmap;
uint16_t l_cent;
fru_temp_t* l_fru;
errlHndl_t l_err = NULL;
uint32_t l_callouts_count = 0;
Expand Down
4 changes: 2 additions & 2 deletions src/occ_405/amec/amec_health.h
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2011,2017 */
/* Contributors Listed Below - COPYRIGHT 2011,2019 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -49,7 +49,7 @@ void amec_health_check_dimm_temp(void);
void amec_health_check_dimm_timeout(void);
void amec_mem_mark_logged(uint8_t i_cent,
uint8_t i_dimm,
uint8_t* i_clog_bitmap,
uint16_t* i_clog_bitmap,
uint8_t* i_dlog_bitmap);
void amec_health_check_vrm_vdd_temp(const sensor_t *i_sensor);
void amec_health_check_vrm_vdd_temp_timeout(void);
Expand Down
10 changes: 5 additions & 5 deletions src/occ_405/amec/amec_sensors_centaur.c
Expand Up @@ -48,10 +48,10 @@
/******************************************************************************/
/* Globals */
/******************************************************************************/
dimm_sensor_flags_t G_dimm_overtemp_bitmap = {0};
dimm_sensor_flags_t G_dimm_temp_updated_bitmap = {0};
uint8_t G_cent_overtemp_bitmap = 0;
uint8_t G_cent_temp_updated_bitmap = 0;
dimm_sensor_flags_t G_dimm_overtemp_bitmap = {{0}};
dimm_sensor_flags_t G_dimm_temp_updated_bitmap = {{0}};
uint16_t G_cent_overtemp_bitmap = 0;
uint16_t G_cent_temp_updated_bitmap = 0;
extern uint8_t G_centaur_needs_recovery;
extern uint8_t G_centaur_nest_lfir6;
extern uint64_t G_inject_dimm;
Expand Down Expand Up @@ -441,7 +441,7 @@ void amec_update_centaur_temp_sensors(void)
}
}

sensor_update(&g_amec->proc[0].temp2mscent,l_hot_centaur);
sensor_update(&g_amec->proc[0].tempcent,l_hot_centaur);
sensor_update(&g_amec->proc[0].tempdimmthrm,l_hot_dimm);
AMEC_DBG("HotCentaur=[%d] HotDimm=[%d]",l_hot_centaur, l_hot_dimm);
}
Expand Down
10 changes: 5 additions & 5 deletions src/occ_405/amec/amec_sensors_ocmb.c
Expand Up @@ -50,8 +50,8 @@
/******************************************************************************/
extern dimm_sensor_flags_t G_dimm_overtemp_bitmap;
extern dimm_sensor_flags_t G_dimm_temp_updated_bitmap;
extern uint8_t G_cent_overtemp_bitmap;
extern uint8_t G_cent_temp_updated_bitmap;
extern uint16_t G_cent_overtemp_bitmap;
extern uint16_t G_cent_temp_updated_bitmap;
extern uint8_t G_centaur_needs_recovery;
extern uint64_t G_inject_dimm;
extern uint32_t G_inject_dimm_trace[MAX_NUM_OCMBS][NUM_DIMMS_PER_OCMB];
Expand Down Expand Up @@ -337,7 +337,7 @@ void amec_update_ocmb_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf
}
else
{
//don't allow temp to change more than is reasonable for 2ms
//don't allow temp to change more than is reasonable since last read
if(l_sens_temp > (l_prev_temp + MAX_MEM_TEMP_CHANGE))
{
l_dts = l_prev_temp + MAX_MEM_TEMP_CHANGE;
Expand Down Expand Up @@ -365,7 +365,7 @@ void amec_update_ocmb_dts_sensors(OcmbMemData * i_sensor_cache, uint8_t i_membuf
}

//Notify thermal thread that temperature has been updated
G_cent_temp_updated_bitmap |= CENTAUR0_PRESENT_MASK >> i_membuf;
G_cent_temp_updated_bitmap |= (CENTAUR0_PRESENT_MASK >> i_membuf);

//clear error flags
l_fru->flags &= FRU_TEMP_FAST_CHANGE;
Expand Down Expand Up @@ -434,7 +434,7 @@ void amec_update_ocmb_temp_sensors(void)
l_hot_dimm = g_amec->proc[0].memctl[k].centaur.tempdimmax.sample;
}
}
sensor_update(&g_amec->proc[0].temp2mscent,l_hot_mb);
sensor_update(&g_amec->proc[0].tempcent,l_hot_mb);
AMEC_DBG("HotMembuf=%d\n",l_hot_mb);

sensor_update(&g_amec->proc[0].tempdimmthrm,l_hot_dimm);
Expand Down

0 comments on commit 7888141

Please sign in to comment.