Skip to content

Commit

Permalink
Only call out DIMMs when health monitor time has expired
Browse files Browse the repository at this point in the history
Previously, the OCC would call out a DIMM after 2 consecutive
I2C failures while trying to read DIMM temperatures.  The health
monitor already has code to handle the timeout, so the OCC will now
just keep retrying on failures.

- Remove the 60-second delay before starting to read DIMM temps,
  since SW398808 should resolve the lock problem.
- Add a debug command to retrieve the GPE0/GPE1 trace buffers.

Change-Id: I65156347e24ff8e68414a64aaf7e00ff4c12a2f8
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/45073
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: William A. Bryan <wilbryan@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
  • Loading branch information
cjcain committed Aug 25, 2017
1 parent 3f57751 commit df32663
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 62 deletions.
39 changes: 33 additions & 6 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Expand Up @@ -48,12 +48,14 @@
#include "homer.h"
#include <centaur_data.h>
#include <avsbus.h>
#include "cmdh_dbug_cmd.h"
#include "wof.h"
#include "sensor_main_memory.h"
extern dimm_sensor_flags_t G_dimm_temp_expired_bitmap;
extern bool G_vrm_thermal_monitoring;

#include <gpe_export.h>
extern gpe_shared_data_t G_shared_gpe_data;

// This table contains tunable parameter information that can be exposed to
// customers (only Master OCC should access/control this table)
cmdh_tunable_param_table_t G_mst_tunable_parameter_table[CMDH_DEFAULT_TUNABLE_PARAM_NUM] =
Expand Down Expand Up @@ -889,9 +891,34 @@ void cmdh_dbug_get_trace (const cmdh_fsp_cmd_t * i_cmd_ptr,
cmdh_dbug_get_trace_query_t *l_get_trace_query_ptr = (cmdh_dbug_get_trace_query_t*) i_cmd_ptr;
cmdh_dbug_get_trace_resp_t *l_get_trace_resp_ptr = (cmdh_dbug_get_trace_resp_t*) o_rsp_ptr;

const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char *)l_get_trace_query_ptr->comp);
l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size);
l_trace_size = l_trace_buffer_size;
// A component name starting with "GP" selects a GPE trace buffer; the third
// character picks the engine ('0' or '1'). Any other name is looked up as a
// 405 trace component.
if (memcmp((char *)l_get_trace_query_ptr->comp, "GP", 2) == 0)
{
    // Return a GPE0/GPE1 trace buffer
    // NOTE(review): the copy length is taken directly from the shared GPE
    // data; confirm it can never exceed the size of the response data buffer.
    if (l_get_trace_query_ptr->comp[2] == '0')
    {
        if (G_shared_gpe_data.gpe0_tb_ptr != 0)
        {
            l_trace_size = G_shared_gpe_data.gpe0_tb_sz;
            memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe0_tb_ptr, (size_t)l_trace_size);
        }
    }
    else if (l_get_trace_query_ptr->comp[2] == '1')
    {
        // Check the GPE1 buffer pointer, since that is the buffer being copied
        if (G_shared_gpe_data.gpe1_tb_ptr != 0)
        {
            l_trace_size = G_shared_gpe_data.gpe1_tb_sz;
            memcpy(l_get_trace_resp_ptr->data, (uint8_t*)G_shared_gpe_data.gpe1_tb_ptr, (size_t)l_trace_size);
        }
    }
    else
    {
        // Unrecognized GPE engine requested
        l_rc = 255;
    }
}
else
{
    // Return a 405 trace buffer
    const trace_descriptor_array_t* l_trace_ptr = TRAC_get_td((char *)l_get_trace_query_ptr->comp);
    l_rc = TRAC_get_buffer_partial(l_trace_ptr, l_get_trace_resp_ptr->data,&l_trace_buffer_size);
    l_trace_size = l_trace_buffer_size;
}
if(l_rc==0)
{
G_rsp_status = ERRL_RC_SUCCESS;
Expand Down Expand Up @@ -1924,7 +1951,7 @@ uint8_t cmdh_set_user_pcap_common(uint16_t i_pcap,

//Indicate there is new PCAP data available
G_master_pcap_data.pcap_data_count++;
// if user pcap was just disabled set source to 0 (no user pcap)
// if user pcap was just disabled set source to 0 (no user pcap)
if(i_pcap == 0)
{
G_master_pcap_data.source = 0;
Expand Down Expand Up @@ -2089,7 +2116,7 @@ uint8_t cmdh_set_pcap_inband(const uint16_t i_cmd_data_length,
uint16_t l_pcap = CONVERT_UINT8_ARRAY_UINT16(l_cmd_ptr->power_cap[0],
l_cmd_ptr->power_cap[1]);
l_rc = cmdh_set_user_pcap_common(l_pcap, IN_BAND);

// if successful copy the power cap to the response buffer and set the rsp length
if(l_rc == ERRL_RC_SUCCESS)
{
Expand Down
94 changes: 40 additions & 54 deletions src/occ_405/dimm/dimm.c
Expand Up @@ -58,11 +58,9 @@ uint8_t G_maxDimmPort = NUM_DIMM_PORTS - 1;
bool G_dimm_i2c_reset_required = false;
uint32_t G_dimm_i2c_reset_cause = 0;

#define MAX_CONSECUTIVE_DIMM_RESETS 1

// Per-DIMM state for DIMM temperature collection; one entry per
// [port][dimm] in G_dimm.
typedef struct {
// Set to true by mark_dimm_failed() once the health monitor has logged a
// timeout for this DIMM; task_dimm_sm() only starts temp collection for
// DIMMs where this is still false.
bool disabled;
uint8_t errorCount;
uint8_t errorCount; // # consecutive errors for this DIMM
} dimmData_t;
dimmData_t G_dimm[NUM_DIMM_PORTS][NUM_DIMMS_PER_I2CPORT] = {{{false,0}}};

Expand Down Expand Up @@ -263,12 +261,17 @@ void mark_dimm_failed()
{
const uint8_t port = G_dimm_sm_args.i2cPort;
const uint8_t dimm = G_dimm_sm_args.dimm;
INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X "
"(ffdc 0x%08X%08X, completion_state 0x%02X)",
DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount,
WORD_HIGH(G_dimm_sm_args.error.ffdc),
WORD_LOW(G_dimm_sm_args.error.ffdc),
G_dimm_sm_request.request.completion_state);

// Trace the first 3 consecutive failures for this DIMM
if (G_dimm[port][dimm].errorCount < 3)
{
INTR_TRAC_ERR("mark_dimm_failed: DIMM%04X failed in state/rc/count=0x%06X "
"(ffdc 0x%08X%08X, completion_state 0x%02X)",
DIMM_AND_PORT, (G_dimm_sm_args.state << 16) | (G_dimm_sm_args.error.rc << 8) | G_dimm[port][dimm].errorCount,
WORD_HIGH(G_dimm_sm_args.error.ffdc),
WORD_LOW(G_dimm_sm_args.error.ffdc),
G_dimm_sm_request.request.completion_state);
}

g_amec->proc[0].memctl[port].centaur.dimm_temps[dimm].flags |= FRU_SENSOR_STATUS_ERROR;

Expand All @@ -281,43 +284,20 @@ void mark_dimm_failed()
INCREMENT_ERR_HISTORY(ERRH_DIMM_I2C_PORT1);
}

if (++G_dimm[port][dimm].errorCount > MAX_CONSECUTIVE_DIMM_RESETS)
if (G_dimm[port][dimm].errorCount < 255)
{
// Disable collection on this DIMM, collect FFDC and log error
G_dimm[port][dimm].disabled = true;
INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to %d consecutive errors (state=%d)",
DIMM_AND_PORT, G_dimm[port][dimm].errorCount, G_dimm_sm_args.state);
errlHndl_t l_err = NULL;
/*
* @errortype
* @moduleid DIMM_MID_MARK_DIMM_FAILED
* @reasoncode DIMM_GPE_FAILURE
* @userdata1 GPE returned rc code
* @userdata4 ERC_DIMM_COMPLETE_FAILURE
* @devdesc Disabling DIMM due to repeated I2C failures
*/
l_err = createErrl(DIMM_MID_MARK_DIMM_FAILED,
DIMM_GPE_FAILURE,
ERC_DIMM_COMPLETE_FAILURE,
ERRL_SEV_PREDICTIVE,
NULL,
DEFAULT_TRACE_SIZE,
G_dimm_sm_args.error.rc,
0);
addUsrDtlsToErrl(l_err,
(uint8_t*)&G_dimm_sm_request.ffdc,
sizeof(G_dimm_sm_request.ffdc),
ERRL_STRUCT_VERSION_1,
ERRL_USR_DTL_BINARY_DATA);
addCalloutToErrl(l_err,
ERRL_CALLOUT_TYPE_HUID,
G_sysConfigData.dimm_huids[port][dimm],
ERRL_CALLOUT_PRIORITY_HIGH);
//Mark DIMM as logged so we don't log it again
amec_mem_mark_logged(0, dimm,
&G_cent_timeout_logged_bitmap,
&G_dimm_timeout_logged_bitmap.bytes[port]);
commitErrl(&l_err);
++G_dimm[port][dimm].errorCount;
}

if (false == G_dimm[port][dimm].disabled)
{
if(G_dimm_timeout_logged_bitmap.bytes[port] & (DIMM_SENSOR0 >> dimm))
{
//Health monitor has already logged a timeout for this DIMM
G_dimm[port][dimm].disabled = true;
INTR_TRAC_ERR("mark_dimm_failed: disabling DIMM%04X due to health monitor timeout (consecutive errors: %d)",
DIMM_AND_PORT, G_dimm[port][dimm].errorCount);
}
}

// Reset DIMM I2C engine
Expand Down Expand Up @@ -471,6 +451,7 @@ uint8_t dimm_reset_sm()
case DIMM_STATE_RESET_MASTER:
if (DIMM_TICK == 0)
{
TRAC_INFO("dimm_reset_sm: Initiating I2C reset of engine %d", G_sysConfigData.dimm_i2c_engine);
L_new_dimm_args.i2cEngine = G_sysConfigData.dimm_i2c_engine;
if (schedule_dimm_req(DIMM_STATE_RESET_MASTER, L_new_dimm_args))
{
Expand Down Expand Up @@ -710,6 +691,12 @@ void process_dimm_temp()
// Store DIMM temp in sensor
sensor_update(&g_amec->proc[0].tempdimm[DIMM_INDEX(port, dimm)], l_dimm_temp);

// Successful temp collected, reset error count
if (G_dimm[port][dimm].errorCount > 2)
{
INTR_TRAC_INFO("process_dimm_temp: successfully read temp for DIMM%04X (after %d consecutive errors)",
DIMM_AND_PORT, G_dimm[port][dimm].errorCount);
}
G_dimm[port][dimm].errorCount = 0;

} // end process_dimm_temp()
Expand All @@ -736,18 +723,16 @@ void task_dimm_sm(struct task *i_self)
static bool L_readIssued = false;
const uint8_t engine = G_sysConfigData.dimm_i2c_engine;
static bool L_occ_owns_lock = false;
// 60,000 x 500us (tick time) x 2 (called every other tick) = 60 seconds
static unsigned int L_startup_delay = 60000;

if (L_startup_delay > 0)
static unsigned int L_dimms_enabled = false;
if (!L_dimms_enabled)
{
if (--L_startup_delay == 0)
{
TRAC_INFO("task_dimm_sm: Startup delay completed, DIMM temp collection will be started (0x%08X)", G_dimm_present_sensors.words[0]);
G_dimm_enabled_sensors = G_dimm_present_sensors;
}
L_dimms_enabled = true;
TRAC_INFO("task_dimm_sm: DIMM temp collection is being started (0x%08X)", G_dimm_present_sensors.words[0]);
G_dimm_enabled_sensors = G_dimm_present_sensors;
}
else if (G_mem_monitoring_allowed)

if (G_mem_monitoring_allowed)
{
#ifdef DEBUG_LOCK_TESTING
SIMULATE_HOST();
Expand Down Expand Up @@ -929,6 +914,7 @@ void task_dimm_sm(struct task *i_self)
if ((DIMM_TICK == 0) || (DIMM_TICK == 8))
{
// If DIMM has huid/sensor then it should be present
// and if not disabled yet, start temp collection
if (NIMBUS_DIMM_PRESENT(L_dimmPort,L_dimmIndex) &&
(G_dimm[L_dimmPort][L_dimmIndex].disabled == false))
{
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/lock/lock.c
Expand Up @@ -160,7 +160,7 @@ void update_i2c_lock(const lockOperation_e i_op, const uint8_t i_engine)
{
out32(OCB_OCCFLG_OR, occ_flags.value);

TRAC_IMP("update_i2c_lock: OCC has aquired lock for I2C engine %d", i_engine);
TRAC_IMP("update_i2c_lock: OCC has acquired lock for I2C engine %d", i_engine);
}

} // end update_i2c_lock()
Expand Down
2 changes: 1 addition & 1 deletion src/occ_405/occbuildname.c
Expand Up @@ -34,6 +34,6 @@ volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) =

#else

volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170822a\0" /*</BuildName>*/ ;
volatile const char G_occ_buildname[16] __attribute__((section(".buildname"))) = /*<BuildName>*/ "op_occ_170825a\0" /*</BuildName>*/ ;

#endif

0 comments on commit df32663

Please sign in to comment.