Skip to content

Commit

Permalink
Fix DIMM callout on temperature timeout
Browse files Browse the repository at this point in the history
Check all Centaurs for DIMM temperature timeouts
Add DIMM error injection debug command
Add internal flag debug command

RTC: 189531
Change-Id: I1917e6329894991419517338ba38007b8c493f1d
Reviewed-on: http://ralgit01.raleigh.ibm.com/gerrit1/60255
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Martha Broyles <mbroyles@us.ibm.com>
Reviewed-by: Andres A. Lugo-Reyes <aalugore@us.ibm.com>
Reviewed-by: Christopher J. Cain <cjcain@us.ibm.com>
  • Loading branch information
cjcain committed Jun 14, 2018
1 parent 41f0c2c commit a8a2c3a
Show file tree
Hide file tree
Showing 8 changed files with 253 additions and 22 deletions.
11 changes: 10 additions & 1 deletion src/occ_405/amec/amec_health.c
Original file line number Diff line number Diff line change
Expand Up @@ -311,8 +311,17 @@ void amec_health_check_dimm_timeout()
break;
}

uint8_t l_max_port; // #ports in nimbus/#centaurs in cumulus
if(G_sysConfigData.mem_type == MEM_TYPE_NIMBUS)
{
l_max_port = NUM_DIMM_PORTS;
}
else // MEM_TYPE_CUMULUS
{
l_max_port = MAX_NUM_CENTAURS;
}
//iterate across all ports incrementing dimm sensor timers as needed
for(l_port = 0; l_port < NUM_DIMM_PORTS; l_port++)
for(l_port = 0; l_port < l_max_port; l_port++)
{
//any dimm timers on this port need incrementing?
if(!l_need_inc.bytes[l_port])
Expand Down
26 changes: 26 additions & 0 deletions src/occ_405/amec/amec_sensors_centaur.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ uint8_t G_cent_overtemp_bitmap = 0;
uint8_t G_cent_temp_updated_bitmap = 0;
extern uint8_t G_centaur_needs_recovery;
extern uint8_t G_centaur_nest_lfir6;
// DIMM error injection mask: bit ((centaur * 8) + dimm) set means the
// temperature update for that DIMM is skipped (simulating a DIMM that stopped reporting)
extern uint64_t G_inject_dimm;
// Per-DIMM marker (non-zero once the "injecting errors" trace was issued) so the
// start/stop of injection is only traced once per transition
extern uint32_t G_inject_dimm_trace[MAX_NUM_CENTAURS][NUM_DIMMS_PER_CENTAUR];


/******************************************************************************/
/* Forward Declarations */
Expand Down Expand Up @@ -122,6 +125,29 @@ void amec_update_dimm_dts_sensors(CentaurMemData * i_sensor_cache, uint8_t i_cen
continue;
}


if (g_amec->proc[0].memctl[i_centaur].centaur.dimm_temps[k].temp_sid) // DIMM has sensor ID
{
if ((G_inject_dimm & ((uint64_t)1 << ((i_centaur * 8) + k))) == 0)
{
if (G_inject_dimm_trace[i_centaur][k] != 0)
{
TRAC_INFO("amec_update_dimm_dts_sensors: stopping injection of errors for DIMM%04X", (i_centaur<<8)|k);
G_inject_dimm_trace[i_centaur][k] = 0;
}
}
else
{
if (G_inject_dimm_trace[i_centaur][k] == 0)
{
TRAC_INFO("amec_update_dimm_dts_sensors: injecting errors for DIMM%04X", (i_centaur<<8)|k);
G_inject_dimm_trace[i_centaur][k] = 1;
}
continue; // Skip this DIMM
}
}


l_sens_status = i_sensor_cache->scache.dimm_thermal_sensor[k].fields.status;
fru_temp_t* l_fru = &g_amec->proc[0].memctl[i_centaur].centaur.dimm_temps[k];

Expand Down
19 changes: 9 additions & 10 deletions src/occ_405/cent/centaur_data.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER OnChipController Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2014,2017 */
/* Contributors Listed Below - COPYRIGHT 2014,2018 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -1029,49 +1029,49 @@ void centaur_init( void )

// Set up recovery scom list entries
G_cent_scom_list_entry[L4_LINE_DELETE].scom = MBCCFGQ_REG; //scom address
G_cent_scom_list_entry[L4_LINE_DELETE].commandType = CENTAUR_SCOM_RMW; //scom operation to perform
G_cent_scom_list_entry[L4_LINE_DELETE].commandType = CENTAUR_SCOM_RMW; //scom operation to perform
G_cent_scom_list_entry[L4_LINE_DELETE].mask = LINE_DELETE_ON_NEXT_CE; //mask of bits to change
G_cent_scom_list_entry[L4_LINE_DELETE].data = LINE_DELETE_ON_NEXT_CE; //scom data (always set the bit)

//one time init for reading LFIR6
G_cent_scom_list_entry[READ_NEST_LFIR6].scom = CENT_NEST_LFIR_REG; //scom address
G_cent_scom_list_entry[READ_NEST_LFIR6].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_NEST_LFIR6].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_NEST_LFIR6].mask = 0; //mask (not used for reads)
G_cent_scom_list_entry[READ_NEST_LFIR6].data = 0; //scom data (initialize to 0)

//one time init for reading centaur thermal status register
G_cent_scom_list_entry[READ_THERM_STATUS].scom = CENT_THRM_STATUS_REG; //scom address
G_cent_scom_list_entry[READ_THERM_STATUS].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_THERM_STATUS].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_THERM_STATUS].mask = 0; //mask (not used for reads)
G_cent_scom_list_entry[READ_THERM_STATUS].data = 0; //scom data (initialize to 0)

//one time init to reset the centaur dts FSM
G_cent_scom_list_entry[RESET_DTS_FSM].scom = CENT_THRM_CTRL_REG; //scom address
G_cent_scom_list_entry[RESET_DTS_FSM].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[RESET_DTS_FSM].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[RESET_DTS_FSM].mask = 0; //mask (not used for writes)
G_cent_scom_list_entry[RESET_DTS_FSM].data = CENT_THRM_CTRL4; //scom data (sets bit4)

//one time init to clear centaur NEST LFIR 6
G_cent_scom_list_entry[CLEAR_NEST_LFIR6].scom = CENT_NEST_LFIR_AND_REG; //scom address
G_cent_scom_list_entry[CLEAR_NEST_LFIR6].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[CLEAR_NEST_LFIR6].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[CLEAR_NEST_LFIR6].mask = 0; //mask (not used for writes)
G_cent_scom_list_entry[CLEAR_NEST_LFIR6].data = ~CENT_NEST_LFIR6; //scom data

//one time init to disable centaur sensor cache
G_cent_scom_list_entry[DISABLE_SC].scom = SCAC_CONFIG_REG; //scom address
G_cent_scom_list_entry[DISABLE_SC].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[DISABLE_SC].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[DISABLE_SC].mask = SCAC_MASTER_ENABLE; //mask of bits to change
G_cent_scom_list_entry[DISABLE_SC].data = 0; //scom data (disable sensor cache)

//one time init to enable centaur sensor cache
G_cent_scom_list_entry[ENABLE_SC].scom = SCAC_CONFIG_REG; //scom address
G_cent_scom_list_entry[ENABLE_SC].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[ENABLE_SC].commandType = CENTAUR_SCOM_NOP; //init to no-op (only runs if needed)
G_cent_scom_list_entry[ENABLE_SC].mask = SCAC_MASTER_ENABLE; //mask of bits to change
G_cent_scom_list_entry[ENABLE_SC].data = SCAC_MASTER_ENABLE; //scom data (enable sensor cache)

//one time init for reading centaur sensor cache lfir
G_cent_scom_list_entry[READ_SCAC_LFIR].scom = SCAC_LFIR_REG; //scom address
G_cent_scom_list_entry[READ_SCAC_LFIR].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_SCAC_LFIR].commandType = CENTAUR_SCOM_READ; //scom operation to perform
G_cent_scom_list_entry[READ_SCAC_LFIR].mask = 0; //mask (not used for reads)
G_cent_scom_list_entry[READ_SCAC_LFIR].data = 0; //scom data (initialize to 0)

Expand All @@ -1090,7 +1090,6 @@ void centaur_init( void )
L_scomList[0].mask = l_mbscfg.value;

//set up the data bits
// TODO RTC 190643 disable until phyp has centaur support
l_mbscfg.fields.occ_deadman_timer_sel = CENT_DEADMAN_TIMER_2SEC;
L_scomList[0].data = l_mbscfg.value;

Expand Down
151 changes: 148 additions & 3 deletions src/occ_405/cmdh/cmdh_fsp_cmds.c
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,8 @@ uint8_t G_mst_tunable_parameter_overwrite = 0;
uint8_t G_apss_ch_to_function[MAX_APSS_ADC_CHANNELS] = {0};

uint16_t G_allow_trace_flags = 0x0000;
// Internal debug flags; queried/updated via the DBUG_INTERNAL_FLAGS debug command
uint32_t G_internal_flags = 0x00000000;
// DIMM error injection mask; queried/updated via the DBUG_DIMM_INJECT debug command
extern uint64_t G_inject_dimm;

ERRL_RC cmdh_poll_v20 (cmdh_fsp_rsp_t * i_rsp_ptr);

Expand Down Expand Up @@ -326,6 +328,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
}
else if (G_sysConfigData.mem_type == MEM_TYPE_CUMULUS)
{
static bool l_traced_missing_sid = FALSE;
for (l_cent=0; l_cent < MAX_NUM_MEM_CONTROLLERS; l_cent++)
{
if (CENTAUR_PRESENT(l_cent))
Expand All @@ -350,9 +353,13 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
{
l_temp_sid = g_amec->proc[0].memctl[l_cent].centaur.dimm_temps[l_dimm].temp_sid;

// TODO temp fix until the dimm numbering gets sorted out
if(FSP_SUPPORTED_OCC == G_occ_interrupt_type && l_temp_sid == 0)
if((FSP_SUPPORTED_OCC == G_occ_interrupt_type) && (l_temp_sid == 0) && (g_amec->proc[0].memctl[l_cent].centaur.dimm_temps[l_dimm].cur_temp != 0))
{
if (!l_traced_missing_sid)
{
CMDH_TRAC_ERR("cmdh_poll_v20: DIMM%04X sensor not defined but temperature was non-zero", ((l_cent << 8)|l_dimm));
l_traced_missing_sid = TRUE;
}
l_temp_sid = 1 + l_dimm; // If sid is zero them make up a sid for FSP
}

Expand Down Expand Up @@ -529,7 +536,7 @@ ERRL_RC cmdh_poll_v20(cmdh_fsp_rsp_t * o_rsp_ptr)
( (G_apss_ch_to_function[k] == ADC_GPU_1_0) && (G_first_sys_gpu_config & 0x08) ) ||
( (G_apss_ch_to_function[k] == ADC_GPU_1_1) && (G_first_sys_gpu_config & 0x10) ) ||
( (G_apss_ch_to_function[k] == ADC_GPU_1_2) && (G_first_sys_gpu_config & 0x20) ) )
{
{
l_pwrSensorList[l_sensorHeader.count].id = G_amec_sensor_list[PWRAPSSCH0 + k]->ipmi_sid;
l_pwrSensorList[l_sensorHeader.count].function_id = G_apss_ch_to_function[k];
l_pwrSensorList[l_sensorHeader.count].apss_channel = k;
Expand Down Expand Up @@ -1516,6 +1523,136 @@ void cmdh_dbug_allow_trace( const cmdh_fsp_cmd_t * i_cmd_ptr,



// Function Specification
//
// Name: cmdh_dbug_dimm_inject
//
// Description: Query or update the DIMM error injection mask (G_inject_dimm).
//              - No data after the sub-command byte: the mask is left unchanged.
//              - MAX_NUM_CENTAURS bytes of data: the mask is replaced with the
//                supplied value; bit ((centaur * 8) + dimm) selects a DIMM.
//              - Any other length: ERRL_RC_INVALID_CMD_LEN, no response data.
//              On success the (possibly updated) mask is returned in the
//              response data.  G_rsp_status is always set to the result.
//
// End Function Specification
void cmdh_dbug_dimm_inject( const cmdh_fsp_cmd_t * i_cmd_ptr,
                            cmdh_fsp_rsp_t * o_rsp_ptr )
{
    const cmdh_dbug_dimm_inject_cmd_t * l_cmd_ptr =
        (cmdh_dbug_dimm_inject_cmd_t*)i_cmd_ptr;
    cmdh_dbug_dimm_inject_rsp_t * l_rsp_ptr =
        (cmdh_dbug_dimm_inject_rsp_t*)o_rsp_ptr;

    uint8_t l_rc = ERRL_RC_SUCCESS;

    if((NULL == l_cmd_ptr) || (NULL == l_rsp_ptr))
    {
        l_rc = ERRL_RC_INTERNAL_FAIL;
    }
    else
    {
        // Only read the command once the pointers are known to be valid.
        // inject data length (ignore sub_command byte)
        const uint16_t inject_length = CMDH_DATALEN_FIELD_UINT16(l_cmd_ptr) - 1;

        // Command Length Check
        if ((inject_length != 0) && (inject_length != MAX_NUM_CENTAURS))
        {
            TRAC_ERR("cmdh_dbug_dimm_inject: Invalid inject data length %u (expected %u)",
                     inject_length, MAX_NUM_CENTAURS);
            l_rc = ERRL_RC_INVALID_CMD_LEN;
        }
        else
        {
            if (inject_length == MAX_NUM_CENTAURS)
            {
                const uint64_t l_new_mask = l_cmd_ptr->inject_mask;

                // Trace parameters are formatted as 32-bit values, so split
                // each 64-bit mask into explicitly narrowed halves.
                TRAC_INFO("cmdh_dbug_dimm_inject: updating DIMM inject mask from 0x%08X.%08X to 0x%08X.%08X",
                          (uint32_t)(G_inject_dimm >> 32), (uint32_t)(G_inject_dimm & 0xFFFFFFFF),
                          (uint32_t)(l_new_mask >> 32), (uint32_t)(l_new_mask & 0xFFFFFFFF));
                G_inject_dimm = l_new_mask;

                // Warn about any selected DIMM whose sensor is not enabled
                // (the mask is still applied as requested).
                unsigned int l_cent;
                for(l_cent = 0; l_cent < MAX_NUM_CENTAURS; l_cent++)
                {
                    // one byte of the mask per centaur, one bit per DIMM
                    const uint8_t dimms = (G_inject_dimm >> (l_cent*8)) & 0xFF;
                    if (dimms == 0)
                    {
                        continue;
                    }

                    unsigned int k;
                    for(k=0; k < NUM_DIMMS_PER_CENTAUR; k++)
                    {
                        if ((dimms & (1 << k)) && !CENTAUR_SENSOR_ENABLED(l_cent, k))
                        {
                            TRAC_ERR("cmdh_dbug_dimm_inject: centaur%d DIMM%d is not enabled", l_cent, k);
                        }
                    }
                }
            }
            // else just return current values

            // Return the current DIMM inject mask
            // NOTE(review): copies MAX_NUM_CENTAURS bytes out of the 8-byte
            // G_inject_dimm - assumes MAX_NUM_CENTAURS <= sizeof(uint64_t); confirm.
            l_rsp_ptr->data_length[0] = 0x00;
            l_rsp_ptr->data_length[1] = MAX_NUM_CENTAURS;
            memcpy(&o_rsp_ptr->data[0], &G_inject_dimm, MAX_NUM_CENTAURS);
        }
    }

    G_rsp_status = l_rc;
    return;
}



// Function Specification
//
// Name: cmdh_dbug_internal_flags
//
// Description: Query or update the internal debug flags (G_internal_flags).
//              - No data after the sub-command byte: flags are left unchanged.
//              - sizeof(G_internal_flags) bytes of data: flags are replaced
//                with the supplied value.
//              - Any other length: ERRL_RC_INVALID_CMD_LEN, no response data.
//              On success the (possibly updated) flags are returned in the
//              response data.  G_rsp_status is always set to the result.
//
// End Function Specification
void cmdh_dbug_internal_flags( const cmdh_fsp_cmd_t * i_cmd_ptr,
                               cmdh_fsp_rsp_t * o_rsp_ptr )
{
    const cmdh_dbug_internal_flags_cmd_t * l_cmd_ptr =
        (cmdh_dbug_internal_flags_cmd_t*)i_cmd_ptr;
    cmdh_dbug_internal_flags_rsp_t * l_rsp_ptr =
        (cmdh_dbug_internal_flags_rsp_t*)o_rsp_ptr;

    uint8_t l_rc = ERRL_RC_SUCCESS;
    const unsigned int flag_size = sizeof(G_internal_flags);

    if ((NULL == l_cmd_ptr) || (NULL == l_rsp_ptr))
    {
        l_rc = ERRL_RC_INTERNAL_FAIL;
    }
    else
    {
        // Only read the command once the pointers are known to be valid.
        // data length (ignore sub_command byte)
        const uint16_t data_length = CMDH_DATALEN_FIELD_UINT16(l_cmd_ptr) - 1;

        if ((data_length != 0) && (data_length != flag_size))
        {
            TRAC_ERR("cmdh_dbug_internal_flags: Invalid internal flags length %u (expected %u)",
                     data_length, flag_size);
            l_rc = ERRL_RC_INVALID_CMD_LEN;
        }
        else
        {
            if (data_length == flag_size)
            {
                TRAC_INFO("DEBUG - updating internal flags from 0x%08X to 0x%08X",
                          G_internal_flags, l_cmd_ptr->flags);

                G_internal_flags = l_cmd_ptr->flags;
            }

            // always respond with the current flag value
            l_rsp_ptr->data_length[0] = 0x00;
            l_rsp_ptr->data_length[1] = flag_size;
            // Fill in response data
            memcpy(&o_rsp_ptr->data[0], &G_internal_flags, flag_size);
        }
    }

    G_rsp_status = l_rc;
    return;
}



// Function Specification
//
Expand Down Expand Up @@ -1708,6 +1845,14 @@ void cmdh_dbug_cmd (const cmdh_fsp_cmd_t * i_cmd_ptr,
cmdh_dbug_allow_trace( i_cmd_ptr, o_rsp_ptr );
break;

case DBUG_DIMM_INJECT:
cmdh_dbug_dimm_inject( i_cmd_ptr, o_rsp_ptr );
break;

case DBUG_INTERNAL_FLAGS:
cmdh_dbug_internal_flags( i_cmd_ptr, o_rsp_ptr );
break;

case DBUG_POKE:
case DBUG_SET_PEXE_EVENT:
case DBUG_DUMP_THEMAL:
Expand Down
39 changes: 37 additions & 2 deletions src/occ_405/cmdh/cmdh_fsp_cmds.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#include "cmdh_fsp_cmds_datacnfg.h"
#include "sensor.h"
#include "apss.h"
#include "occ_sys_config.h"

// Enum of the various commands that may be sent to OCC
typedef enum
Expand Down Expand Up @@ -394,7 +395,7 @@ typedef enum
// free = 0x12
// free = 0x13
DBUG_INJECT_ERRL = 0x14,
// free = 0x15
DBUG_DIMM_INJECT = 0x15,
// free = 0x16
DBUG_GPIO_READ = 0x17,
DBUG_FSP_ATTN = 0x18,
Expand All @@ -404,7 +405,7 @@ typedef enum
DBUG_INJECT_ERR = 0x1C,
DBUG_VERIFY_V_F = 0x1D,
DBUG_DUMP_PPM_DATA = 0x1E,
// free = 0x1F
DBUG_INTERNAL_FLAGS = 0x1F,
DBUG_FLUSH_DCACHE = 0x20,
DBUG_INVALIDATE_DCACHE = 0x21,
DBUG_CENTAUR_SENSOR_CACHE = 0x22,
Expand Down Expand Up @@ -598,6 +599,40 @@ typedef struct __attribute__ ((packed))
uint8_t checksum[CMDH_FSP_CHECKSUM_SIZE];
}cmdh_dbug_allow_trace_rsp_t;

// DBUG_DIMM_INJECT command struct
// Packed layout of the debug command handled by cmdh_dbug_dimm_inject().
// The handler accepts either no data after sub_cmd (query only) or
// MAX_NUM_CENTAURS bytes of inject mask.
typedef struct __attribute__ ((packed))
{
struct cmdh_fsp_cmd_header;
uint8_t sub_cmd;       // debug sub-command selector (presumably DBUG_DIMM_INJECT - confirm against cmdh_dbug_cmd)
uint64_t inject_mask;  // new value for G_inject_dimm: bit ((centaur * 8) + dimm) selects a DIMM
}cmdh_dbug_dimm_inject_cmd_t;

// DBUG_DIMM_INJECT response struct
// Packed layout of the response produced by cmdh_dbug_dimm_inject(), which
// returns the current G_inject_dimm value as the response data.
typedef struct __attribute__ ((packed))
{
struct cmdh_fsp_rsp_header;
uint64_t inject_mask;  // current DIMM inject mask (NOTE(review): handler fills this via the generic rsp data[] - presumably the same offset; confirm)
uint8_t checksum[CMDH_FSP_CHECKSUM_SIZE];
}cmdh_dbug_dimm_inject_rsp_t;

// DBUG_INTERNAL_FLAGS command struct
// Packed layout of the debug command handled by cmdh_dbug_internal_flags().
// The handler accepts either no data after sub_cmd (query only) or
// sizeof(G_internal_flags) bytes of flags.
typedef struct __attribute__ ((packed))
{
struct cmdh_fsp_cmd_header;
uint8_t sub_cmd;  // debug sub-command selector (presumably DBUG_INTERNAL_FLAGS - confirm against cmdh_dbug_cmd)
uint32_t flags;   // new value for G_internal_flags
}cmdh_dbug_internal_flags_cmd_t;

// DBUG_INTERNAL_FLAGS response struct
// Packed layout of the response produced by cmdh_dbug_internal_flags(), which
// returns the current G_internal_flags value as the response data.
typedef struct __attribute__ ((packed))
{
struct cmdh_fsp_rsp_header;
uint32_t flags;  // current internal flags (NOTE(review): handler fills this via the generic rsp data[] - presumably the same offset; confirm)
uint8_t checksum[CMDH_FSP_CHECKSUM_SIZE];
}cmdh_dbug_internal_flags_rsp_t;

extern uint32_t G_internal_flags;

//---------------------------------------------------------
// Tunable Parameter Command
//---------------------------------------------------------
Expand Down

0 comments on commit a8a2c3a

Please sign in to comment.