Skip to content

Commit

Permalink
PRD: NVDIMM workaround for ES TEMP glitches
Browse files Browse the repository at this point in the history
There is currently a glitch with the energy source (ES) temperature
readings. We are getting intermittent incorrect temperature readings,
and these glitched readings cause us to analyze to an ES temperature
warning or error. This is a workaround that keeps the ES temperature
logs hidden until we hit threshold. The fix for the root cause of the
glitches will need to come from SMART.

Change-Id: I8df3f2531fda2e4c0273bfe3714c7efb9662971d
CQ: SW476951
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84345
Reviewed-by: Paul Greenwood <paul.greenwood@ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/84363
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
  • Loading branch information
cnpalmer authored and zane131 committed Sep 27, 2019
1 parent 62c252e commit b93f63a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 16 deletions.
2 changes: 2 additions & 0 deletions src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
Expand Up @@ -112,6 +112,8 @@ PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Thresh
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");

#endif // __prdfMemExtraSig_H

80 changes: 64 additions & 16 deletions src/usr/diag/prdf/plat/mem/prdfP9Mca.C
Expand Up @@ -739,15 +739,18 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
* @param io_sc The step code data struct.
* @param i_dimm The target dimm.
* @param io_errFound Whether an error has already been found or not.
* @param o_esTempErr A flag for whether we hit an ES TEMP error or not.
* @return FAIL if unable to read register, else SUCCESS
*/
uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
TargetHandle_t i_dimm, bool & io_errFound )
TargetHandle_t i_dimm, bool & io_errFound,
bool & o_esTempErr )
{
#define PRDF_FUNC "[__analyzeErrorThrStatusReg] "

uint32_t o_rc = SUCCESS;
uint8_t data = 0;
o_esTempErr = false;

// Get MCA, for signatures
TargetHandle_t mca = getConnectedParent( i_dimm, TYPE_MCA );
Expand Down Expand Up @@ -787,21 +790,29 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
// BIT 2: ES Temperature Error
if ( bitList.count(2) )
{
// Sleep two seconds to avoid exiting PRD analysis faster than the
// ES_TEMP sample rate.
PlatServices::milliSleep( 2, 0 );

// Read the ES_TEMP and ES_TEMP_ERROR_HIGH_THRESHOLD values
uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;

uint16_t esTemp = 0;
o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
if ( SUCCESS != o_rc ) break;

uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD1;
uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_HIGH_THRESHOLD0;

uint16_t esTempHighTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
if ( SUCCESS != o_rc ) break;

msbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD1;
lsbThReg = NVDIMM::i2cReg::ES_TEMP_ERROR_LOW_THRESHOLD0;
uint16_t esTempLowTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
if ( SUCCESS != o_rc ) break;

// Check to see if the ES_TEMP is negative (bit 12)
bool esTempNeg = false;
if ( esTemp & 0x1000 ) esTempNeg = true;
Expand All @@ -814,19 +825,29 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpErrHigh );
}
// Else assume the warning is because of a low threshold.
else
// Else check if the error hit the low threshold, again with the
// same 2°C margin.
else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpErrLow );
}
// Else the temperature must have gone back to a normal value, so
// we will label this as a false alarm case.
else
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpErrFa );
}

// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
if ( SUCCESS != o_rc ) break;

// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );

o_esTempErr = true;
io_errFound = true;
}
// BIT 3:7: Reserved
Expand Down Expand Up @@ -1072,21 +1093,29 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// BIT 2: ES_TEMP_WARNING
if ( bitList.count(2) )
{
// Sleep two seconds to avoid exiting PRD analysis faster than the
// ES_TEMP sample rate.
PlatServices::milliSleep( 2, 0 );

// Read the ES_TEMP and ES_TEMP_WARNING_HIGH_THRESHOLD values
uint16_t msbEsTempReg = NVDIMM::i2cReg::ES_TEMP1;
uint16_t lsbEsTempReg = NVDIMM::i2cReg::ES_TEMP0;

uint16_t esTemp = 0;
o_rc = __readTemp( i_dimm, msbEsTempReg, lsbEsTempReg, esTemp );
if ( SUCCESS != o_rc ) break;

uint16_t msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD1;
uint16_t lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_HIGH_THRESHOLD0;

uint16_t esTempHighTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempHighTh );
if ( SUCCESS != o_rc ) break;

msbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD1;
lsbThReg = NVDIMM::i2cReg::ES_TEMP_WARNING_LOW_THRESHOLD0;
uint16_t esTempLowTh = 0;
o_rc = __readTemp( i_dimm, msbThReg, lsbThReg, esTempLowTh );
if ( SUCCESS != o_rc ) break;

// Check to see if the ES_TEMP is negative (bit 12)
bool esTempNeg = false;
if ( esTemp & 0x1000 ) esTempNeg = true;
Expand All @@ -1099,12 +1128,20 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpWarnHigh );
}
// Else assume the warning is because of a low threshold.
else
// Else check if the warning hit the low threshold, again with the
// same 2°C margin.
else if ( (esTemp <= (esTempLowTh + 0x0020)) || esTempNeg )
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpWarnLow );
}
// Else the temperature must have gone back to a normal value, so
// we will label this as a false alarm case.
else
{
__addSignature( io_sc, mca, io_errFound,
PRDFSIG_EsTmpWarnFa );
}

// Callout BPM (backup power module) high
o_rc = __addBpmCallout( i_dimm, HWAS::SRCI_PRIORITY_HIGH );
Expand All @@ -1113,13 +1150,19 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
// Callout NVDIMM low, no gard
io_sc.service_data->SetCallout( i_dimm, MRU_LOW, NO_GARD );

// Make the log predictive and mask the FIR.
io_sc.service_data->SetThresholdMaskId(0);
// Because of the possibility of intermittent ES temperature
// false alarm readings, we will keep the log hidden. If there is
// an actual ES temperature problem, we assume we will continue
// to be called to handle the temperature warning and hit threshold.

// Send message to PHYP that save/restore may work
o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != o_rc ) break;
// Only send the save/restore message to PHYP if we hit threshold.
if ( io_sc.service_data->IsAtThreshold() )
{
// Send message to PHYP that save/restore may work
o_rc = PlatServices::nvdimmNotifyProtChange( i_dimm,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
if ( SUCCESS != o_rc ) break;
}

io_errFound = true;
}
Expand Down Expand Up @@ -1348,9 +1391,14 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
if ( SUCCESS != l_rc ) continue;
l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
l_rc = __analyzeErrorThrStatusReg( io_sc, dimm, errFound );
bool esTempErr = false;
l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr);
if ( SUCCESS != l_rc ) continue;

// If we hit an ES temperature error and have not yet hit threshold,
// then keep the log hidden.
if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue;

// If we didn't find any error, then keep the log hidden.
if ( !errFound )
{
Expand Down

0 comments on commit b93f63a

Please sign in to comment.