Skip to content

Commit

Permalink
PRD: NVDIMM callouts for register access errors
Browse files Browse the repository at this point in the history
Change-Id: I2cef2d19db633d9cfb98f72cfd865ecdf960a1fe
CQ: SW487048
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/92994
Reviewed-by: Brian J Stegmiller <bjs@us.ibm.com>
Reviewed-by: Benjamen G Tyner <ben.tyner@ibm.com>
Tested-by: Jenkins Server <pfd-jenkins+hostboot@us.ibm.com>
Reviewed-by: Zane C Shelley <zshelle@us.ibm.com>
Reviewed-on: http://rchgit01.rchland.ibm.com/gerrit1/93481
Tested-by: Jenkins OP Build CI <op-jenkins+hostboot@us.ibm.com>
Tested-by: Jenkins OP HW <op-hw-jenkins+hostboot@us.ibm.com>
Tested-by: FSP CI Jenkins <fsp-CI-jenkins+hostboot@us.ibm.com>
  • Loading branch information
cnpalmer authored and zane131 committed Mar 18, 2020
1 parent 820a099 commit 51440d9
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 38 deletions.
27 changes: 14 additions & 13 deletions src/usr/diag/prdf/common/plat/mem/prdfMemExtraSig.H
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* Contributors Listed Below - COPYRIGHT 2016,2020 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -102,18 +102,19 @@ PRDR_ERROR_SIGNATURE(EsLifeErr, 0xffff0090, "", "NVDIMM Final Energy Source
PRDR_ERROR_SIGNATURE(EsTmpErrHigh, 0xffff0091, "", "NVDIMM Energy Source Temperature Error - High Temp Threshold");
PRDR_ERROR_SIGNATURE(EsTmpErrLow, 0xffff0092, "", "NVDIMM Energy Source Temperature Error - Low Temp Threshold");

PRDR_ERROR_SIGNATURE(NvmLifeWarn1, 0xffff0093, "", "NVDIMM First NVM Lifetime Warning");
PRDR_ERROR_SIGNATURE(NvmLifeWarn2, 0xffff0094, "", "NVDIMM Second NVM Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsLifeWarn1, 0xffff0095, "", "NVDIMM First Energy Source Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsLifeWarn2, 0xffff0096, "", "NVDIMM Second Energy Source Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temperature Warning - High Temp Threshold");
PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold");
PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold");
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");
PRDR_ERROR_SIGNATURE(NvmLifeWarn1, 0xffff0093, "", "NVDIMM First NVM Lifetime Warning");
PRDR_ERROR_SIGNATURE(NvmLifeWarn2, 0xffff0094, "", "NVDIMM Second NVM Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsLifeWarn1, 0xffff0095, "", "NVDIMM First Energy Source Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsLifeWarn2, 0xffff0096, "", "NVDIMM Second Energy Source Lifetime Warning");
PRDR_ERROR_SIGNATURE(EsTmpWarnHigh, 0xffff0097, "", "NVDIMM Energy Source Temperature Warning - High Temp Threshold");
PRDR_ERROR_SIGNATURE(EsTmpWarnLow, 0xffff0098, "", "NVDIMM Energy Source Temperature Warning - Low Temp Threshold");
PRDR_ERROR_SIGNATURE(BelowWarnTh, 0xffff0099, "", "NVDIMM Below Warning Threshold");
PRDR_ERROR_SIGNATURE(IntNvdimmErr, 0xffff009A, "", "NVDIMM Intermittent error");
PRDR_ERROR_SIGNATURE(NotifStatErr, 0xffff009B, "", "NVDIMM Set Event Notification Status Error");
PRDR_ERROR_SIGNATURE(FirEvntGone, 0xffff009C, "", "NVDIMM Event Triggering the FIR no longer present");
PRDR_ERROR_SIGNATURE(EsTmpWarnFa, 0xffff009D, "", "NVDIMM Energy Source Temperature Warning - False Alarm");
PRDR_ERROR_SIGNATURE(EsTmpErrFa, 0xffff009E, "", "NVDIMM Energy Source Temperature Error - False Alarm");
PRDR_ERROR_SIGNATURE(NvdimmReadFail, 0xffff009F, "", "NVDIMM Failure to read NVDIMM register");

#endif // __prdfMemExtraSig_H

95 changes: 70 additions & 25 deletions src/usr/diag/prdf/plat/mem/prdfP9Mca.C
Expand Up @@ -5,7 +5,7 @@
/* */
/* OpenPOWER HostBoot Project */
/* */
/* Contributors Listed Below - COPYRIGHT 2016,2019 */
/* Contributors Listed Below - COPYRIGHT 2016,2020 */
/* [+] International Business Machines Corp. */
/* */
/* */
Expand Down Expand Up @@ -442,7 +442,7 @@ uint32_t __analyzeHealthStatus0Reg(STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Health Status0 Register. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
Expand Down Expand Up @@ -551,7 +551,7 @@ uint32_t __analyzeHealthStatus1Reg( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Health Status1 Register. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
Expand Down Expand Up @@ -710,7 +710,7 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature MSB Register. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -721,7 +721,7 @@ uint32_t __readTemp( TargetHandle_t i_dimm, uint16_t i_tempMsbReg,
PRDF_ERR( PRDF_FUNC "Failed to read ES Temperature LSB Register. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand Down Expand Up @@ -768,7 +768,7 @@ uint32_t __analyzeErrorThrStatusReg( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Status Reg. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
Expand Down Expand Up @@ -899,7 +899,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Reg. HUID: "
"0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -912,7 +912,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Error Threshold Reg. HUID: "
"0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand Down Expand Up @@ -941,7 +941,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to clear Set Event Notification "
"Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -955,7 +955,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
"Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( notifStat );
Expand Down Expand Up @@ -991,7 +991,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to write Warning Threshold Reg. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -1004,7 +1004,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to write Set Event Notification "
"Cmd Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -1017,7 +1017,7 @@ uint32_t __adjustThreshold( STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Set Event Notification "
"Status Reg. HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
bitList = __nvdimmGetActiveBits( notifStat );
Expand Down Expand Up @@ -1084,7 +1084,7 @@ uint32_t __analyzeWarningThrStatusReg(STEP_CODE_DATA_STRUCT & io_sc,
PRDF_ERR( PRDF_FUNC "Failed to read Warning Threshold Status Reg. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
Expand Down Expand Up @@ -1298,7 +1298,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
PRDF_ERR( PRDF_FUNC "Failed to read NVDIMM_MGT_CMD1. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand All @@ -1313,7 +1313,7 @@ uint32_t __deassertEventN( TargetHandle_t i_dimm )
PRDF_ERR( PRDF_FUNC "Failed to write NVDIMM_MGT_CMD1. "
"HUID: 0x%08x", getHuid(i_dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );
o_rc = FAIL;
o_rc = PRD_SCANCOM_FAILURE;
break;
}

Expand Down Expand Up @@ -1378,6 +1378,20 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
PRDF_ERR( PRDF_FUNC "Failed to read Module Health Register. "
"HUID: 0x%08x", getHuid(dimm) );
PRDF_COMMIT_ERRL( errl, ERRL_ACTION_REPORT );

// If we got a failure reading one of the NVDIMM registers,
// add a signature and make the log predictive.
__addSignature( io_sc, i_chip->getTrgt(), errFound,
PRDFSIG_NvdimmReadFail );
errFound = true;
io_sc.service_data->SetThresholdMaskId(0);

// Callout NVDIMM
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );

// Send message to PHYP that save/restore may work
l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
continue;
}
std::map<uint8_t,bool> bitList = __nvdimmGetActiveBits( data );
Expand All @@ -1387,17 +1401,33 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
{
// Analyze Health Status0 Reg, Health Status1 Reg,
// and Error Threshold Status Reg
l_rc = __analyzeHealthStatus0Reg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
l_rc = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
bool esTempErr = false;
l_rc = __analyzeErrorThrStatusReg(io_sc, dimm, errFound, esTempErr);
if ( SUCCESS != l_rc ) continue;
bool esTmpErr = false;
uint32_t l_rcStat0 = SUCCESS;
uint32_t l_rcStat1 = SUCCESS;
uint32_t l_rcErrTh = SUCCESS;
l_rcStat0 = __analyzeHealthStatus0Reg( io_sc, dimm, errFound );
l_rcStat1 = __analyzeHealthStatus1Reg( io_sc, dimm, errFound );
l_rcErrTh = __analyzeErrorThrStatusReg( io_sc, dimm, errFound,
esTmpErr );

if ( PRD_SCANCOM_FAILURE == l_rcStat0 ||
PRD_SCANCOM_FAILURE == l_rcStat1 ||
PRD_SCANCOM_FAILURE == l_rcErrTh )
{
// If we got a failure reading one of the NVDIMM registers,
// add a signature and make the log predictive.
__addSignature( io_sc, i_chip->getTrgt(), errFound,
PRDFSIG_NvdimmReadFail );
errFound = true;
io_sc.service_data->SetThresholdMaskId(0);

// Callout NVDIMM
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );
}

// If we hit an ES temperature error and have not yet hit threshold,
// then keep the log hidden.
if ( esTempErr && !io_sc.service_data->IsAtThreshold() ) continue;
if ( esTmpErr && !io_sc.service_data->IsAtThreshold() ) continue;

// If we didn't find any error, then keep the log hidden.
if ( !errFound )
Expand Down Expand Up @@ -1426,7 +1456,22 @@ int32_t AnalyzeNvdimmHealthStatRegs( ExtensibleChip * i_chip,
else if ( bitList.count(1) )
{
l_rc = __analyzeWarningThrStatusReg( io_sc, dimm, errFound );
if ( SUCCESS != l_rc ) continue;
if ( PRD_SCANCOM_FAILURE == l_rc )
{
// If we got a failure reading one of the NVDIMM registers,
// add a signature and make the log predictive.
__addSignature( io_sc, i_chip->getTrgt(), errFound,
PRDFSIG_NvdimmReadFail );
errFound = true;
io_sc.service_data->SetThresholdMaskId(0);

// Callout NVDIMM
io_sc.service_data->SetCallout( dimm, MRU_MED, NO_GARD );

// Send message to PHYP that save/restore may work
l_rc = PlatServices::nvdimmNotifyProtChange( dimm,
NVDIMM::NVDIMM_RISKY_HW_ERROR );
}

if ( !errFound )
{
Expand Down

0 comments on commit 51440d9

Please sign in to comment.